author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 20:38:23 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 20:38:23 +0000
commit    | ff6e3c025658a5fa1affd094f220b623e7e1b24b
tree      | 9faab72d69c92d24e349d184f5869b9796f17e0c
parent    | Initial commit.
Adding upstream version 6.338.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
239 files changed, 80664 insertions, 0 deletions
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..557dda7 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,3 @@ +github: haasn +patreon: haasn +open_collective: haasn diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..51b8dea --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,18 @@ +name: ci +on: + push: + branches: + - master + - pages-test +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: 3.x + - run: pip install mkdocs-material + - run: mkdocs gh-deploy --force diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7c7a0e5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +/build* +/tags +/TAGS +/demos/3rdparty +/3rdparty +*.exe +*.o +.cache +__pycache__ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..5d287ad --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,180 @@ +stages: + - compile + - test + - sanitize + +variables: + GIT_SUBMODULE_STRATEGY: recursive + IMAGE_UBUNTU_JAMMY: registry.videolan.org/libplacebo-ubuntu-jammy:20230730213642 + IMAGE_UBUNTU_JAMMY_AARCH: registry.videolan.org/libplacebo-ubuntu-jammy-aarch64:20230203024122 + +linux: + image: $IMAGE_UBUNTU_JAMMY + stage: compile + tags: + - docker + - amd64 + script: + - meson build --buildtype release + --werror + -Dtests=true + -Dshaderc=enabled + -Dglslang=enabled + - ninja -C build + +static: + image: $IMAGE_UBUNTU_JAMMY + stage: compile + tags: + - docker + - amd64 + script: + - meson build --buildtype release + --default-library static + --werror + -Dshaderc=enabled + -Dglslang=enabled + - ninja -C build + +win32: + image: $IMAGE_UBUNTU_JAMMY + stage: compile + tags: + - docker + - amd64 + script: + - meson build --buildtype release + --werror + -Dtests=true + -Ddebug-abort=true + -Dd3d11=enabled + --cross-file /opt/crossfiles/i686-w64-mingw32.meson + - ninja -C build + - cd build && meson test -t 5 -v --num-processes=1 + +win64: + image: $IMAGE_UBUNTU_JAMMY + stage: compile + tags: + - docker + - amd64 + script: + - meson build --buildtype release + --werror + -Dtests=true + -Ddebug-abort=true + -Dd3d11=enabled + --cross-file /opt/crossfiles/x86_64-w64-mingw32.meson + - ninja -C build + - cd build && meson test -t 5 -v --num-processes=1 + +aarch64: + image: $IMAGE_UBUNTU_JAMMY_AARCH + stage: compile + tags: + - docker + - aarch64 + script: + - meson build --buildtype release --werror -Dtests=true + - ninja -C build + - cd build && meson test -t 5 -v --num-processes=1 + +macos: + stage: compile + tags: + - amd64 + - monterey + script: + - meson build --buildtype release + -Ddefault_library=both + -Dtests=true + -Ddebug-abort=true + -Dc_args='-mmacosx-version-min=10.11 -Wunguarded-availability' + --werror + - ninja -C build + - cd build && meson test -t 5 -v --num-processes=1 + +scan: + image: $IMAGE_UBUNTU_JAMMY + stage: compile + tags: + - docker + - amd64 + script: + - env CC=clang CXX=clang++ CC_LD=lld CXX_LD=lld + meson build --buildtype debugoptimized + --werror + -Dtests=true + -Dbench=true + -Dshaderc=enabled + -Dglslang=enabled + - ninja -C build scan-build + +llvmpipe: + image: $IMAGE_UBUNTU_JAMMY + stage: test + tags: + - docker + - amd64 + script: + - meson build --buildtype release + --werror + -Dtests=true + -Ddebug-abort=true + -Dc_args='-DCI_ALLOW_SW -DCI_MAXGL' + -Dshaderc=enabled + -Dglslang=enabled + - ninja -C build + - cd build && meson test 
-t 20 -v --num-processes=1 + +gpu: + image: $IMAGE_UBUNTU_JAMMY + stage: test + tags: + - gpu + script: + - meson build --buildtype release + --werror + -Dtests=true + -Ddemos=false + -Ddebug-abort=true + -Dshaderc=enabled + -Dglslang=enabled + -Db_coverage=true + - ninja -C build + - vulkaninfo + - cd build && meson test -t 5 -v --num-processes=1 + - ninja coverage-html + - mv meson-logs/coveragereport ../coverage + - ninja coverage-xml + - grep -Eo 'line-rate="[^"]+"' meson-logs/coverage.xml | head -n 1 | + grep -Eo '[0-9.]+' | awk '{ print "coverage:", $1 * 100 } ' + coverage: '/^coverage: (\d+.\d+)$/' + artifacts: + expose_as: 'Coverage HTML report' + paths: + - coverage/ + reports: + coverage_report: + coverage_format: cobertura + path: build/meson-logs/coverage.xml + +sanitize: + image: $IMAGE_UBUNTU_JAMMY + stage: sanitize + tags: + - gpu + variables: + UBSAN_OPTIONS: 'print_stacktrace=1:halt_on_error=1' + script: + - env CC=clang CXX=clang++ CC_LD=lld CXX_LD=lld + meson build --buildtype debugoptimized + --werror + -Dtests=true + -Ddebug-abort=true + -Dc_args='-DCI_MAXGL -Wno-deprecated-declarations' + -Db_sanitize=address,undefined + -Db_lundef=false + -Dshaderc=enabled + - ninja -C build + - cd build && time meson test -t 5 -v --num-processes=1 diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..3245c21 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,18 @@ +[submodule "demos/3rdparty/nuklear"] + path = demos/3rdparty/nuklear + url = https://github.com/Immediate-Mode-UI/Nuklear.git +[submodule "3rdparty/glad"] + path = 3rdparty/glad + url = https://github.com/Dav1dde/glad +[submodule "3rdparty/jinja"] + path = 3rdparty/jinja + url = https://github.com/pallets/jinja +[submodule "3rdparty/markupsafe"] + path = 3rdparty/markupsafe + url = https://github.com/pallets/markupsafe +[submodule "3rdparty/Vulkan-Headers"] + path = 3rdparty/Vulkan-Headers + url = https://github.com/KhronosGroup/Vulkan-Headers +[submodule "3rdparty/fast_float"] + path = 3rdparty/fast_float + url = https://github.com/fastfloat/fast_float.git @@ -0,0 +1,458 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. 
Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. 
To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. 
+ + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. 
Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. 
Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. 
In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. 
+ + END OF TERMS AND CONDITIONS diff --git a/README.md b/README.md new file mode 100644 index 0000000..813e6df --- /dev/null +++ b/README.md @@ -0,0 +1,347 @@ +# libplacebo + +[![gitlab-ci badge](https://code.videolan.org/videolan/libplacebo/badges/master/pipeline.svg)](https://code.videolan.org/videolan/libplacebo/pipelines) +[![gitlab-ci coverage](https://code.videolan.org/videolan/libplacebo/badges/master/coverage.svg)](https://code.videolan.org/videolan/libplacebo/-/jobs/artifacts/master/file/coverage/index.html?job=test-gpu) +[![GitHub](https://img.shields.io/github/sponsors/haasn?logo=github)](https://github.com/sponsors/haasn) +[![PayPal](https://img.shields.io/badge/donate-PayPal-blue.svg?logo=paypal)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=SFJUTMPSZEAHC) +[![Patreon](https://img.shields.io/badge/pledge-Patreon-red.svg?logo=patreon)](https://www.patreon.com/haasn) + +**libplacebo** is, in a nutshell, the core rendering algorithms and ideas of +[mpv](https://mpv.io) rewritten as an independent library. As of today, +libplacebo contains a large assortment of video processing shaders, focusing +on both quality and performance. These include features such as the following: + +- High-quality, optimized **upscaling and downscaling** including support for + polar filters ("Jinc"), anti-aliasing, anti-ringing and gamma correct + scaling. +- Dynamic **HDR tone mapping**, including real-time measurement of scene + histogram, scene change detection, dynamic exposure control, perceptual gamut + stretching, contrast recovery and more. +- Native support for **Dolby Vision HDR**, including Profile 5 conversion to + HDR/PQ or SDR, reading DV side data, and reshaping. (BL only, currently) +- A colorimetrically accurate **color management** engine with support for + soft gamut mapping, ICC profiles, accurate ITU-R BT.1886 emulation, black + point compensation, and custom 3DLUTs (.cube). +- A pluggable, extensible [**custom shader + system**](http://libplacebo.org/custom-shaders/). This can be used to + arbitrarily extend the range of custom shaders to include popular user + shaders like RAVU, FSRCNNX, or Anime4K. See the [mpv wiki on user + scripts](https://github.com/mpv-player/mpv/wiki/User-Scripts#user-shaders) + for more information. +- High performance **film grain synthesis** for AV1 and H.274, allowing media + players to offload this part of decoding from the CPU to the GPU. +- Tunable, fast **debanding** and **deinterlacing** shaders. +- High quality gamma-correct **dithering**, including error diffusion modes. + +Every attempt was made to provide these features at a **high level of +abstraction**, taking away all the messy details of GPU programming, color +spaces, obscure subsampling modes, image metadata manipulation, and so on. +Expert-level functionality is packed into easy-to-use functions like +`pl_frame_from_avframe` and `pl_render_image`. + +### Hardware requirements + +libplacebo currently supports Vulkan (including MoltenVK), OpenGL, and +Direct3D 11. It currently has the following minimum hardware requirements: + +- **Vulkan**: Core version 1.2 +- **OpenGL**: GLSL version >= 130 (GL >= 3.0, GL ES >= 3.0) +- **Direct3D**: Feature level >= 9_1 + +For more documentation, including an introduction to the API, see [the project +website](https://libplacebo.org). 
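As a rough illustration of that level of abstraction, the following hypothetical sketch (not part of this commit) shows how a single frame might be rendered with the high-level API. It assumes a `pl_gpu`, `pl_swapchain` and `pl_renderer` have already been created, as in the included demo programs, and that `image` was filled in by a helper such as `pl_frame_from_avframe`:

```c
#include <libplacebo/renderer.h>
#include <libplacebo/swapchain.h>

// Hypothetical sketch: `rr` is a pl_renderer created once with
// pl_renderer_create(log, gpu) and reused across frames; `sw` is the
// swapchain of the output window; `image` describes the decoded frame.
static bool render_one_frame(pl_renderer rr, pl_swapchain sw,
                             const struct pl_frame *image)
{
    struct pl_swapchain_frame swframe;
    if (!pl_swapchain_start_frame(sw, &swframe))
        return false; // e.g. window minimized; poll events and retry

    // Wrap the swapchain image into a pl_frame usable as a render target
    struct pl_frame target;
    pl_frame_from_swapchain(&target, &swframe);

    // Scaling, tone mapping, dithering etc. all happen inside this call
    bool ok = pl_render_image(rr, image, &target, &pl_render_default_params);

    ok &= pl_swapchain_submit_frame(sw);
    pl_swapchain_swap_buffers(sw);
    return ok;
}
```

Error handling and frame timing are omitted; the `demos/colors.c` file added in this commit shows the same swapchain loop in full.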
+ +### Examples + +This screenshot from the included [plplay demo program](./demos/plplay.c) +highlights just some of the features supported by the libplacebo rendering +code, all of which are adjustable dynamically during video playback. + +[<img src="./demos/screenshots/plplay1.png" width="200" alt="plplay settings 1" />](./demos/screenshots/plplay1.png) +[<img src="./demos/screenshots/plplay2.png" width="200" alt="plplay settings 2" />](./demos/screenshots/plplay2.png) +[<img src="./demos/screenshots/plplay3.png" width="200" alt="plplay settings 3" />](./demos/screenshots/plplay3.png) + +[<img src="./demos/screenshots/plplay4.png" width="200" alt="plplay settings 4" />](./demos/screenshots/plplay4.png) +[<img src="./demos/screenshots/plplay5.png" width="200" alt="plplay settings 5" />](./demos/screenshots/plplay5.png) +[<img src="./demos/screenshots/plplay6.png" width="200" alt="plplay settings 6" />](./demos/screenshots/plplay6.png) + +### History + +This project grew out of an interest to accomplish the following goals: + +- Clean up mpv's internal [RA](#tier-1-rendering-abstraction) API and make it + reusable for other projects, as a general high-level backend-agnostic + graphics API wrapper. +- Provide a standard library of useful GPU-accelerated image processing + primitives based on GLSL, so projects like media players or browsers can use + them without incurring a heavy dependency on `libmpv`. +- Rewrite core parts of mpv's GPU-accelerated video renderer on top of + redesigned abstractions, in order to modernize it and allow supporting more + features. + +It has since been adopted by [VLC](https://www.videolan.org/vlc/) as their +optional Vulkan-based video output path, and is provided as a Vulkan-based +video filter in the FFmpeg project. + +## API Overview + +The public API of libplacebo is currently split up into the following +components, the header files (and documentation) for which are available +inside the [`src/include/libplacebo`](src/include/libplacebo) directory. The +API is available in different "tiers", representing levels of abstraction +inside libplacebo. The APIs in higher tiers depend on those in lower tiers. +Which tier is used by a user depends on how much power/control they want over +the actual rendering. The low-level tiers are more suitable for big projects +that need strong control over the entire rendering pipeline; whereas the +high-level tiers are more suitable for smaller or simpler projects that want +libplacebo to take care of everything. + +### Tier 0 (logging, raw math primitives) + +- `cache.h`: Caching subsystem. Used to cache large or computationally heavy + binary blobs, such as compiled shaders, 3DLUTs, and so on. +- `colorspace.h`: A collection of enums and structs for describing color + spaces, as well as a collection of helper functions for computing various + color space transformation matrices. +- `common.h`: A collection of miscellaneous utility types and macros that are + shared among multiple subsystems. Usually does not need to be included + directly. +- `log.h`: Logging subsystem. +- `config.h`: Macros defining information about the way libplacebo was built, + including the version strings and compiled-in features/dependencies. Usually + does not need to be included directly. May be useful for feature tests. +- `dither.h`: Some helper functions for generating various noise and dithering + matrices. Might be useful for somebody else. 
+- `filters.h`: A collection of reusable reconstruction filter kernels, which + can be used for scaling. The generated weights arrays are semi-tailored to + the needs of libplacebo, but may be useful to somebody else regardless. Also + contains the structs needed to define a filter kernel for the purposes of + libplacebo's upscaling routines. +- `tone_mapping.h`: A collection of tone mapping functions, used for + conversions between HDR and SDR content. +- `gamut_mapping.h`: A collection of gamut mapping functions, used for + conversions between wide gamut and standard gamut content, as well as + for gamut recompression after tone-mapping. + +The API functions in this tier are either used throughout the program +(context, common etc.) or are low-level implementations of filter kernels, +color space conversion logic etc.; which are entirely independent of GLSL +and even the GPU in general. + +### Tier 1 (rendering abstraction) + +- `gpu.h`: Exports the GPU abstraction API used by libplacebo internally. +- `swapchain.h`: Exports an API for wrapping platform-specific swapchains and + other display APIs. This is the API used to actually queue up rendered + frames for presentation (e.g. to a window or display device). +- `vulkan.h`: GPU API implementation based on Vulkan. +- `opengl.h`: GPU API implementation based on OpenGL. +- `d3d11.h`: GPU API implementation based on Direct3D 11. +- `dummy.h`: Dummy GPI API (interfaces with CPU only, no shader support) + +As part of the public API, libplacebo exports a middle-level abstraction for +dealing with GPU objects and state. Basically, this is the API libplacebo uses +internally to wrap OpenGL, Vulkan, Direct3D etc. into a single unifying API +subset that abstracts away state, messy details, synchronization etc. into a +fairly high-level API suitable for libplacebo's image processing tasks. + +It's made public both because it constitutes part of the public API of various +image processing functions, but also in the hopes that it will be useful for +other developers of GPU-accelerated image processing software. + +### Tier 2 (GLSL generating primitives) + +- `shaders.h`: The low-level interface to shader generation. This can be used + to generate GLSL stubs suitable for inclusion in other programs, as part of + larger shaders. For example, a program might use this interface to generate + a specialized tone-mapping function for performing color space conversions, + then call that from their own fragment shader code. This abstraction has an + optional dependency on `gpu.h`, but can also be used independently from it. + +In addition to this low-level interface, there are several available shader +routines which libplacebo exports: + +- `shaders/colorspace.h`: Shader routines for decoding and transforming + colors, tone mapping, and so forth. +- `shaders/custom.h`: Allows directly ingesting custom GLSL logic into the + `pl_shader` abstraction, either as bare GLSL or in [mpv .hook + format](https://mpv.io/manual/master/#options-glsl-shaders). +- `shaders/deinterlacing.h`: GPU deinterlacing shader based on yadif. +- `shaders/dithering.h`: Shader routine for various GPU dithering methods. +- `shaders/film_grain.h`: Film grain synthesis shaders for AV1 and H.274. +- `shaders/icc.h`: Shader for ICC profile based color management. +- `shaders/lut.h`: Code for applying arbitrary 1D/3D LUTs. +- `shaders/sampling.h`: Shader routines for various algorithms that sample + from images, such as debanding and scaling. 
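As a loose, hypothetical sketch (not code from this commit) of how these routines are meant to be used: a Tier 2 shader routine records its logic into a `pl_shader` object, which is then turned into a complete GLSL program and executed by the dispatch mechanism described under Tier 3 below. The `source` and `target` textures here are placeholders:

```c
#include <libplacebo/dispatch.h>
#include <libplacebo/shaders/sampling.h>

// Hypothetical sketch: `log`, `gpu`, `source` and `target` are assumed to
// already exist (a pl_log, a pl_gpu and two pl_tex objects, respectively).
static bool deband_pass(pl_log log, pl_gpu gpu, pl_tex source, pl_tex target)
{
    pl_dispatch dp = pl_dispatch_create(log, gpu);

    // Tier 2: record the debanding logic into a shader object
    pl_shader sh = pl_dispatch_begin(dp);
    struct pl_sample_src src = { .tex = source };
    pl_shader_deband(sh, &src, &pl_deband_default_params);

    // Tier 3: generate the final GLSL and run it against the target texture
    bool ok = pl_dispatch_finish(dp, &(struct pl_dispatch_params) {
        .shader = &sh,
        .target = target,
    });

    pl_dispatch_destroy(&dp);
    return ok;
}
```

In a real program the `pl_dispatch` object would be created once and reused, since it takes care of shader caching and resource pooling.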
+ +### Tier 3 (shader dispatch) + +- `dispatch.h`: A higher-level interface to the `pl_shader` system, based on + `gpu.h`. This dispatch mechanism generates+executes complete GLSL shaders, + subject to the constraints and limitations of the underlying GPU. + +This shader dispatch mechanism is designed to be combined with the shader +processing routines exported by `shaders/*.h`, but takes care of the low-level +translation of the resulting `pl_shader_res` objects into legal GLSL. It also +takes care of resource binding, shader input placement, as well as shader +caching and resource pooling; and makes sure all generated shaders have unique +identifiers (so they can be freely merged together). + +### Tier 4 (high level renderer) + +- `options.h`: A high-level options framework which wraps all of the options + comprising `pl_render_params` into a memory-managed, serializable struct that + can also be treated as a key/value dictionary. Also includes an options + parser to load options provided by the API user in string format. +- `renderer.h`: A high-level renderer which combines the shader primitives + and dispatch mechanism into a fully-fledged rendering pipeline that takes + raw texture data and transforms it into the desired output image. +- `utils/frame_queue.h`: A high-level frame queuing abstraction. This API + can be used to interface with a decoder (or other source of frames), and + takes care of translating timestamped frames into a virtual stream of + presentation events suitable for use with `renderer.h`, including any extra + context required for frame interpolation (`pl_frame_mix`). +- `utils/upload.h`: A high-level helper for uploading generic data in some + user-described format to a plane texture suitable for use with `renderer.h`. + These helpers essentially take care of picking/mapping a good image format + supported by the GPU. (Note: Eventually, this function will also support + on-CPU conversions to a different format where necessary, but for now, it + will just fail) +- `utils/dav1d.h`: High level helper for translating between Dav1dPicture + and libplacebo's `pl_frame`. (Single header library) +- `utils/libav.h`: High-level helpers for interoperation between + libplacebo and FFmpeg's libav* abstractions. (Single header library) + +This is the "primary" interface to libplacebo, and the one most users will be +interested in. It takes care of internal details such as degrading to simpler +algorithms depending on the hardware's capabilities, combining the correct +sequence of colorspace transformations and shader passes in order to get the +best overall image quality, and so forth. + +## Authors + +libplacebo was founded and primarily developed by Niklas Haas +([@haasn](https://github.com/haasn)), but it would not be possible without the +contributions of others, especially support for windows. + +[![contributor list](https://opencollective.com/libplacebo/contributors.svg?width=890&button=false)](https://github.com/haasn/libplacebo/graphs/contributors) + +### License + +libplacebo is currently available under the terms of the LGPLv2.1 (or later) +license. However, it's possible to release it under a more permissive license +(e.g. BSD2) if a use case emerges. + +Please open an issue if you have a use case for a BSD2-licensed libplacebo. 
+ + ## Installing + + ### Obtaining + + When cloning libplacebo, make sure to provide the `--recursive` flag: + +```bash +$ git clone --recursive https://code.videolan.org/videolan/libplacebo +``` + +Alternatively (on an existing clone): + +```bash +$ git submodule update --init +``` + +Doing either of these pulls in a handful of bundled 3rdparty dependencies. +Alternatively, they can be provided via the system. + +### Building from source + +libplacebo is built using the [meson build system](http://mesonbuild.com/). +You can build the project using the following steps: + +```bash +$ DIR=./build +$ meson $DIR +$ ninja -C$DIR +``` + +To rebuild the project on changes, re-run `ninja -Cbuild`. If you wish to +install the build products to the configured prefix (typically `/usr/local/`), +you can run `ninja -Cbuild install`. Note that this is normally ill-advised +except for developers who know what they're doing. Regular users should rely +on distro packages. + +### Dependencies + +In principle, libplacebo has no mandatory dependencies - only optional ones. +However, to get a useful version of libplacebo, you most likely want to build +with support for either `opengl`, `vulkan` or `d3d11`. libplacebo built without +these can still be used (e.g. to generate GLSL shaders such as the ones used in +VLC), but the usefulness is severely impacted since most components will be +missing, impaired or otherwise not functional. + +A full list of optional dependencies each feature requires: + +- **glslang**: `glslang` + its related libraries (e.g. `libSPIRV.so`) +- **lcms**: `liblcms2` +- **libdovi**: `libdovi` +- **opengl**: `glad2` (*) +- **shaderc**: `libshaderc` +- **vulkan**: `libvulkan`, `python3-jinja2` (*) +- **xxhash**: `libxxhash` + +(*) This dependency is bundled automatically when doing a recursive clone. + +#### Vulkan support + +Because the vulkan backend relies on code generation at compile time, +`python3-jinja2` is a hard dependency of the build system. In addition to this, +the path to the Vulkan registry (`vk.xml`) must be locatable, ideally by +explicitly providing it via the `-Dvulkan-registry=/path/to/vk.xml` option, +unless it can be found in one of the built-in hard-coded locations. + +### Configuring + +To get a list of configuration options supported by libplacebo, after running +`meson $DIR` you can run `meson configure $DIR`, e.g.: + +```bash +$ meson $DIR +$ meson configure $DIR +``` + +If you want to disable a component, for example Vulkan support, you can +explicitly set it to `disabled`, i.e.: + +```bash +$ meson configure $DIR -Dvulkan=disabled -Dshaderc=disabled +$ ninja -C$DIR +``` + +### Testing + +To enable building and executing the tests, you need to build with +`tests` enabled, i.e.: + +```bash +$ meson configure $DIR -Dtests=true +$ ninja -C$DIR test +``` + +### Benchmarking + +A naive benchmark suite is provided as an extra test case, disabled by default +(due to the high execution time required). To enable it, use the `bench` +option: + +```bash +$ meson configure $DIR -Dbench=true +$ meson test -C$DIR benchmark --verbose +``` + +## Using + +For full documentation of the API, refer to the above [API +Overview](#api-overview) as well as the [public header +files](src/include/libplacebo). You can find additional examples of how to use +the various components in the [demo programs](demos) as well as in the [unit +tests](src/tests). 
diff --git a/RELEASING.md b/RELEASING.md new file mode 100644 index 0000000..00c2832 --- /dev/null +++ b/RELEASING.md @@ -0,0 +1,23 @@ +# New release steps + +## Pre-release (vX.Y.0-rcN) + +1. Tag `vX.Y.0-rcN` on `master` + +## Normal release (vX.Y.0) + +1. Tag `vX.Y.0` on `master` +2. Create version branch `vX.Y` +3. Force-push `release` branch (or fast-forward if possible) +4. Update topic on IRC #libplacebo +5. Bump 'X' version number in meson.build, for next release (optional) +6. Tag release on github + +## Bugfix release (vX.Y.Z) + +1. Cherry-pick bug fixes onto version branch (`vX.Y`) +2. Update `Z` version number in `meson.build` +3. Tag `vX.Y.Z` on this branch +4. Fast-forward `release` branch iff this is the latest major release +5. Update topic on IRC #libplacebo +6. Tag release on github @@ -0,0 +1,4 @@ +#!/bin/sh +DIR=./build +[ -d $DIR ] || meson $DIR +ninja -C$DIR diff --git a/demos/LICENSE b/demos/LICENSE new file mode 100644 index 0000000..0e259d4 --- /dev/null +++ b/demos/LICENSE @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. 
publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. 
No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. diff --git a/demos/colors.c b/demos/colors.c new file mode 100644 index 0000000..41712e1 --- /dev/null +++ b/demos/colors.c @@ -0,0 +1,88 @@ +/* Simplistic demo that just makes the window colorful, including alpha + * transparency if supported by the windowing system. + * + * License: CC0 / Public Domain + */ + +#include <assert.h> +#include <errno.h> +#include <math.h> +#include <string.h> + +#include "common.h" +#include "pl_clock.h" +#include "window.h" + +static pl_log logger; +static struct window *win; + +static void uninit(int ret) +{ + window_destroy(&win); + pl_log_destroy(&logger); + exit(ret); +} + +int main(int argc, char **argv) +{ + logger = pl_log_create(PL_API_VER, pl_log_params( + .log_cb = pl_log_color, + .log_level = PL_LOG_DEBUG, + )); + + win = window_create(logger, &(struct window_params) { + .title = "colors demo", + .width = 640, + .height = 480, + .alpha = true, + }); + if (!win) + uninit(1); + + pl_clock_t ts_start, ts; + if ((ts_start = pl_clock_now()) == 0) { + uninit(1); + } + + while (!win->window_lost) { + if (window_get_key(win, KEY_ESC)) + break; + + struct pl_swapchain_frame frame; + bool ok = pl_swapchain_start_frame(win->swapchain, &frame); + if (!ok) { + // Something unexpected happened, perhaps the window is not + // visible? Wait for events and try again. 
+ window_poll(win, true); + continue; + } + + if ((ts = pl_clock_now()) == 0) + uninit(1); + + const double period = 10.; // in seconds + double secs = fmod(pl_clock_diff(ts, ts_start), period); + + double pos = 2 * M_PI * secs / period; + float alpha = (cos(pos) + 1.0) / 2.0; + + assert(frame.fbo->params.blit_dst); + pl_tex_clear(win->gpu, frame.fbo, (float[4]) { + alpha * (sinf(2 * pos + 0.0) + 1.0) / 2.0, + alpha * (sinf(2 * pos + 2.0) + 1.0) / 2.0, + alpha * (sinf(2 * pos + 4.0) + 1.0) / 2.0, + alpha, + }); + + ok = pl_swapchain_submit_frame(win->swapchain); + if (!ok) { + fprintf(stderr, "libplacebo: failed submitting frame!\n"); + uninit(3); + } + + pl_swapchain_swap_buffers(win->swapchain); + window_poll(win, false); + } + + uninit(0); +} diff --git a/demos/common.h b/demos/common.h new file mode 100644 index 0000000..c768a7c --- /dev/null +++ b/demos/common.h @@ -0,0 +1,11 @@ +// License: CC0 / Public Domain +#pragma once + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> + +#include <libplacebo/log.h> +#include <libplacebo/renderer.h> + +#include "config_demos.h" diff --git a/demos/meson.build b/demos/meson.build new file mode 100644 index 0000000..fef665a --- /dev/null +++ b/demos/meson.build @@ -0,0 +1,170 @@ +glfw = dependency('glfw3', required: false) +sdl = dependency('sdl2', required: false) +sdl_image = dependency('SDL2_image', required: false) + +ffmpeg_deps = [ + dependency('libavcodec', required: false), + dependency('libavformat', required: false), + dependency('libavutil', required: false), +] + +ffmpeg_found = true +foreach dep : ffmpeg_deps + ffmpeg_found = ffmpeg_found and dep.found() +endforeach + +nuklear = disabler() +nuklear_inc = include_directories('./3rdparty/nuklear') +if cc.has_header('nuklear.h', include_directories: nuklear_inc) + nuklear_lib = static_library('nuklear', + include_directories: nuklear_inc, + c_args: ['-O2', '-Wno-missing-prototypes'], + dependencies: [ libplacebo, libm ], + sources: 'ui.c', + ) + + nuklear = declare_dependency( + include_directories: nuklear_inc, + link_with: nuklear_lib, + ) +else + warning('Nuklear was not found in `demos/3rdparty`. 
Please run ' + + '`git submodule update --init` followed by `meson --wipe`.') +endif + +conf_demos = configuration_data() +conf_demos.set('HAVE_NUKLEAR', nuklear.found()) +conf_demos.set('HAVE_EGL', cc.check_header('EGL/egl.h', required: false)) + +apis = [] + +# Enable all supported combinations of API and windowing system +if glfw.found() + if components.get('vulkan') + conf_demos.set('HAVE_GLFW_VULKAN', true) + apis += static_library('glfw-vk', + dependencies: [libplacebo, libm, glfw, vulkan_headers], + sources: 'window_glfw.c', + c_args: ['-DUSE_VK'], + include_directories: vulkan_headers_inc, + ) + endif + + if components.get('opengl') + conf_demos.set('HAVE_GLFW_OPENGL', true) + apis += static_library('glfw-gl', + dependencies: [libplacebo, glfw], + sources: 'window_glfw.c', + c_args: '-DUSE_GL', + ) + endif + + if components.get('d3d11') + conf_demos.set('HAVE_GLFW_D3D11', true) + apis += static_library('glfw-d3d11', + dependencies: [libplacebo, glfw], + sources: 'window_glfw.c', + c_args: '-DUSE_D3D11', + ) + endif +endif + +if sdl.found() + if components.get('vulkan') + conf_demos.set('HAVE_SDL_VULKAN', true) + apis += static_library('sdl-vk', + dependencies: [libplacebo, sdl, vulkan_headers], + sources: 'window_sdl.c', + c_args: ['-DUSE_VK'], + include_directories: vulkan_headers_inc, + ) + endif + + if components.get('opengl') + conf_demos.set('HAVE_SDL_OPENGL', true) + apis += static_library('sdl-gl', + dependencies: [libplacebo, sdl], + sources: 'window_sdl.c', + c_args: '-DUSE_GL', + ) + endif +endif + +configure_file( + output: 'config_demos.h', + configuration: conf_demos, +) + +if apis.length() == 0 + warning('Demos enabled but no supported combination of windowing system ' + + 'and graphical APIs was found. Demo programs require either GLFW or ' + + 'SDL and either Vulkan or OpenGL to function.') +else + + additional_dep = [] + if host_machine.system() == 'windows' + additional_dep += cc.find_library('winmm') + endif + + dep = declare_dependency( + dependencies: [ libplacebo, build_deps ] + additional_dep, + sources: ['window.c', 'utils.c'], + include_directories: vulkan_headers_inc, + link_with: apis, + ) + + # Graphical demo programs + executable('colors', 'colors.c', + dependencies: [ dep, pl_clock, libm ], + link_args: link_args, + link_depends: link_depends, + ) + + if sdl_image.found() + executable('sdlimage', 'sdlimage.c', + dependencies: [ dep, libm, sdl_image ], + link_args: link_args, + link_depends: link_depends, + ) + endif + + if ffmpeg_found + plplay_deps = [ dep, pl_thread, pl_clock ] + ffmpeg_deps + if nuklear.found() + plplay_deps += nuklear + endif + if host_machine.system() == 'windows' + plplay_deps += cc.find_library('shlwapi', required: true) + endif + plplay_sources = ['plplay.c', 'settings.c'] + if host_machine.system() == 'windows' + windows = import('windows') + plplay_sources += windows.compile_resources(demos_rc, depends: version_h, + include_directories: meson.project_source_root()/'win32') + endif + executable('plplay', plplay_sources, + dependencies: plplay_deps, + link_args: link_args, + link_depends: link_depends, + install: true, + ) + endif + +endif + +# Headless vulkan demos +if components.get('vk-proc-addr') + executable('video-filtering', 'video-filtering.c', + dependencies: [ libplacebo, pl_clock, pl_thread, vulkan_loader ], + c_args: '-O2', + link_args: link_args, + link_depends: link_depends, + ) + + executable('multigpu-bench', 'multigpu-bench.c', + dependencies: [ libplacebo, pl_clock, vulkan_loader ], + c_args: '-O2', + 
link_args: link_args, + link_depends: link_depends, + ) +endif diff --git a/demos/multigpu-bench.c b/demos/multigpu-bench.c new file mode 100644 index 0000000..75f1135 --- /dev/null +++ b/demos/multigpu-bench.c @@ -0,0 +1,484 @@ +/* GPU->GPU transfer benchmarks. Requires some manual setup. + * + * License: CC0 / Public Domain + */ + +#include <assert.h> +#include <stddef.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> + +#include <libplacebo/gpu.h> +#include <libplacebo/vulkan.h> + +#include "pl_clock.h" + +#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1)) + +enum { + // Image configuration + NUM_TEX = 16, + WIDTH = 1920, + HEIGHT = 1080, + DEPTH = 16, + COMPS = 1, + + // Queue configuration + NUM_QUEUES = NUM_TEX, + ASYNC_TX = 1, + ASYNC_COMP = 1, + + // Buffer configuration + PTR_ALIGN = 4096, + PIXEL_PITCH = DEPTH / 8, + ROW_PITCH = ALIGN2(WIDTH * PIXEL_PITCH, 256), + IMAGE_SIZE = ROW_PITCH * HEIGHT, + BUFFER_SIZE = IMAGE_SIZE + PTR_ALIGN - 1, + + // Test configuration + TEST_MS = 1500, + WARMUP_MS = 500, + POLL_FREQ = 10, +}; + +static uint8_t* page_align(uint8_t *data) +{ + return (uint8_t *) ALIGN2((uintptr_t) data, PTR_ALIGN); +} + +enum mem_owner { + CPU, + SRC, + DST, + NUM_MEM_OWNERS, +}; + +enum mem_type { + RAM, + GPU, + NUM_MEM_TYPES, +}; + +// This is attached to every `pl_tex.params.user_data` +struct buffers { + pl_gpu gpu; + pl_buf buf[NUM_MEM_TYPES]; + pl_buf exported[NUM_MEM_TYPES]; + pl_buf imported[NUM_MEM_TYPES]; + struct pl_tex_transfer_params async; +}; + +static struct buffers *alloc_buffers(pl_gpu gpu) +{ + struct buffers *buffers = malloc(sizeof(*buffers)); + *buffers = (struct buffers) { .gpu = gpu }; + + for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) { + buffers->buf[type] = pl_buf_create(gpu, pl_buf_params( + .size = BUFFER_SIZE, + .memory_type = type == RAM ? PL_BUF_MEM_HOST : PL_BUF_MEM_DEVICE, + .host_mapped = true, + )); + if (!buffers->buf[type]) + exit(2); + + if (gpu->export_caps.buf & PL_HANDLE_DMA_BUF) { + buffers->exported[type] = pl_buf_create(gpu, pl_buf_params( + .size = BUFFER_SIZE, + .memory_type = type == RAM ? PL_BUF_MEM_HOST : PL_BUF_MEM_DEVICE, + .export_handle = PL_HANDLE_DMA_BUF, + )); + } + } + + return buffers; +} + +static void free_buffers(struct buffers *buffers) +{ + for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) { + pl_buf_destroy(buffers->gpu, &buffers->buf[type]); + pl_buf_destroy(buffers->gpu, &buffers->exported[type]); + pl_buf_destroy(buffers->gpu, &buffers->imported[type]); + } + free(buffers); +} + +static void link_buffers(pl_gpu gpu, struct buffers *buffers, + const struct buffers *import) +{ + if (!(gpu->import_caps.buf & PL_HANDLE_DMA_BUF)) + return; + + for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) { + if (!import->exported[type]) + continue; + buffers->imported[type] = pl_buf_create(gpu, pl_buf_params( + .size = BUFFER_SIZE, + .memory_type = type == RAM ? 
PL_BUF_MEM_HOST : PL_BUF_MEM_DEVICE, + .import_handle = PL_HANDLE_DMA_BUF, + .shared_mem = import->exported[type]->shared_mem, + )); + } +} + +struct ctx { + pl_gpu srcgpu, dstgpu; + pl_tex src, dst; + + // for copy-based methods + enum mem_owner owner; + enum mem_type type; + bool noimport; + bool async; +}; + +static void await_buf(pl_gpu gpu, pl_buf buf) +{ + while (pl_buf_poll(gpu, buf, UINT64_MAX)) + ; // do nothing +} + +static void async_upload(void *priv) +{ + struct buffers *buffers = priv; + pl_tex_upload(buffers->gpu, &buffers->async); +} + +static inline void copy_ptr(struct ctx ctx) +{ + const pl_gpu srcgpu = ctx.srcgpu, dstgpu = ctx.dstgpu; + const pl_tex src = ctx.src, dst = ctx.dst; + struct buffers *srcbuffers = src->params.user_data; + struct buffers *dstbuffers = dst->params.user_data; + pl_buf buf = NULL; + uint8_t *data = NULL; + + if (ctx.owner == CPU) { + static uint8_t static_buffer[BUFFER_SIZE]; + data = page_align(static_buffer); + } else { + struct buffers *b = ctx.owner == SRC ? srcbuffers : dstbuffers; + buf = b->buf[ctx.type]; + data = page_align(buf->data); + await_buf(b->gpu, buf); + } + + struct pl_tex_transfer_params src_params = { + .tex = src, + .row_pitch = ROW_PITCH, + .no_import = ctx.noimport, + }; + + if (ctx.owner == SRC) { + src_params.buf = buf; + src_params.buf_offset = data - buf->data; + } else { + src_params.ptr = data; + } + + struct pl_tex_transfer_params dst_params = { + .tex = dst, + .row_pitch = ROW_PITCH, + .no_import = ctx.noimport, + }; + + if (ctx.owner == DST) { + dst_params.buf = buf; + dst_params.buf_offset = data - buf->data; + } else { + dst_params.ptr = data; + } + + if (ctx.async) { + src_params.callback = async_upload; + src_params.priv = dstbuffers; + dstbuffers->async = dst_params; + pl_tex_download(srcgpu, &src_params); + } else { + pl_tex_download(srcgpu, &src_params); + pl_tex_upload(dstgpu, &dst_params); + } +} + +static inline void copy_interop(struct ctx ctx) +{ + const pl_gpu srcgpu = ctx.srcgpu, dstgpu = ctx.dstgpu; + const pl_tex src = ctx.src, dst = ctx.dst; + struct buffers *srcbuffers = src->params.user_data; + struct buffers *dstbuffers = dst->params.user_data; + + struct pl_tex_transfer_params src_params = { + .tex = src, + .row_pitch = ROW_PITCH, + }; + + struct pl_tex_transfer_params dst_params = { + .tex = dst, + .row_pitch = ROW_PITCH, + }; + + if (ctx.owner == SRC) { + src_params.buf = srcbuffers->exported[ctx.type]; + dst_params.buf = dstbuffers->imported[ctx.type]; + } else { + src_params.buf = srcbuffers->imported[ctx.type]; + dst_params.buf = dstbuffers->exported[ctx.type]; + } + + await_buf(srcgpu, src_params.buf); + if (ctx.async) { + src_params.callback = async_upload; + src_params.priv = dstbuffers; + dstbuffers->async = dst_params; + pl_tex_download(srcgpu, &src_params); + } else { + pl_tex_download(srcgpu, &src_params); + await_buf(srcgpu, src_params.buf); // manual cross-GPU synchronization + pl_tex_upload(dstgpu, &dst_params); + } +} + +typedef void method(struct ctx ctx); + +static double bench(struct ctx ctx, pl_tex srcs[], pl_tex dsts[], method fun) +{ + const pl_gpu srcgpu = ctx.srcgpu, dstgpu = ctx.dstgpu; + pl_clock_t start_warmup = 0, start_test = 0; + uint64_t frames = 0, frames_warmup = 0; + + start_warmup = pl_clock_now(); + do { + const int idx = frames % NUM_TEX; + ctx.src = srcs[idx]; + ctx.dst = dsts[idx]; + + // Generate some quasi-unique data in the source + float x = M_E * (frames / 100.0); + pl_tex_clear(srcgpu, ctx.src, (float[4]) { + sinf(x + 0.0) / 2.0 + 0.5, + sinf(x + 
2.0) / 2.0 + 0.5, + sinf(x + 4.0) / 2.0 + 0.5, + 1.0, + }); + + if (fun) + fun(ctx); + + pl_gpu_flush(srcgpu); // to rotate queues + pl_gpu_flush(dstgpu); + frames++; + + if (frames % POLL_FREQ == 0) { + pl_clock_t now = pl_clock_now(); + if (start_test) { + if (pl_clock_diff(now, start_test) > TEST_MS * 1e-3) + break; + } else if (pl_clock_diff(now, start_warmup) > WARMUP_MS * 1e-3) { + start_test = now; + frames_warmup = frames; + } + } + } while (true); + + pl_gpu_finish(srcgpu); + pl_gpu_finish(dstgpu); + + return pl_clock_diff(pl_clock_now(), start_test) / (frames - frames_warmup); +} + +static void run_tests(pl_gpu srcgpu, pl_gpu dstgpu) +{ + const enum pl_fmt_caps caps = PL_FMT_CAP_HOST_READABLE; + pl_fmt srcfmt = pl_find_fmt(srcgpu, PL_FMT_UNORM, COMPS, DEPTH, DEPTH, caps); + pl_fmt dstfmt = pl_find_fmt(dstgpu, PL_FMT_UNORM, COMPS, DEPTH, DEPTH, caps); + if (!srcfmt || !dstfmt) + exit(2); + + pl_tex src[NUM_TEX], dst[NUM_TEX]; + for (int i = 0; i < NUM_TEX; i++) { + struct buffers *srcbuffers = alloc_buffers(srcgpu); + struct buffers *dstbuffers = alloc_buffers(dstgpu); + if (!memcmp(srcgpu->uuid, dstgpu->uuid, sizeof(srcgpu->uuid))) { + link_buffers(srcgpu, srcbuffers, dstbuffers); + link_buffers(dstgpu, dstbuffers, srcbuffers); + } + + src[i] = pl_tex_create(srcgpu, pl_tex_params( + .w = WIDTH, + .h = HEIGHT, + .format = srcfmt, + .host_readable = true, + .blit_dst = true, + .user_data = srcbuffers, + )); + + dst[i] = pl_tex_create(dstgpu, pl_tex_params( + .w = WIDTH, + .h = HEIGHT, + .format = dstfmt, + .host_writable = true, + .blit_dst = true, + .user_data = dstbuffers, + )); + + if (!src[i] || !dst[i]) + exit(2); + } + + struct ctx ctx = { + .srcgpu = srcgpu, + .dstgpu = dstgpu, + }; + + static const char *owners[] = { + [CPU] = "cpu", + [SRC] = "src", + [DST] = "dst", + }; + + static const char *types[] = { + [RAM] = "ram", + [GPU] = "gpu", + }; + + double baseline = bench(ctx, src, dst, NULL); + + // Test all possible generic copy methods + for (enum mem_owner owner = 0; owner < NUM_MEM_OWNERS; owner++) { + for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) { + for (int async = 0; async <= 1; async++) { + for (int noimport = 0; noimport <= 1; noimport++) { + // Blacklist undesirable configurations: + if (owner == CPU && type != RAM) + continue; // impossible + if (owner == CPU && async) + continue; // no synchronization on static buffer + if (owner == SRC && type == GPU) + continue; // GPU readback is orders of magnitude too slow + if (owner == DST && !noimport) + continue; // exhausts source address space + + struct ctx cfg = ctx; + cfg.noimport = noimport; + cfg.owner = owner; + cfg.type = type; + cfg.async = async; + + printf(" %s %s %s %s : ", + owners[owner], types[type], + noimport ? "memcpy" : " ", + async ? 
"async" : " "); + + double dur = bench(cfg, src, dst, copy_ptr) - baseline; + printf("avg %.0f μs\t%.3f fps\n", + 1e6 * dur, 1.0 / dur); + } + } + } + } + + // Test DMABUF interop when supported + for (enum mem_owner owner = 0; owner < NUM_MEM_OWNERS; owner++) { + for (enum mem_type type = 0; type < NUM_MEM_TYPES; type++) { + for (int async = 0; async <= 1; async++) { + struct buffers *buffers; + switch (owner) { + case SRC: + buffers = dst[0]->params.user_data; + if (!buffers->imported[type]) + continue; + break; + case DST: + buffers = src[0]->params.user_data; + if (!buffers->imported[type]) + continue; + break; + default: continue; + } + + struct ctx cfg = ctx; + cfg.owner = owner; + cfg.type = type; + + printf(" %s %s %s %s : ", + owners[owner], types[type], "dmabuf", + async ? "async" : " "); + + double dur = bench(cfg, src, dst, copy_interop) - baseline; + printf("avg %.0f μs\t%.3f fps\n", + 1e6 * dur, 1.0 / dur); + } + } + } + + for (int i = 0; i < NUM_TEX; i++) { + free_buffers(src[i]->params.user_data); + free_buffers(dst[i]->params.user_data); + pl_tex_destroy(srcgpu, &src[i]); + pl_tex_destroy(dstgpu, &dst[i]); + } +} + +int main(int argc, const char *argv[]) +{ + if (argc < 3) { + fprintf(stderr, "Usage: %s 'Device 1' 'Device 2'\n\n", argv[0]); + fprintf(stderr, "(Use `vulkaninfo` for a list of devices)\n"); + exit(1); + } + + pl_log log = pl_log_create(PL_API_VER, pl_log_params( + .log_cb = pl_log_color, + .log_level = PL_LOG_WARN, + )); + + pl_vk_inst inst = pl_vk_inst_create(log, pl_vk_inst_params( + .debug = false, + )); + + pl_vulkan dev1 = pl_vulkan_create(log, pl_vulkan_params( + .device_name = argv[1], + .queue_count = NUM_QUEUES, + .async_transfer = ASYNC_TX, + .async_compute = ASYNC_COMP, + )); + + pl_vulkan dev2 = pl_vulkan_create(log, pl_vulkan_params( + .device_name = argv[2], + .queue_count = NUM_QUEUES, + .async_transfer = ASYNC_TX, + .async_compute = ASYNC_COMP, + )); + + if (!dev1 || !dev2) { + fprintf(stderr, "Failed creating Vulkan device!\n"); + exit(1); + } + + if (ROW_PITCH % dev1->gpu->limits.align_tex_xfer_pitch) { + fprintf(stderr, "Warning: Row pitch %d is not a multiple of optimal " + "transfer pitch (%zu) for GPU '%s'\n", ROW_PITCH, + dev1->gpu->limits.align_tex_xfer_pitch, argv[1]); + } + + if (ROW_PITCH % dev2->gpu->limits.align_tex_xfer_pitch) { + fprintf(stderr, "Warning: Row pitch %d is not a multiple of optimal " + "transfer pitch (%zu) for GPU '%s'\n", ROW_PITCH, + dev2->gpu->limits.align_tex_xfer_pitch, argv[2]); + } + + printf("%s -> %s:\n", argv[1], argv[2]); + run_tests(dev1->gpu, dev2->gpu); + if (strcmp(argv[1], argv[2])) { + printf("%s -> %s:\n", argv[2], argv[1]); + run_tests(dev2->gpu, dev1->gpu); + } + + pl_vulkan_destroy(&dev1); + pl_vulkan_destroy(&dev2); + pl_vk_inst_destroy(&inst); + pl_log_destroy(&log); +} diff --git a/demos/plplay.c b/demos/plplay.c new file mode 100644 index 0000000..901653e --- /dev/null +++ b/demos/plplay.c @@ -0,0 +1,766 @@ +/* Example video player based on ffmpeg. Designed to expose every libplacebo + * option for testing purposes. Not a serious video player, no real error + * handling. Simply infinitely loops its input. 
+ * + * License: CC0 / Public Domain + */ + +#include <stdatomic.h> + +#include <libavutil/cpu.h> + +#include "common.h" +#include "window.h" +#include "utils.h" +#include "plplay.h" +#include "pl_clock.h" +#include "pl_thread.h" + +#ifdef HAVE_NUKLEAR +#include "ui.h" +#else +struct ui; +static void ui_destroy(struct ui **ui) {} +static bool ui_draw(struct ui *ui, const struct pl_swapchain_frame *frame) { return true; }; +#endif + +#include <libplacebo/utils/libav.h> + +static inline void log_time(struct timing *t, double ts) +{ + t->sum += ts; + t->sum2 += ts * ts; + t->peak = fmax(t->peak, ts); + t->count++; +} + +static void uninit(struct plplay *p) +{ + if (p->decoder_thread_created) { + p->exit_thread = true; + pl_queue_push(p->queue, NULL); // Signal EOF to wake up thread + pl_thread_join(p->decoder_thread); + } + + pl_queue_destroy(&p->queue); + pl_renderer_destroy(&p->renderer); + pl_options_free(&p->opts); + + for (int i = 0; i < p->shader_num; i++) { + pl_mpv_user_shader_destroy(&p->shader_hooks[i]); + free(p->shader_paths[i]); + } + + for (int i = 0; i < MAX_FRAME_PASSES; i++) + pl_shader_info_deref(&p->frame_info[i].shader); + for (int j = 0; j < MAX_BLEND_FRAMES; j++) { + for (int i = 0; i < MAX_BLEND_PASSES; i++) + pl_shader_info_deref(&p->blend_info[j][i].shader); + } + + free(p->shader_hooks); + free(p->shader_paths); + free(p->icc_name); + pl_icc_close(&p->icc); + + if (p->cache) { + FILE *file = fopen(p->cache_file, "wb"); + if (file) { + pl_cache_save_file(p->cache, file); + fclose(file); + } + pl_cache_destroy(&p->cache); + } + + // Free this before destroying the window to release associated GPU buffers + avcodec_free_context(&p->codec); + avformat_free_context(p->format); + + ui_destroy(&p->ui); + window_destroy(&p->win); + + pl_log_destroy(&p->log); + memset(p, 0, sizeof(*p)); +} + +static bool open_file(struct plplay *p, const char *filename) +{ + static const int av_log_level[] = { + [PL_LOG_NONE] = AV_LOG_QUIET, + [PL_LOG_FATAL] = AV_LOG_PANIC, + [PL_LOG_ERR] = AV_LOG_ERROR, + [PL_LOG_WARN] = AV_LOG_WARNING, + [PL_LOG_INFO] = AV_LOG_INFO, + [PL_LOG_DEBUG] = AV_LOG_VERBOSE, + [PL_LOG_TRACE] = AV_LOG_DEBUG, + }; + + av_log_set_level(av_log_level[p->args.verbosity]); + + printf("Opening file: '%s'\n", filename); + if (avformat_open_input(&p->format, filename, NULL, NULL) != 0) { + fprintf(stderr, "libavformat: Failed opening file!\n"); + return false; + } + + printf("Format: %s\n", p->format->iformat->name); + + if (p->format->duration != AV_NOPTS_VALUE) + printf("Duration: %.3f s\n", p->format->duration / 1e6); + + if (avformat_find_stream_info(p->format, NULL) < 0) { + fprintf(stderr, "libavformat: Failed finding stream info!\n"); + return false; + } + + // Find "best" video stream + int stream_idx = + av_find_best_stream(p->format, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0); + + if (stream_idx < 0) { + fprintf(stderr, "plplay: File contains no video streams?\n"); + return false; + } + + const AVStream *stream = p->format->streams[stream_idx]; + const AVCodecParameters *par = stream->codecpar; + printf("Found video track (stream %d)\n", stream_idx); + printf("Resolution: %d x %d\n", par->width, par->height); + + if (stream->avg_frame_rate.den && stream->avg_frame_rate.num) + printf("FPS: %f\n", av_q2d(stream->avg_frame_rate)); + + if (stream->r_frame_rate.den && stream->r_frame_rate.num) + printf("TBR: %f\n", av_q2d(stream->r_frame_rate)); + + if (stream->time_base.den && stream->time_base.num) + printf("TBN: %f\n", av_q2d(stream->time_base)); + + if (par->bit_rate) + 
printf("Bitrate: %"PRIi64" kbps\n", par->bit_rate / 1000); + + printf("Format: %s\n", av_get_pix_fmt_name(par->format)); + + p->stream = stream; + return true; +} + +static bool init_codec(struct plplay *p) +{ + assert(p->stream); + assert(p->win->gpu); + + const AVCodec *codec = avcodec_find_decoder(p->stream->codecpar->codec_id); + if (!codec) { + fprintf(stderr, "libavcodec: Failed finding matching codec\n"); + return false; + } + + p->codec = avcodec_alloc_context3(codec); + if (!p->codec) { + fprintf(stderr, "libavcodec: Failed allocating codec\n"); + return false; + } + + if (avcodec_parameters_to_context(p->codec, p->stream->codecpar) < 0) { + fprintf(stderr, "libavcodec: Failed copying codec parameters to codec\n"); + return false; + } + + printf("Codec: %s (%s)\n", codec->name, codec->long_name); + + const AVCodecHWConfig *hwcfg = 0; + if (p->args.hwdec) { + for (int i = 0; (hwcfg = avcodec_get_hw_config(codec, i)); i++) { + if (!pl_test_pixfmt(p->win->gpu, hwcfg->pix_fmt)) + continue; + if (!(hwcfg->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX)) + continue; + + int ret = av_hwdevice_ctx_create(&p->codec->hw_device_ctx, + hwcfg->device_type, + NULL, NULL, 0); + if (ret < 0) { + fprintf(stderr, "libavcodec: Failed opening HW device context, skipping\n"); + continue; + } + + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hwcfg->pix_fmt); + printf("Using hardware frame format: %s\n", desc->name); + p->codec->extra_hw_frames = 4; + break; + } + } + + if (!hwcfg) + printf("Using software decoding\n"); + + p->codec->thread_count = FFMIN(av_cpu_count() + 1, 16); + p->codec->get_buffer2 = pl_get_buffer2; + p->codec->opaque = &p->win->gpu; +#if LIBAVCODEC_VERSION_MAJOR < 60 + AV_NOWARN_DEPRECATED({ + p->codec->thread_safe_callbacks = 1; + }); +#endif +#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(58, 113, 100) + p->codec->export_side_data |= AV_CODEC_EXPORT_DATA_FILM_GRAIN; +#endif + + if (avcodec_open2(p->codec, codec, NULL) < 0) { + fprintf(stderr, "libavcodec: Failed opening codec\n"); + return false; + } + + return true; +} + +static bool map_frame(pl_gpu gpu, pl_tex *tex, + const struct pl_source_frame *src, + struct pl_frame *out_frame) +{ + AVFrame *frame = src->frame_data; + struct plplay *p = frame->opaque; + bool ok = pl_map_avframe_ex(gpu, out_frame, pl_avframe_params( + .frame = frame, + .tex = tex, + .map_dovi = !p->ignore_dovi, + )); + + av_frame_free(&frame); // references are preserved by `out_frame` + if (!ok) { + fprintf(stderr, "Failed mapping AVFrame!\n"); + return false; + } + + p->stats.mapped++; + pl_frame_copy_stream_props(out_frame, p->stream); + return true; +} + +static void unmap_frame(pl_gpu gpu, struct pl_frame *frame, + const struct pl_source_frame *src) +{ + pl_unmap_avframe(gpu, frame); +} + +static void discard_frame(const struct pl_source_frame *src) +{ + AVFrame *frame = src->frame_data; + struct plplay *p = frame->opaque; + p->stats.dropped++; + av_frame_free(&frame); + printf("Dropped frame with PTS %.3f\n", src->pts); +} + +static PL_THREAD_VOID decode_loop(void *arg) +{ + int ret; + struct plplay *p = arg; + AVPacket *packet = av_packet_alloc(); + AVFrame *frame = av_frame_alloc(); + if (!frame || !packet) + goto done; + + float frame_duration = av_q2d(av_inv_q(p->stream->avg_frame_rate)); + double first_pts = 0.0, base_pts = 0.0, last_pts = 0.0; + uint64_t num_frames = 0; + + while (!p->exit_thread) { + switch ((ret = av_read_frame(p->format, packet))) { + case 0: + if (packet->stream_index != p->stream->index) { + // Ignore unrelated 
packets + av_packet_unref(packet); + continue; + } + ret = avcodec_send_packet(p->codec, packet); + av_packet_unref(packet); + break; + case AVERROR_EOF: + // Send empty input to flush decoder + ret = avcodec_send_packet(p->codec, NULL); + break; + default: + fprintf(stderr, "libavformat: Failed reading packet: %s\n", + av_err2str(ret)); + goto done; + } + + if (ret < 0) { + fprintf(stderr, "libavcodec: Failed sending packet to decoder: %s\n", + av_err2str(ret)); + goto done; + } + + // Decode all frames from this packet + while ((ret = avcodec_receive_frame(p->codec, frame)) == 0) { + last_pts = frame->pts * av_q2d(p->stream->time_base); + if (num_frames++ == 0) + first_pts = last_pts; + frame->opaque = p; + (void) atomic_fetch_add(&p->stats.decoded, 1); + pl_queue_push_block(p->queue, UINT64_MAX, &(struct pl_source_frame) { + .pts = last_pts - first_pts + base_pts, + .duration = frame_duration, + .map = map_frame, + .unmap = unmap_frame, + .discard = discard_frame, + .frame_data = frame, + + // allow soft-disabling deinterlacing at the source frame level + .first_field = p->opts->params.deinterlace_params + ? pl_field_from_avframe(frame) + : PL_FIELD_NONE, + }); + frame = av_frame_alloc(); + } + + switch (ret) { + case AVERROR(EAGAIN): + continue; + case AVERROR_EOF: + if (num_frames <= 1) + goto done; // still image or empty file + // loop infinitely + ret = av_seek_frame(p->format, p->stream->index, 0, AVSEEK_FLAG_BACKWARD); + if (ret < 0) { + fprintf(stderr, "libavformat: Failed seeking in stream: %s\n", + av_err2str(ret)); + goto done; + } + avcodec_flush_buffers(p->codec); + base_pts += last_pts; + num_frames = 0; + continue; + default: + fprintf(stderr, "libavcodec: Failed decoding frame: %s\n", + av_err2str(ret)); + goto done; + } + } + +done: + pl_queue_push(p->queue, NULL); // Signal EOF to flush queue + av_packet_free(&packet); + av_frame_free(&frame); + PL_THREAD_RETURN(); +} + +static void update_colorspace_hint(struct plplay *p, const struct pl_frame_mix *mix) +{ + const struct pl_frame *frame = NULL; + + for (int i = 0; i < mix->num_frames; i++) { + if (mix->timestamps[i] > 0.0) + break; + frame = mix->frames[i]; + } + + if (!frame) + return; + + struct pl_color_space hint = {0}; + if (p->colorspace_hint) + hint = frame->color; + if (p->target_override) + apply_csp_overrides(p, &hint); + pl_swapchain_colorspace_hint(p->win->swapchain, &hint); +} + +static bool render_frame(struct plplay *p, const struct pl_swapchain_frame *frame, + const struct pl_frame_mix *mix) +{ + struct pl_frame target; + pl_options opts = p->opts; + pl_frame_from_swapchain(&target, frame); + update_settings(p, &target); + + if (p->target_override) { + target.repr = p->force_repr; + pl_color_repr_merge(&target.repr, &frame->color_repr); + apply_csp_overrides(p, &target.color); + + // Update ICC profile parameters dynamically + float target_luma = 0.0f; + if (!p->use_icc_luma) { + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .metadata = PL_HDR_METADATA_HDR10, // use only static HDR nits + .scaling = PL_HDR_NITS, + .color = &target.color, + .out_max = &target_luma, + )); + } + pl_icc_update(p->log, &p->icc, NULL, pl_icc_params( + .max_luma = target_luma, + .force_bpc = p->force_bpc, + )); + target.icc = p->icc; + } + + assert(mix->num_frames); + pl_rect2df crop = mix->frames[0]->crop; + if (p->stream->sample_aspect_ratio.num && p->target_zoom != ZOOM_RAW) { + float sar = av_q2d(p->stream->sample_aspect_ratio); + pl_rect2df_stretch(&crop, fmaxf(1.0f, sar), fmaxf(1.0f, 1.0 / sar)); + } + + // 
Apply target rotation and un-rotate crop relative to target + target.rotation = p->target_rot; + pl_rect2df_rotate(&crop, mix->frames[0]->rotation - target.rotation); + + switch (p->target_zoom) { + case ZOOM_PAD: + pl_rect2df_aspect_copy(&target.crop, &crop, 0.0); + break; + case ZOOM_CROP: + pl_rect2df_aspect_copy(&target.crop, &crop, 1.0); + break; + case ZOOM_STRETCH: + break; // target.crop already covers full image + case ZOOM_FIT: + pl_rect2df_aspect_fit(&target.crop, &crop, 0.0); + break; + case ZOOM_RAW: ; + // Ensure pixels are exactly aligned, to avoid fractional scaling + int w = roundf(fabsf(pl_rect_w(crop))); + int h = roundf(fabsf(pl_rect_h(crop))); + target.crop.x0 = roundf((pl_rect_w(target.crop) - w) / 2.0f); + target.crop.y0 = roundf((pl_rect_h(target.crop) - h) / 2.0f); + target.crop.x1 = target.crop.x0 + w; + target.crop.y1 = target.crop.y0 + h; + break; + case ZOOM_400: + case ZOOM_200: + case ZOOM_100: + case ZOOM_50: + case ZOOM_25: ; + const float z = powf(2.0f, (int) ZOOM_100 - p->target_zoom); + const float sx = z * fabsf(pl_rect_w(crop)) / pl_rect_w(target.crop); + const float sy = z * fabsf(pl_rect_h(crop)) / pl_rect_h(target.crop); + pl_rect2df_stretch(&target.crop, sx, sy); + break; + } + + struct pl_color_map_params *cpars = &opts->color_map_params; + if (cpars->visualize_lut) { + cpars->visualize_rect = (pl_rect2df) {0, 0, 1, 1}; + float tar = pl_rect2df_aspect(&target.crop); + pl_rect2df_aspect_set(&cpars->visualize_rect, 1.0f / tar, 0.0f); + } + + pl_clock_t ts_pre = pl_clock_now(); + if (!pl_render_image_mix(p->renderer, mix, &target, &opts->params)) + return false; + pl_clock_t ts_rendered = pl_clock_now(); + if (!ui_draw(p->ui, frame)) + return false; + pl_clock_t ts_ui_drawn = pl_clock_now(); + + log_time(&p->stats.render, pl_clock_diff(ts_rendered, ts_pre)); + log_time(&p->stats.draw_ui, pl_clock_diff(ts_ui_drawn, ts_rendered)); + + p->stats.rendered++; + return true; +} + +static bool render_loop(struct plplay *p) +{ + pl_options opts = p->opts; + + struct pl_queue_params qparams = { + .interpolation_threshold = 0.01, + .timeout = UINT64_MAX, + }; + + // Initialize the frame queue, blocking indefinitely until done + struct pl_frame_mix mix; + switch (pl_queue_update(p->queue, &mix, &qparams)) { + case PL_QUEUE_OK: break; + case PL_QUEUE_EOF: return true; + case PL_QUEUE_ERR: goto error; + default: abort(); + } + + struct pl_swapchain_frame frame; + update_colorspace_hint(p, &mix); + if (!pl_swapchain_start_frame(p->win->swapchain, &frame)) + goto error; + + // Disable background transparency by default if the swapchain does not + // appear to support alpha transaprency + if (frame.color_repr.alpha == PL_ALPHA_UNKNOWN) + opts->params.background_transparency = 0.0; + + if (!render_frame(p, &frame, &mix)) + goto error; + if (!pl_swapchain_submit_frame(p->win->swapchain)) + goto error; + + // Wait until rendering is complete. Do this before measuring the time + // start, to ensure we don't count initialization overhead as part of the + // first vsync. 
+ pl_gpu_finish(p->win->gpu); + p->stats.render = p->stats.draw_ui = (struct timing) {0}; + + pl_clock_t ts_start = 0, ts_prev = 0; + pl_swapchain_swap_buffers(p->win->swapchain); + window_poll(p->win, false); + + double pts_target = 0.0, prev_pts = 0.0; + + while (!p->win->window_lost) { + if (window_get_key(p->win, KEY_ESC)) + break; + + if (p->toggle_fullscreen) + window_toggle_fullscreen(p->win, !window_is_fullscreen(p->win)); + + update_colorspace_hint(p, &mix); + pl_clock_t ts_acquire = pl_clock_now(); + if (!pl_swapchain_start_frame(p->win->swapchain, &frame)) { + // Window stuck/invisible? Block for events and try again. + window_poll(p->win, true); + continue; + } + + pl_clock_t ts_pre_update = pl_clock_now(); + log_time(&p->stats.acquire, pl_clock_diff(ts_pre_update, ts_acquire)); + if (!ts_start) + ts_start = ts_pre_update; + + qparams.timeout = 0; // non-blocking update + qparams.radius = pl_frame_mix_radius(&p->opts->params); + qparams.pts = fmax(pts_target, pl_clock_diff(ts_pre_update, ts_start)); + p->stats.current_pts = qparams.pts; + if (qparams.pts != prev_pts) + log_time(&p->stats.pts_interval, qparams.pts - prev_pts); + prev_pts = qparams.pts; + +retry: + switch (pl_queue_update(p->queue, &mix, &qparams)) { + case PL_QUEUE_ERR: goto error; + case PL_QUEUE_EOF: + printf("End of file reached\n"); + return true; + case PL_QUEUE_OK: + break; + case PL_QUEUE_MORE: + qparams.timeout = UINT64_MAX; // retry in blocking mode + goto retry; + } + + pl_clock_t ts_post_update = pl_clock_now(); + log_time(&p->stats.update, pl_clock_diff(ts_post_update, ts_pre_update)); + + if (qparams.timeout) { + double stuck_ms = 1e3 * pl_clock_diff(ts_post_update, ts_pre_update); + fprintf(stderr, "Stalled for %.4f ms due to frame queue underrun!\n", stuck_ms); + ts_start += ts_post_update - ts_pre_update; // subtract time spent waiting + p->stats.stalled++; + p->stats.stalled_ms += stuck_ms; + } + + if (!render_frame(p, &frame, &mix)) + goto error; + + if (pts_target) { + pl_gpu_flush(p->win->gpu); + pl_clock_t ts_wait = pl_clock_now(); + double pts_now = pl_clock_diff(ts_wait, ts_start); + if (pts_target >= pts_now) { + log_time(&p->stats.sleep, pts_target - pts_now); + pl_thread_sleep(pts_target - pts_now); + } else { + double missed_ms = 1e3 * (pts_now - pts_target); + fprintf(stderr, "Missed PTS target %.3f (%.3f ms in the past)\n", + pts_target, missed_ms); + p->stats.missed++; + p->stats.missed_ms += missed_ms; + } + + pts_target = 0.0; + } + + pl_clock_t ts_pre_submit = pl_clock_now(); + if (!pl_swapchain_submit_frame(p->win->swapchain)) { + fprintf(stderr, "libplacebo: failed presenting frame!\n"); + goto error; + } + pl_clock_t ts_post_submit = pl_clock_now(); + log_time(&p->stats.submit, pl_clock_diff(ts_post_submit, ts_pre_submit)); + + if (ts_prev) + log_time(&p->stats.vsync_interval, pl_clock_diff(ts_post_submit, ts_prev)); + ts_prev = ts_post_submit; + + pl_swapchain_swap_buffers(p->win->swapchain); + pl_clock_t ts_post_swap = pl_clock_now(); + log_time(&p->stats.swap, pl_clock_diff(ts_post_swap, ts_post_submit)); + + window_poll(p->win, false); + + // In content-timed mode (frame mixing disabled), delay rendering + // until the next frame should become visible + if (!opts->params.frame_mixer) { + struct pl_source_frame next; + for (int i = 0;; i++) { + if (!pl_queue_peek(p->queue, i, &next)) + break; + if (next.pts > qparams.pts) { + pts_target = next.pts; + break; + } + } + } + + if (p->fps_override) + pts_target = fmax(pts_target, qparams.pts + 1.0 / p->fps); + } + + return true; 
+ +error: + fprintf(stderr, "Render loop failed, exiting early...\n"); + return false; +} + +static void info_callback(void *priv, const struct pl_render_info *info) +{ + struct plplay *p = priv; + switch (info->stage) { + case PL_RENDER_STAGE_FRAME: + if (info->index >= MAX_FRAME_PASSES) + return; + p->num_frame_passes = info->index + 1; + pl_dispatch_info_move(&p->frame_info[info->index], info->pass); + return; + + case PL_RENDER_STAGE_BLEND: + if (info->index >= MAX_BLEND_PASSES || info->count >= MAX_BLEND_FRAMES) + return; + p->num_blend_passes[info->count] = info->index + 1; + pl_dispatch_info_move(&p->blend_info[info->count][info->index], info->pass); + return; + + case PL_RENDER_STAGE_COUNT: + break; + } + + abort(); +} + +static struct plplay state; + +int main(int argc, char *argv[]) +{ + state = (struct plplay) { + .target_override = true, + .use_icc_luma = true, + .fps = 60.0, + .args = { + .preset = &pl_render_default_params, + .verbosity = PL_LOG_INFO, + }, + }; + + if (!parse_args(&state.args, argc, argv)) + return -1; + + state.log = pl_log_create(PL_API_VER, pl_log_params( + .log_cb = pl_log_color, + .log_level = state.args.verbosity, + )); + + pl_options opts = state.opts = pl_options_alloc(state.log); + pl_options_reset(opts, state.args.preset); + + // Enable this by default to save one click + opts->params.cone_params = &opts->cone_params; + + // Enable dynamic parameters by default, due to plplay's heavy reliance on + // GUI controls for dynamically adjusting render parameters. + opts->params.dynamic_constants = true; + + // Hook up our pass info callback + opts->params.info_callback = info_callback; + opts->params.info_priv = &state; + + struct plplay *p = &state; + if (!open_file(p, state.args.filename)) + goto error; + + const AVCodecParameters *par = p->stream->codecpar; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(par->format); + if (!desc) + goto error; + + struct window_params params = { + .title = "plplay", + .width = par->width, + .height = par->height, + .forced_impl = state.args.window_impl, + }; + + if (desc->flags & AV_PIX_FMT_FLAG_ALPHA) { + params.alpha = true; + opts->params.background_transparency = 1.0; + } + + p->win = window_create(p->log, ¶ms); + if (!p->win) + goto error; + + // Test the AVPixelFormat against the GPU capabilities + if (!pl_test_pixfmt(p->win->gpu, par->format)) { + fprintf(stderr, "Unsupported AVPixelFormat: %s\n", desc->name); + goto error; + } + +#ifdef HAVE_NUKLEAR + p->ui = ui_create(p->win->gpu); + if (!p->ui) + goto error; +#endif + + if (!init_codec(p)) + goto error; + + const char *cache_dir = get_cache_dir(&(char[512]) {0}); + if (cache_dir) { + int ret = snprintf(p->cache_file, sizeof(p->cache_file), "%s/plplay.cache", cache_dir); + if (ret > 0 && ret < sizeof(p->cache_file)) { + p->cache = pl_cache_create(pl_cache_params( + .log = p->log, + .max_total_size = 50 << 20, // 50 MB + )); + pl_gpu_set_cache(p->win->gpu, p->cache); + FILE *file = fopen(p->cache_file, "rb"); + if (file) { + pl_cache_load_file(p->cache, file); + fclose(file); + } + } + } + + p->queue = pl_queue_create(p->win->gpu); + int ret = pl_thread_create(&p->decoder_thread, decode_loop, p); + if (ret != 0) { + fprintf(stderr, "Failed creating decode thread: %s\n", strerror(errno)); + goto error; + } + + p->decoder_thread_created = true; + + p->renderer = pl_renderer_create(p->log, p->win->gpu); + if (!render_loop(p)) + goto error; + + printf("Exiting...\n"); + uninit(p); + return 0; + +error: + uninit(p); + return 1; +} diff --git a/demos/plplay.h 
b/demos/plplay.h new file mode 100644 index 0000000..2036562 --- /dev/null +++ b/demos/plplay.h @@ -0,0 +1,138 @@ +#include <libavcodec/avcodec.h> +#include <libavformat/avformat.h> + +#include <libplacebo/options.h> +#include <libplacebo/utils/frame_queue.h> + +#include "common.h" +#include "pl_thread.h" + +#define MAX_FRAME_PASSES 256 +#define MAX_BLEND_PASSES 8 +#define MAX_BLEND_FRAMES 8 + +enum { + ZOOM_PAD = 0, + ZOOM_CROP, + ZOOM_STRETCH, + ZOOM_FIT, + ZOOM_RAW, + ZOOM_400, + ZOOM_200, + ZOOM_100, + ZOOM_50, + ZOOM_25, + ZOOM_COUNT, +}; + +struct plplay_args { + const struct pl_render_params *preset; + enum pl_log_level verbosity; + const char *window_impl; + const char *filename; + bool hwdec; +}; + +bool parse_args(struct plplay_args *args, int argc, char *argv[]); + +struct plplay { + struct plplay_args args; + struct window *win; + struct ui *ui; + char cache_file[512]; + + // libplacebo + pl_log log; + pl_renderer renderer; + pl_queue queue; + pl_cache cache; + + // libav* + AVFormatContext *format; + AVCodecContext *codec; + const AVStream *stream; // points to first video stream of `format` + pl_thread decoder_thread; + bool decoder_thread_created; + bool exit_thread; + + // settings / ui state + pl_options opts; + pl_rotation target_rot; + int target_zoom; + bool colorspace_hint; + bool colorspace_hint_dynamic; + bool ignore_dovi; + bool toggle_fullscreen; + bool advanced_scalers; + + bool target_override; // if false, fields below are ignored + struct pl_color_repr force_repr; + enum pl_color_primaries force_prim; + enum pl_color_transfer force_trc; + struct pl_hdr_metadata force_hdr; + bool force_hdr_enable; + bool fps_override; + float fps; + + // ICC profile + pl_icc_object icc; + char *icc_name; + bool use_icc_luma; + bool force_bpc; + + // custom shaders + const struct pl_hook **shader_hooks; + char **shader_paths; + size_t shader_num; + size_t shader_size; + + // pass metadata + struct pl_dispatch_info blend_info[MAX_BLEND_FRAMES][MAX_BLEND_PASSES]; + struct pl_dispatch_info frame_info[MAX_FRAME_PASSES]; + int num_frame_passes; + int num_blend_passes[MAX_BLEND_FRAMES]; + + // playback statistics + struct { + _Atomic uint32_t decoded; + uint32_t rendered; + uint32_t mapped; + uint32_t dropped; + uint32_t missed; + uint32_t stalled; + double missed_ms; + double stalled_ms; + double current_pts; + + struct timing { + double sum, sum2, peak; + uint64_t count; + } acquire, update, render, draw_ui, sleep, submit, swap, + vsync_interval, pts_interval; + } stats; +}; + +void update_settings(struct plplay *p, const struct pl_frame *target); + +static inline void apply_csp_overrides(struct plplay *p, struct pl_color_space *csp) +{ + if (p->force_prim) { + csp->primaries = p->force_prim; + csp->hdr.prim = *pl_raw_primaries_get(csp->primaries); + } + if (p->force_trc) + csp->transfer = p->force_trc; + if (p->force_hdr_enable) { + struct pl_hdr_metadata fix = p->force_hdr; + fix.prim = csp->hdr.prim; + csp->hdr = fix; + } else if (p->colorspace_hint_dynamic) { + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = csp, + .metadata = PL_HDR_METADATA_ANY, + .scaling = PL_HDR_NITS, + .out_min = &csp->hdr.min_luma, + .out_max = &csp->hdr.max_luma, + )); + } +} diff --git a/demos/screenshots/plplay1.png b/demos/screenshots/plplay1.png Binary files differnew file mode 100644 index 0000000..ce84d89 --- /dev/null +++ b/demos/screenshots/plplay1.png diff --git a/demos/screenshots/plplay2.png b/demos/screenshots/plplay2.png Binary files differnew file mode 100644 index 
0000000..ae88051 --- /dev/null +++ b/demos/screenshots/plplay2.png diff --git a/demos/screenshots/plplay3.png b/demos/screenshots/plplay3.png Binary files differnew file mode 100644 index 0000000..9ec4126 --- /dev/null +++ b/demos/screenshots/plplay3.png diff --git a/demos/screenshots/plplay4.png b/demos/screenshots/plplay4.png Binary files differnew file mode 100644 index 0000000..873be13 --- /dev/null +++ b/demos/screenshots/plplay4.png diff --git a/demos/screenshots/plplay5.png b/demos/screenshots/plplay5.png Binary files differnew file mode 100644 index 0000000..c23d609 --- /dev/null +++ b/demos/screenshots/plplay5.png diff --git a/demos/screenshots/plplay6.png b/demos/screenshots/plplay6.png Binary files differnew file mode 100644 index 0000000..15ea8fc --- /dev/null +++ b/demos/screenshots/plplay6.png diff --git a/demos/sdlimage.c b/demos/sdlimage.c new file mode 100644 index 0000000..87e6d03 --- /dev/null +++ b/demos/sdlimage.c @@ -0,0 +1,281 @@ +/* Simple image viewer that opens an image using SDL2_image and presents it + * to the screen. + * + * License: CC0 / Public Domain + */ + +#include <SDL_image.h> + +#include "common.h" +#include "window.h" + +#include <libplacebo/renderer.h> +#include <libplacebo/shaders/lut.h> +#include <libplacebo/utils/upload.h> + +// Static configuration, done in the file to keep things simple +static const char *icc_profile = ""; // path to ICC profile +static const char *lut_file = ""; // path to .cube lut + +// Program state +static pl_log logger; +static struct window *win; + +// For rendering +static pl_tex img_tex; +static pl_tex osd_tex; +static struct pl_plane img_plane; +static struct pl_plane osd_plane; +static pl_renderer renderer; +static struct pl_custom_lut *lut; + +struct file +{ + void *data; + size_t size; +}; + +static struct file icc_file; + +static bool open_file(const char *path, struct file *out) +{ + if (!path || !path[0]) { + *out = (struct file) {0}; + return true; + } + + FILE *fp = NULL; + bool success = false; + + fp = fopen(path, "rb"); + if (!fp) + goto done; + + if (fseeko(fp, 0, SEEK_END)) + goto done; + off_t size = ftello(fp); + if (size < 0) + goto done; + if (fseeko(fp, 0, SEEK_SET)) + goto done; + + void *data = malloc(size); + if (!fread(data, size, 1, fp)) + goto done; + + *out = (struct file) { + .data = data, + .size = size, + }; + + success = true; +done: + if (fp) + fclose(fp); + return success; +} + +static void close_file(struct file *file) +{ + if (!file->data) + return; + + free(file->data); + *file = (struct file) {0}; +} + +SDL_NORETURN static void uninit(int ret) +{ + pl_renderer_destroy(&renderer); + pl_tex_destroy(win->gpu, &img_tex); + pl_tex_destroy(win->gpu, &osd_tex); + close_file(&icc_file); + pl_lut_free(&lut); + + window_destroy(&win); + pl_log_destroy(&logger); + exit(ret); +} + +static bool upload_plane(const SDL_Surface *img, pl_tex *tex, + struct pl_plane *plane) +{ + if (!img) + return false; + + SDL_Surface *fixed = NULL; + const SDL_PixelFormat *fmt = img->format; + if (SDL_ISPIXELFORMAT_INDEXED(fmt->format)) { + // libplacebo doesn't handle indexed formats yet + fixed = SDL_CreateRGBSurfaceWithFormat(0, img->w, img->h, 32, + SDL_PIXELFORMAT_ABGR8888); + SDL_BlitSurface((SDL_Surface *) img, NULL, fixed, NULL); + img = fixed; + fmt = img->format; + } + + struct pl_plane_data data = { + .type = PL_FMT_UNORM, + .width = img->w, + .height = img->h, + .pixel_stride = fmt->BytesPerPixel, + .row_stride = img->pitch, + .pixels = img->pixels, + }; + + uint64_t masks[4] = { fmt->Rmask, fmt->Gmask, 
fmt->Bmask, fmt->Amask }; + pl_plane_data_from_mask(&data, masks); + + bool ok = pl_upload_plane(win->gpu, plane, tex, &data); + SDL_FreeSurface(fixed); + return ok; +} + +static bool render_frame(const struct pl_swapchain_frame *frame) +{ + pl_tex img = img_plane.texture; + struct pl_frame image = { + .num_planes = 1, + .planes = { img_plane }, + .repr = pl_color_repr_unknown, + .color = pl_color_space_unknown, + .crop = {0, 0, img->params.w, img->params.h}, + }; + + // This seems to be the case for SDL2_image + image.repr.alpha = PL_ALPHA_INDEPENDENT; + + struct pl_frame target; + pl_frame_from_swapchain(&target, frame); + target.profile = (struct pl_icc_profile) { + .data = icc_file.data, + .len = icc_file.size, + }; + + image.rotation = PL_ROTATION_0; // for testing + pl_rect2df_aspect_copy_rot(&target.crop, &image.crop, 0.0, image.rotation); + + struct pl_overlay osd; + struct pl_overlay_part osd_part; + if (osd_tex) { + osd_part = (struct pl_overlay_part) { + .src = { 0, 0, osd_tex->params.w, osd_tex->params.h }, + .dst = { 0, 0, osd_tex->params.w, osd_tex->params.h }, + }; + osd = (struct pl_overlay) { + .tex = osd_tex, + .mode = PL_OVERLAY_NORMAL, + .repr = image.repr, + .color = image.color, + .coords = PL_OVERLAY_COORDS_DST_FRAME, + .parts = &osd_part, + .num_parts = 1, + }; + target.overlays = &osd; + target.num_overlays = 1; + } + + // Use the heaviest preset purely for demonstration/testing purposes + struct pl_render_params params = pl_render_high_quality_params; + params.lut = lut; + + return pl_render_image(renderer, &image, &target, ¶ms); +} + +int main(int argc, char **argv) +{ + if (argc < 2 || argc > 3) { + fprintf(stderr, "Usage: %s <image> [<overlay>]\n", argv[0]); + return 255; + } + + const char *file = argv[1]; + const char *overlay = argc > 2 ? argv[2] : NULL; + logger = pl_log_create(PL_API_VER, pl_log_params( + .log_cb = pl_log_color, + .log_level = PL_LOG_INFO, + )); + + + // Load image, do this first so we can use it for the window size + SDL_Surface *img = IMG_Load(file); + if (!img) { + fprintf(stderr, "Failed loading '%s': %s\n", file, SDL_GetError()); + uninit(1); + } + + // Create window + unsigned int start = SDL_GetTicks(); + win = window_create(logger, &(struct window_params) { + .title = "SDL2_image demo", + .width = img->w, + .height = img->h, + }); + if (!win) + uninit(1); + + // Initialize rendering state + if (!upload_plane(img, &img_tex, &img_plane)) { + fprintf(stderr, "Failed uploading image plane!\n"); + uninit(2); + } + SDL_FreeSurface(img); + + if (overlay) { + SDL_Surface *osd = IMG_Load(overlay); + if (!upload_plane(osd, &osd_tex, &osd_plane)) + fprintf(stderr, "Failed uploading OSD plane.. continuing anyway\n"); + SDL_FreeSurface(osd); + } + + if (!open_file(icc_profile, &icc_file)) + fprintf(stderr, "Failed opening ICC profile.. continuing anyway\n"); + + struct file lutf; + if (open_file(lut_file, &lutf) && lutf.size) { + if (!(lut = pl_lut_parse_cube(logger, lutf.data, lutf.size))) + fprintf(stderr, "Failed parsing LUT.. 
continuing anyway\n"); + close_file(&lutf); + } + + renderer = pl_renderer_create(logger, win->gpu); + + unsigned int last = SDL_GetTicks(), frames = 0; + printf("Took %u ms for initialization\n", last - start); + + // Render loop + while (!win->window_lost) { + struct pl_swapchain_frame frame; + bool ok = pl_swapchain_start_frame(win->swapchain, &frame); + if (!ok) { + window_poll(win, true); + continue; + } + + if (!render_frame(&frame)) { + fprintf(stderr, "libplacebo: Failed rendering frame!\n"); + uninit(3); + } + + ok = pl_swapchain_submit_frame(win->swapchain); + if (!ok) { + fprintf(stderr, "libplacebo: Failed submitting frame!\n"); + uninit(3); + } + + pl_swapchain_swap_buffers(win->swapchain); + frames++; + + unsigned int now = SDL_GetTicks(); + if (now - last > 5000) { + printf("%u frames in %u ms = %f FPS\n", frames, now - last, + 1000.0f * frames / (now - last)); + last = now; + frames = 0; + } + + window_poll(win, false); + } + + uninit(0); +} diff --git a/demos/settings.c b/demos/settings.c new file mode 100644 index 0000000..e69f280 --- /dev/null +++ b/demos/settings.c @@ -0,0 +1,1238 @@ +#include <stdatomic.h> +#include <getopt.h> + +#include <libavutil/file.h> + +#include "plplay.h" + +#ifdef PL_HAVE_WIN32 +#include <shlwapi.h> +#define PL_BASENAME PathFindFileNameA +#define strdup _strdup +#else +#include <libgen.h> +#define PL_BASENAME basename +#endif + +#ifdef HAVE_NUKLEAR +#include "ui.h" + +bool parse_args(struct plplay_args *args, int argc, char *argv[]) +{ + static struct option long_options[] = { + {"verbose", no_argument, NULL, 'v'}, + {"quiet", no_argument, NULL, 'q'}, + {"preset", required_argument, NULL, 'p'}, + {"hwdec", no_argument, NULL, 'H'}, + {"window", required_argument, NULL, 'w'}, + {0} + }; + + int option; + while ((option = getopt_long(argc, argv, "vqp:Hw:", long_options, NULL)) != -1) { + switch (option) { + case 'v': + if (args->verbosity < PL_LOG_TRACE) + args->verbosity++; + break; + case 'q': + if (args->verbosity > PL_LOG_NONE) + args->verbosity--; + break; + case 'p': + if (!strcmp(optarg, "default")) { + args->preset = &pl_render_default_params; + } else if (!strcmp(optarg, "fast")) { + args->preset = &pl_render_fast_params; + } else if (!strcmp(optarg, "highquality") || !strcmp(optarg, "hq")) { + args->preset = &pl_render_high_quality_params; + } else { + fprintf(stderr, "Invalid value for -p/--preset: '%s'\n", optarg); + goto error; + } + break; + case 'H': + args->hwdec = true; + break; + case 'w': + args->window_impl = optarg; + break; + case '?': + default: + goto error; + } + } + + // Check for the required filename argument + if (optind < argc) { + args->filename = argv[optind++]; + } else { + fprintf(stderr, "Missing filename!\n"); + goto error; + } + + if (optind != argc) { + fprintf(stderr, "Superfluous argument: %s\n", argv[optind]); + goto error; + } + + return true; + +error: + fprintf(stderr, "Usage: %s [-v/--verbose] [-q/--quiet] [-p/--preset <default|fast|hq|highquality>] [--hwdec] [-w/--window <api>] <filename>\n", argv[0]); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -v, --verbose Increase verbosity\n"); + fprintf(stderr, " -q, --quiet Decrease verbosity\n"); + fprintf(stderr, " -p, --preset Set the rendering preset (default|fast|hq|highquality)\n"); + fprintf(stderr, " -H, --hwdec Enable hardware decoding\n"); + fprintf(stderr, " -w, --window Specify the windowing API\n"); + return false; +} + +static void add_hook(struct plplay *p, const struct pl_hook *hook, const char *path) +{ + if (!hook) + return; + + if 
(p->shader_num == p->shader_size) { + // Grow array if needed + size_t new_size = p->shader_size ? p->shader_size * 2 : 16; + void *new_hooks = realloc(p->shader_hooks, new_size * sizeof(void *)); + if (!new_hooks) + goto error; + p->shader_hooks = new_hooks; + char **new_paths = realloc(p->shader_paths, new_size * sizeof(char *)); + if (!new_paths) + goto error; + p->shader_paths = new_paths; + p->shader_size = new_size; + } + + // strip leading path + while (true) { + const char *fname = strchr(path, '/'); + if (!fname) + break; + path = fname + 1; + } + + char *path_copy = strdup(path); + if (!path_copy) + goto error; + + p->shader_hooks[p->shader_num] = hook; + p->shader_paths[p->shader_num] = path_copy; + p->shader_num++; + return; + +error: + pl_mpv_user_shader_destroy(&hook); +} + +static void auto_property_int(struct nk_context *nk, int auto_val, int min, int *val, + int max, int step, float inc_per_pixel) +{ + int value = *val; + if (!value) + value = auto_val; + + // Auto label will be delayed 1 frame + nk_property_int(nk, *val ? "" : "Auto", min, &value, max, step, inc_per_pixel); + + if (*val || value != auto_val) + *val = value; +} + +static void draw_shader_pass(struct nk_context *nk, + const struct pl_dispatch_info *info) +{ + pl_shader_info shader = info->shader; + + char label[128]; + int count = snprintf(label, sizeof(label), "%.3f/%.3f/%.3f ms: %s", + info->last / 1e6, + info->average / 1e6, + info->peak / 1e6, + shader->description); + + if (count >= sizeof(label)) { + label[sizeof(label) - 4] = '.'; + label[sizeof(label) - 3] = '.'; + label[sizeof(label) - 2] = '.'; + } + + int id = (unsigned int) (uintptr_t) info; // pointer into `struct plplay` + if (nk_tree_push_id(nk, NK_TREE_NODE, label, NK_MINIMIZED, id)) { + nk_layout_row_dynamic(nk, 32, 1); + if (nk_chart_begin(nk, NK_CHART_LINES, + info->num_samples, + 0.0f, info->peak)) + { + for (int k = 0; k < info->num_samples; k++) + nk_chart_push(nk, info->samples[k]); + nk_chart_end(nk); + } + + nk_layout_row_dynamic(nk, 24, 1); + for (int n = 0; n < shader->num_steps; n++) + nk_labelf(nk, NK_TEXT_LEFT, "%d. %s", n + 1, shader->steps[n]); + nk_tree_pop(nk); + } +} + +static void draw_timing(struct nk_context *nk, const char *label, + const struct timing *t) +{ + const double avg = t->count ? t->sum / t->count : 0.0; + const double stddev = t->count ? 
sqrt(t->sum2 / t->count - avg * avg) : 0.0; + nk_label(nk, label, NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%.4f ± %.4f ms (%.3f ms)", + avg * 1e3, stddev * 1e3, t->peak * 1e3); +} + +static void draw_opt_data(void *priv, pl_opt_data data) +{ + struct nk_context *nk = priv; + pl_opt opt = data->opt; + if (opt->type == PL_OPT_FLOAT) { + // Print floats less verbosely than the libplacebo built-in printf + nk_labelf(nk, NK_TEXT_LEFT, "%s = %f", opt->key, *(const float *) data->value); + } else { + nk_labelf(nk, NK_TEXT_LEFT, "%s = %s", opt->key, data->text); + } +} + +static void draw_cache_line(void *priv, pl_cache_obj obj) +{ + struct nk_context *nk = priv; + nk_labelf(nk, NK_TEXT_LEFT, " - 0x%016"PRIx64": %zu bytes", obj.key, obj.size); +} + +void update_settings(struct plplay *p, const struct pl_frame *target) +{ + struct nk_context *nk = ui_get_context(p->ui); + enum nk_panel_flags win_flags = NK_WINDOW_BORDER | NK_WINDOW_MOVABLE | + NK_WINDOW_SCALABLE | NK_WINDOW_MINIMIZABLE | + NK_WINDOW_TITLE; + + ui_update_input(p->ui, p->win); + const char *dropped_file = window_get_file(p->win); + + pl_options opts = p->opts; + struct pl_render_params *par = &opts->params; + + if (nk_begin(nk, "Settings", nk_rect(100, 100, 600, 600), win_flags)) { + + if (nk_tree_push(nk, NK_TREE_NODE, "Window settings", NK_MAXIMIZED)) { + nk_layout_row_dynamic(nk, 24, 2); + + bool fullscreen = window_is_fullscreen(p->win); + p->toggle_fullscreen = nk_checkbox_label(nk, "Fullscreen", &fullscreen); + nk_property_float(nk, "Corner rounding", 0.0, &par->corner_rounding, 1.0, 0.1, 0.01); + + struct nk_colorf bg = { + par->background_color[0], + par->background_color[1], + par->background_color[2], + 1.0 - par->background_transparency, + }; + + nk_layout_row_dynamic(nk, 24, 2); + nk_label(nk, "Background color:", NK_TEXT_LEFT); + if (nk_combo_begin_color(nk, nk_rgb_cf(bg), nk_vec2(nk_widget_width(nk), 300))) { + nk_layout_row_dynamic(nk, 200, 1); + nk_color_pick(nk, &bg, NK_RGBA); + nk_combo_end(nk); + + par->background_color[0] = bg.r; + par->background_color[1] = bg.g; + par->background_color[2] = bg.b; + par->background_transparency = 1.0 - bg.a; + } + + nk_layout_row_dynamic(nk, 24, 2); + par->blend_against_tiles = nk_check_label(nk, "Blend against tiles", par->blend_against_tiles); + nk_property_int(nk, "Tile size", 2, &par->tile_size, 256, 1, 1); + + nk_layout_row(nk, NK_DYNAMIC, 24, 3, (float[]){ 0.4, 0.3, 0.3 }); + nk_label(nk, "Tile colors:", NK_TEXT_LEFT); + for (int i = 0; i < 2; i++) { + bg = (struct nk_colorf) { + par->tile_colors[i][0], + par->tile_colors[i][1], + par->tile_colors[i][2], + }; + + if (nk_combo_begin_color(nk, nk_rgb_cf(bg), nk_vec2(nk_widget_width(nk), 300))) { + nk_layout_row_dynamic(nk, 200, 1); + nk_color_pick(nk, &bg, NK_RGB); + nk_combo_end(nk); + + par->tile_colors[i][0] = bg.r; + par->tile_colors[i][1] = bg.g; + par->tile_colors[i][2] = bg.b; + } + } + + static const char *rotations[4] = { + [PL_ROTATION_0] = "0°", + [PL_ROTATION_90] = "90°", + [PL_ROTATION_180] = "180°", + [PL_ROTATION_270] = "270°", + }; + + nk_layout_row_dynamic(nk, 24, 2); + nk_label(nk, "Display orientation:", NK_TEXT_LEFT); + p->target_rot = nk_combo(nk, rotations, 4, p->target_rot, + 16, nk_vec2(nk_widget_width(nk), 100)); + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Image scaling", NK_MAXIMIZED)) { + const struct pl_filter_config *f; + static const char *scale_none = "None (Built-in sampling)"; + static const char *pscale_none = "None (Use regular upscaler)"; + static const char 
*tscale_none = "None (No frame mixing)"; + #define SCALE_DESC(scaler, fallback) (par->scaler ? par->scaler->description : fallback) + + static const char *zoom_modes[ZOOM_COUNT] = { + [ZOOM_PAD] = "Pad to window", + [ZOOM_CROP] = "Crop to window", + [ZOOM_STRETCH] = "Stretch to window", + [ZOOM_FIT] = "Fit inside window", + [ZOOM_RAW] = "Unscaled (raw)", + [ZOOM_400] = "400% zoom", + [ZOOM_200] = "200% zoom", + [ZOOM_100] = "100% zoom", + [ZOOM_50] = " 50% zoom", + [ZOOM_25] = " 25% zoom", + }; + + nk_layout_row(nk, NK_DYNAMIC, 24, 2, (float[]){ 0.3, 0.7 }); + nk_label(nk, "Zoom mode:", NK_TEXT_LEFT); + int zoom = nk_combo(nk, zoom_modes, ZOOM_COUNT, p->target_zoom, 16, nk_vec2(nk_widget_width(nk), 500)); + if (zoom != p->target_zoom) { + // Image crop may change + pl_renderer_flush_cache(p->renderer); + p->target_zoom = zoom; + } + + nk_label(nk, "Upscaler:", NK_TEXT_LEFT); + if (nk_combo_begin_label(nk, SCALE_DESC(upscaler, scale_none), nk_vec2(nk_widget_width(nk), 500))) { + nk_layout_row_dynamic(nk, 16, 1); + if (nk_combo_item_label(nk, scale_none, NK_TEXT_LEFT)) + par->upscaler = NULL; + for (int i = 0; i < pl_num_filter_configs; i++) { + f = pl_filter_configs[i]; + if (!f->description) + continue; + if (!(f->allowed & PL_FILTER_UPSCALING)) + continue; + if (!p->advanced_scalers && !(f->recommended & PL_FILTER_UPSCALING)) + continue; + if (nk_combo_item_label(nk, f->description, NK_TEXT_LEFT)) + par->upscaler = f; + } + nk_combo_end(nk); + } + + nk_label(nk, "Downscaler:", NK_TEXT_LEFT); + if (nk_combo_begin_label(nk, SCALE_DESC(downscaler, scale_none), nk_vec2(nk_widget_width(nk), 500))) { + nk_layout_row_dynamic(nk, 16, 1); + if (nk_combo_item_label(nk, scale_none, NK_TEXT_LEFT)) + par->downscaler = NULL; + for (int i = 0; i < pl_num_filter_configs; i++) { + f = pl_filter_configs[i]; + if (!f->description) + continue; + if (!(f->allowed & PL_FILTER_DOWNSCALING)) + continue; + if (!p->advanced_scalers && !(f->recommended & PL_FILTER_DOWNSCALING)) + continue; + if (nk_combo_item_label(nk, f->description, NK_TEXT_LEFT)) + par->downscaler = f; + } + nk_combo_end(nk); + } + + nk_label(nk, "Plane scaler:", NK_TEXT_LEFT); + if (nk_combo_begin_label(nk, SCALE_DESC(plane_upscaler, pscale_none), nk_vec2(nk_widget_width(nk), 500))) { + nk_layout_row_dynamic(nk, 16, 1); + if (nk_combo_item_label(nk, pscale_none, NK_TEXT_LEFT)) + par->plane_upscaler = NULL; + for (int i = 0; i < pl_num_filter_configs; i++) { + f = pl_filter_configs[i]; + if (!f->description) + continue; + if (!(f->allowed & PL_FILTER_UPSCALING)) + continue; + if (!p->advanced_scalers && !(f->recommended & PL_FILTER_UPSCALING)) + continue; + if (nk_combo_item_label(nk, f->description, NK_TEXT_LEFT)) + par->plane_upscaler = f; + } + nk_combo_end(nk); + } + + nk_label(nk, "Frame mixing:", NK_TEXT_LEFT); + if (nk_combo_begin_label(nk, SCALE_DESC(frame_mixer, tscale_none), nk_vec2(nk_widget_width(nk), 300))) { + nk_layout_row_dynamic(nk, 16, 1); + if (nk_combo_item_label(nk, tscale_none, NK_TEXT_LEFT)) + par->frame_mixer = NULL; + for (int i = 0; i < pl_num_filter_configs; i++) { + f = pl_filter_configs[i]; + if (!f->description) + continue; + if (!(f->allowed & PL_FILTER_FRAME_MIXING)) + continue; + if (!p->advanced_scalers && !(f->recommended & PL_FILTER_FRAME_MIXING)) + continue; + if (nk_combo_item_label(nk, f->description, NK_TEXT_LEFT)) + par->frame_mixer = f; + } + nk_combo_end(nk); + } + + nk_layout_row_dynamic(nk, 24, 2); + par->skip_anti_aliasing = !nk_check_label(nk, "Anti-aliasing", !par->skip_anti_aliasing); + 
nk_property_float(nk, "Antiringing", 0, &par->antiringing_strength, 1.0, 0.05, 0.001); + + struct pl_sigmoid_params *spar = &opts->sigmoid_params; + nk_layout_row_dynamic(nk, 24, 2); + par->sigmoid_params = nk_check_label(nk, "Sigmoidization", par->sigmoid_params) ? spar : NULL; + if (nk_button_label(nk, "Default values")) + *spar = pl_sigmoid_default_params; + nk_property_float(nk, "Sigmoid center", 0, &spar->center, 1, 0.1, 0.01); + nk_property_float(nk, "Sigmoid slope", 0, &spar->slope, 100, 1, 0.1); + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Deinterlacing", NK_MINIMIZED)) { + struct pl_deinterlace_params *dpar = &opts->deinterlace_params; + nk_layout_row_dynamic(nk, 24, 2); + par->deinterlace_params = nk_check_label(nk, "Enable", par->deinterlace_params) ? dpar : NULL; + if (nk_button_label(nk, "Reset settings")) + *dpar = pl_deinterlace_default_params; + + static const char *deint_algos[PL_DEINTERLACE_ALGORITHM_COUNT] = { + [PL_DEINTERLACE_WEAVE] = "Field weaving (no-op)", + [PL_DEINTERLACE_BOB] = "Naive bob (line doubling)", + [PL_DEINTERLACE_YADIF] = "Yadif (\"yet another deinterlacing filter\")", + }; + + nk_label(nk, "Deinterlacing algorithm", NK_TEXT_LEFT); + dpar->algo = nk_combo(nk, deint_algos, PL_DEINTERLACE_ALGORITHM_COUNT, + dpar->algo, 16, nk_vec2(nk_widget_width(nk), 300)); + + switch (dpar->algo) { + case PL_DEINTERLACE_WEAVE: + case PL_DEINTERLACE_BOB: + break; + case PL_DEINTERLACE_YADIF: + nk_checkbox_label(nk, "Skip spatial check", &dpar->skip_spatial_check); + break; + default: abort(); + } + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Debanding", NK_MINIMIZED)) { + struct pl_deband_params *dpar = &opts->deband_params; + nk_layout_row_dynamic(nk, 24, 2); + par->deband_params = nk_check_label(nk, "Enable", par->deband_params) ? dpar : NULL; + if (nk_button_label(nk, "Reset settings")) + *dpar = pl_deband_default_params; + nk_property_int(nk, "Iterations", 0, &dpar->iterations, 8, 1, 0); + nk_property_float(nk, "Threshold", 0, &dpar->threshold, 256, 1, 0.5); + nk_property_float(nk, "Radius", 0, &dpar->radius, 256, 1, 0.2); + nk_property_float(nk, "Grain", 0, &dpar->grain, 512, 1, 0.5); + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Distortion", NK_MINIMIZED)) { + struct pl_distort_params *dpar = &opts->distort_params; + nk_layout_row_dynamic(nk, 24, 2); + par->distort_params = nk_check_label(nk, "Enable", par->distort_params) ? dpar : NULL; + if (nk_button_label(nk, "Reset settings")) + *dpar = pl_distort_default_params; + + static const char *address_modes[PL_TEX_ADDRESS_MODE_COUNT] = { + [PL_TEX_ADDRESS_CLAMP] = "Clamp edges", + [PL_TEX_ADDRESS_REPEAT] = "Repeat edges", + [PL_TEX_ADDRESS_MIRROR] = "Mirror edges", + }; + + nk_checkbox_label(nk, "Constrain bounds", &dpar->constrain); + dpar->address_mode = nk_combo(nk, address_modes, PL_TEX_ADDRESS_MODE_COUNT, + dpar->address_mode, 16, nk_vec2(nk_widget_width(nk), 100)); + bool alpha = nk_check_label(nk, "Transparent background", dpar->alpha_mode); + dpar->alpha_mode = alpha ? 
PL_ALPHA_INDEPENDENT : PL_ALPHA_UNKNOWN; + nk_checkbox_label(nk, "Bicubic interpolation", &dpar->bicubic); + + struct pl_transform2x2 *tf = &dpar->transform; + nk_property_float(nk, "Scale X", -10.0, &tf->mat.m[0][0], 10.0, 0.1, 0.005); + nk_property_float(nk, "Shear X", -10.0, &tf->mat.m[0][1], 10.0, 0.1, 0.005); + nk_property_float(nk, "Shear Y", -10.0, &tf->mat.m[1][0], 10.0, 0.1, 0.005); + nk_property_float(nk, "Scale Y", -10.0, &tf->mat.m[1][1], 10.0, 0.1, 0.005); + nk_property_float(nk, "Offset X", -10.0, &tf->c[0], 10.0, 0.1, 0.005); + nk_property_float(nk, "Offset Y", -10.0, &tf->c[1], 10.0, 0.1, 0.005); + + float zoom_ref = fabsf(tf->mat.m[0][0] * tf->mat.m[1][1] - + tf->mat.m[0][1] * tf->mat.m[1][0]); + zoom_ref = logf(fmaxf(zoom_ref, 1e-4)); + float zoom = zoom_ref; + nk_property_float(nk, "log(Zoom)", -10.0, &zoom, 10.0, 0.1, 0.005); + pl_transform2x2_scale(tf, expf(zoom - zoom_ref)); + + float angle_ref = (atan2f(tf->mat.m[1][0], tf->mat.m[1][1]) - + atan2f(tf->mat.m[0][1], tf->mat.m[0][0])) / 2; + angle_ref = fmodf(angle_ref * 180/M_PI + 540, 360) - 180; + float angle = angle_ref; + nk_property_float(nk, "Rotate (°)", -200, &angle, 200, -5, -0.2); + float angle_delta = (angle - angle_ref) * M_PI / 180; + const pl_matrix2x2 rot = pl_matrix2x2_rotation(angle_delta); + pl_matrix2x2_rmul(&rot, &tf->mat); + + bool flip_ox = nk_button_label(nk, "Flip output X"); + bool flip_oy = nk_button_label(nk, "Flip output Y"); + bool flip_ix = nk_button_label(nk, "Flip input X"); + bool flip_iy = nk_button_label(nk, "Flip input Y"); + if (flip_ox ^ flip_ix) + tf->mat.m[0][0] = -tf->mat.m[0][0]; + if (flip_ox ^ flip_iy) + tf->mat.m[0][1] = -tf->mat.m[0][1]; + if (flip_oy ^ flip_ix) + tf->mat.m[1][0] = -tf->mat.m[1][0]; + if (flip_oy ^ flip_iy) + tf->mat.m[1][1] = -tf->mat.m[1][1]; + if (flip_ox) + tf->c[0] = -tf->c[0]; + if (flip_oy) + tf->c[1] = -tf->c[1]; + + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Color adjustment", NK_MINIMIZED)) { + struct pl_color_adjustment *adj = &opts->color_adjustment; + nk_layout_row_dynamic(nk, 24, 2); + par->color_adjustment = nk_check_label(nk, "Enable", par->color_adjustment) ? adj : NULL; + if (nk_button_label(nk, "Default values")) + *adj = pl_color_adjustment_neutral; + nk_property_float(nk, "Brightness", -1, &adj->brightness, 1, 0.1, 0.005); + nk_property_float(nk, "Contrast", 0, &adj->contrast, 10, 0.1, 0.005); + + // Convert to (cyclical) degrees for display + int deg = roundf(adj->hue * 180.0 / M_PI); + nk_property_int(nk, "Hue (°)", -50, &deg, 400, 1, 1); + adj->hue = ((deg + 360) % 360) * M_PI / 180.0; + + nk_property_float(nk, "Saturation", 0, &adj->saturation, 10, 0.1, 0.005); + nk_property_float(nk, "Gamma", 0, &adj->gamma, 10, 0.1, 0.005); + + // Convert to human-friendly temperature values for display + int temp = (int) roundf(adj->temperature * 3500) + 6500; + nk_property_int(nk, "Temperature (K)", 3000, &temp, 10000, 10, 5); + adj->temperature = (temp - 6500) / 3500.0; + + struct pl_cone_params *cpar = &opts->cone_params; + nk_layout_row_dynamic(nk, 24, 2); + par->cone_params = nk_check_label(nk, "Color blindness", par->cone_params) ? 
cpar : NULL; + if (nk_button_label(nk, "Default values")) + *cpar = pl_vision_normal; + nk_layout_row(nk, NK_DYNAMIC, 24, 5, (float[]){ 0.25, 0.25/3, 0.25/3, 0.25/3, 0.5 }); + nk_label(nk, "Cone model:", NK_TEXT_LEFT); + unsigned int cones = cpar->cones; + nk_checkbox_flags_label(nk, "L", &cones, PL_CONE_L); + nk_checkbox_flags_label(nk, "M", &cones, PL_CONE_M); + nk_checkbox_flags_label(nk, "S", &cones, PL_CONE_S); + cpar->cones = cones; + nk_property_float(nk, "Sensitivity", 0.0, &cpar->strength, 5.0, 0.1, 0.01); + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "HDR peak detection", NK_MINIMIZED)) { + struct pl_peak_detect_params *ppar = &opts->peak_detect_params; + nk_layout_row_dynamic(nk, 24, 2); + par->peak_detect_params = nk_check_label(nk, "Enable", par->peak_detect_params) ? ppar : NULL; + if (nk_button_label(nk, "Reset settings")) + *ppar = pl_peak_detect_default_params; + nk_property_float(nk, "Threshold low", 0.0, &ppar->scene_threshold_low, 20.0, 0.5, 0.005); + nk_property_float(nk, "Threshold high", 0.0, &ppar->scene_threshold_high, 20.0, 0.5, 0.005); + nk_property_float(nk, "Smoothing period", 0.0, &ppar->smoothing_period, 1000.0, 5.0, 1.0); + nk_property_float(nk, "Peak percentile", 95.0, &ppar->percentile, 100.0, 0.01, 0.001); + nk_checkbox_label(nk, "Allow 1-frame delay", &ppar->allow_delayed); + + struct pl_hdr_metadata metadata; + if (pl_renderer_get_hdr_metadata(p->renderer, &metadata)) { + nk_layout_row_dynamic(nk, 24, 2); + nk_label(nk, "Detected max luminance:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%.2f cd/m² (%.2f%% PQ)", + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, metadata.max_pq_y), + 100.0f * metadata.max_pq_y); + nk_label(nk, "Detected avg luminance:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%.2f cd/m² (%.2f%% PQ)", + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, metadata.avg_pq_y), + 100.0f * metadata.avg_pq_y); + } + + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Tone mapping", NK_MINIMIZED)) { + struct pl_color_map_params *cpar = &opts->color_map_params; + static const struct pl_color_map_params null_settings = {0}; + nk_layout_row_dynamic(nk, 24, 2); + par->color_map_params = nk_check_label(nk, "Enable", + par->color_map_params == cpar) ? 
cpar : &null_settings; + if (nk_button_label(nk, "Reset settings")) + *cpar = pl_color_map_default_params; + + nk_label(nk, "Gamut mapping function:", NK_TEXT_LEFT); + if (nk_combo_begin_label(nk, cpar->gamut_mapping->description, + nk_vec2(nk_widget_width(nk), 500))) + { + nk_layout_row_dynamic(nk, 16, 1); + for (int i = 0; i < pl_num_gamut_map_functions; i++) { + const struct pl_gamut_map_function *f = pl_gamut_map_functions[i]; + if (nk_combo_item_label(nk, f->description, NK_TEXT_LEFT)) + cpar->gamut_mapping = f; + } + nk_combo_end(nk); + } + + nk_label(nk, "Tone mapping function:", NK_TEXT_LEFT); + if (nk_combo_begin_label(nk, cpar->tone_mapping_function->description, + nk_vec2(nk_widget_width(nk), 500))) + { + nk_layout_row_dynamic(nk, 16, 1); + for (int i = 0; i < pl_num_tone_map_functions; i++) { + const struct pl_tone_map_function *f = pl_tone_map_functions[i]; + if (nk_combo_item_label(nk, f->description, NK_TEXT_LEFT)) + cpar->tone_mapping_function = f; + } + nk_combo_end(nk); + } + + static const char *metadata_types[PL_HDR_METADATA_TYPE_COUNT] = { + [PL_HDR_METADATA_ANY] = "Automatic selection", + [PL_HDR_METADATA_NONE] = "None (disabled)", + [PL_HDR_METADATA_HDR10] = "HDR10 (static)", + [PL_HDR_METADATA_HDR10PLUS] = "HDR10+ (MaxRGB)", + [PL_HDR_METADATA_CIE_Y] = "Luminance (CIE Y)", + }; + + nk_label(nk, "HDR metadata source:", NK_TEXT_LEFT); + cpar->metadata = nk_combo(nk, metadata_types, + PL_HDR_METADATA_TYPE_COUNT, + cpar->metadata, + 16, nk_vec2(nk_widget_width(nk), 300)); + + nk_property_float(nk, "Contrast recovery", 0.0, &cpar->contrast_recovery, 2.0, 0.05, 0.005); + nk_property_float(nk, "Contrast smoothness", 1.0, &cpar->contrast_smoothness, 32.0, 0.1, 0.005); + + nk_property_int(nk, "LUT size", 16, &cpar->lut_size, 1024, 1, 1); + nk_property_int(nk, "3DLUT size I", 7, &cpar->lut3d_size[0], 65, 1, 1); + nk_property_int(nk, "3DLUT size C", 7, &cpar->lut3d_size[1], 256, 1, 1); + nk_property_int(nk, "3DLUT size h", 7, &cpar->lut3d_size[2], 1024, 1, 1); + + nk_checkbox_label(nk, "Tricubic interpolation", &cpar->lut3d_tricubic); + nk_checkbox_label(nk, "Force full LUT", &cpar->force_tone_mapping_lut); + nk_checkbox_label(nk, "Inverse tone mapping", &cpar->inverse_tone_mapping); + nk_checkbox_label(nk, "Gamut expansion", &cpar->gamut_expansion); + nk_checkbox_label(nk, "Show clipping", &cpar->show_clipping); + nk_checkbox_label(nk, "Visualize LUT", &cpar->visualize_lut); + + if (cpar->visualize_lut) { + nk_layout_row_dynamic(nk, 24, 2); + const float huerange = 2 * M_PI; + nk_property_float(nk, "Hue", -1, &cpar->visualize_hue, huerange + 1.0, 0.1, 0.01); + nk_property_float(nk, "Theta", 0.0, &cpar->visualize_theta, M_PI_2, 0.1, 0.01); + cpar->visualize_hue = fmodf(cpar->visualize_hue + huerange, huerange); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Fine-tune constants (advanced)", NK_MINIMIZED)) { + struct pl_tone_map_constants *tc = &cpar->tone_constants; + struct pl_gamut_map_constants *gc = &cpar->gamut_constants; + nk_layout_row_dynamic(nk, 20, 2); + nk_property_float(nk, "Perceptual deadzone", 0.0, &gc->perceptual_deadzone, 1.0, 0.05, 0.001); + nk_property_float(nk, "Perceptual strength", 0.0, &gc->perceptual_strength, 1.0, 0.05, 0.001); + nk_property_float(nk, "Colorimetric gamma", 0.0, &gc->colorimetric_gamma, 10.0, 0.05, 0.001); + nk_property_float(nk, "Softclip knee", 0.0, &gc->softclip_knee, 1.0, 0.05, 0.001); + nk_property_float(nk, "Softclip desaturation", 0.0, &gc->softclip_desat, 1.0, 0.05, 0.001); + nk_property_float(nk, "Knee adaptation", 0.0, 
&tc->knee_adaptation, 1.0, 0.05, 0.001); + nk_property_float(nk, "Knee minimum", 0.0, &tc->knee_minimum, 0.5, 0.05, 0.001); + nk_property_float(nk, "Knee maximum", 0.5, &tc->knee_maximum, 1.0, 0.05, 0.001); + nk_property_float(nk, "Knee default", tc->knee_minimum, &tc->knee_default, tc->knee_maximum, 0.05, 0.001); + nk_property_float(nk, "BT.2390 offset", 0.5, &tc->knee_offset, 2.0, 0.05, 0.001); + nk_property_float(nk, "Spline slope tuning", 0.0, &tc->slope_tuning, 10.0, 0.05, 0.001); + nk_property_float(nk, "Spline slope offset", 0.0, &tc->slope_offset, 1.0, 0.05, 0.001); + nk_property_float(nk, "Spline contrast", 0.0, &tc->spline_contrast, 1.5, 0.05, 0.001); + nk_property_float(nk, "Reinhard contrast", 0.0, &tc->reinhard_contrast, 1.0, 0.05, 0.001); + nk_property_float(nk, "Linear knee point", 0.0, &tc->linear_knee, 1.0, 0.05, 0.001); + nk_property_float(nk, "Linear exposure", 0.0, &tc->exposure, 10.0, 0.05, 0.001); + nk_tree_pop(nk); + } + + nk_layout_row_dynamic(nk, 50, 1); + if (ui_widget_hover(nk, "Drop .cube file here...") && dropped_file) { + uint8_t *buf; + size_t size; + int ret = av_file_map(dropped_file, &buf, &size, 0, NULL); + if (ret < 0) { + fprintf(stderr, "Failed opening '%s': %s\n", dropped_file, + av_err2str(ret)); + } else { + pl_lut_free((struct pl_custom_lut **) &par->lut); + par->lut = pl_lut_parse_cube(p->log, (char *) buf, size); + av_file_unmap(buf, size); + } + } + + static const char *lut_types[] = { + [PL_LUT_UNKNOWN] = "Auto (unknown)", + [PL_LUT_NATIVE] = "Raw RGB (native)", + [PL_LUT_NORMALIZED] = "Linear RGB (normalized)", + [PL_LUT_CONVERSION] = "Gamut conversion (native)", + }; + + nk_layout_row(nk, NK_DYNAMIC, 24, 3, (float[]){ 0.2, 0.3, 0.5 }); + if (nk_button_label(nk, "Reset LUT")) { + pl_lut_free((struct pl_custom_lut **) &par->lut); + par->lut_type = PL_LUT_UNKNOWN; + } + + nk_label(nk, "LUT type:", NK_TEXT_CENTERED); + par->lut_type = nk_combo(nk, lut_types, 4, par->lut_type, + 16, nk_vec2(nk_widget_width(nk), 100)); + + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Dithering", NK_MINIMIZED)) { + struct pl_dither_params *dpar = &opts->dither_params; + nk_layout_row_dynamic(nk, 24, 2); + par->dither_params = nk_check_label(nk, "Enable", par->dither_params) ? dpar : NULL; + if (nk_button_label(nk, "Reset settings")) + *dpar = pl_dither_default_params; + + static const char *dither_methods[PL_DITHER_METHOD_COUNT] = { + [PL_DITHER_BLUE_NOISE] = "Blue noise", + [PL_DITHER_ORDERED_LUT] = "Ordered (LUT)", + [PL_DITHER_ORDERED_FIXED] = "Ordered (fixed size)", + [PL_DITHER_WHITE_NOISE] = "White noise", + }; + + nk_label(nk, "Dither method:", NK_TEXT_LEFT); + dpar->method = nk_combo(nk, dither_methods, PL_DITHER_METHOD_COUNT, dpar->method, + 16, nk_vec2(nk_widget_width(nk), 100)); + + static const char *lut_sizes[8] = { + "2x2", "4x4", "8x8", "16x16", "32x32", "64x64", "128x128", "256x256", + }; + + nk_label(nk, "LUT size:", NK_TEXT_LEFT); + switch (dpar->method) { + case PL_DITHER_BLUE_NOISE: + case PL_DITHER_ORDERED_LUT: { + int size = dpar->lut_size - 1; + nk_combobox(nk, lut_sizes, 8, &size, 16, nk_vec2(nk_widget_width(nk), 200)); + dpar->lut_size = size + 1; + break; + } + case PL_DITHER_ORDERED_FIXED: + nk_label(nk, "64x64", NK_TEXT_LEFT); + break; + default: + nk_label(nk, "(N/A)", NK_TEXT_LEFT); + break; + } + + nk_checkbox_label(nk, "Temporal dithering", &dpar->temporal); + + nk_layout_row_dynamic(nk, 24, 2); + nk_label(nk, "Error diffusion:", NK_TEXT_LEFT); + const char *name = par->error_diffusion ? 
par->error_diffusion->description : "(None)"; + if (nk_combo_begin_label(nk, name, nk_vec2(nk_widget_width(nk), 500))) { + nk_layout_row_dynamic(nk, 16, 1); + if (nk_combo_item_label(nk, "(None)", NK_TEXT_LEFT)) + par->error_diffusion = NULL; + for (int i = 0; i < pl_num_error_diffusion_kernels; i++) { + const struct pl_error_diffusion_kernel *k = pl_error_diffusion_kernels[i]; + if (nk_combo_item_label(nk, k->description, NK_TEXT_LEFT)) + par->error_diffusion = k; + } + nk_combo_end(nk); + } + + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Output color space", NK_MINIMIZED)) { + nk_layout_row_dynamic(nk, 24, 2); + nk_checkbox_label(nk, "Enable", &p->target_override); + bool reset = nk_button_label(nk, "Reset settings"); + bool reset_icc = reset; + char buf[64] = {0}; + + nk_layout_row(nk, NK_DYNAMIC, 24, 2, (float[]){ 0.3, 0.7 }); + + const char *primaries[PL_COLOR_PRIM_COUNT] = { + [PL_COLOR_PRIM_UNKNOWN] = "Auto (unknown)", + [PL_COLOR_PRIM_BT_601_525] = "ITU-R Rec. BT.601 (525-line = NTSC, SMPTE-C)", + [PL_COLOR_PRIM_BT_601_625] = "ITU-R Rec. BT.601 (625-line = PAL, SECAM)", + [PL_COLOR_PRIM_BT_709] = "ITU-R Rec. BT.709 (HD), also sRGB", + [PL_COLOR_PRIM_BT_470M] = "ITU-R Rec. BT.470 M", + [PL_COLOR_PRIM_EBU_3213] = "EBU Tech. 3213-E / JEDEC P22 phosphors", + [PL_COLOR_PRIM_BT_2020] = "ITU-R Rec. BT.2020 (UltraHD)", + [PL_COLOR_PRIM_APPLE] = "Apple RGB", + [PL_COLOR_PRIM_ADOBE] = "Adobe RGB (1998)", + [PL_COLOR_PRIM_PRO_PHOTO] = "ProPhoto RGB (ROMM)", + [PL_COLOR_PRIM_CIE_1931] = "CIE 1931 RGB primaries", + [PL_COLOR_PRIM_DCI_P3] = "DCI-P3 (Digital Cinema)", + [PL_COLOR_PRIM_DISPLAY_P3] = "DCI-P3 (Digital Cinema) with D65 white point", + [PL_COLOR_PRIM_V_GAMUT] = "Panasonic V-Gamut (VARICAM)", + [PL_COLOR_PRIM_S_GAMUT] = "Sony S-Gamut", + [PL_COLOR_PRIM_FILM_C] = "Traditional film primaries with Illuminant C", + [PL_COLOR_PRIM_ACES_AP0] = "ACES Primaries #0", + [PL_COLOR_PRIM_ACES_AP1] = "ACES Primaries #1", + }; + + if (target->color.primaries) { + snprintf(buf, sizeof(buf), "Auto (%s)", primaries[target->color.primaries]); + primaries[PL_COLOR_PRIM_UNKNOWN] = buf; + } + + nk_label(nk, "Primaries:", NK_TEXT_LEFT); + p->force_prim = nk_combo(nk, primaries, PL_COLOR_PRIM_COUNT, p->force_prim, + 16, nk_vec2(nk_widget_width(nk), 200)); + + const char *transfers[PL_COLOR_TRC_COUNT] = { + [PL_COLOR_TRC_UNKNOWN] = "Auto (unknown SDR)", + [PL_COLOR_TRC_BT_1886] = "ITU-R Rec. 
BT.1886 (CRT emulation + OOTF)", + [PL_COLOR_TRC_SRGB] = "IEC 61966-2-4 sRGB (CRT emulation)", + [PL_COLOR_TRC_LINEAR] = "Linear light content", + [PL_COLOR_TRC_GAMMA18] = "Pure power gamma 1.8", + [PL_COLOR_TRC_GAMMA20] = "Pure power gamma 2.0", + [PL_COLOR_TRC_GAMMA22] = "Pure power gamma 2.2", + [PL_COLOR_TRC_GAMMA24] = "Pure power gamma 2.4", + [PL_COLOR_TRC_GAMMA26] = "Pure power gamma 2.6", + [PL_COLOR_TRC_GAMMA28] = "Pure power gamma 2.8", + [PL_COLOR_TRC_PRO_PHOTO] = "ProPhoto RGB (ROMM)", + [PL_COLOR_TRC_ST428] = "Digital Cinema Distribution Master (XYZ)", + [PL_COLOR_TRC_PQ] = "ITU-R BT.2100 PQ (perceptual quantizer), aka SMPTE ST2048", + [PL_COLOR_TRC_HLG] = "ITU-R BT.2100 HLG (hybrid log-gamma), aka ARIB STD-B67", + [PL_COLOR_TRC_V_LOG] = "Panasonic V-Log (VARICAM)", + [PL_COLOR_TRC_S_LOG1] = "Sony S-Log1", + [PL_COLOR_TRC_S_LOG2] = "Sony S-Log2", + }; + + if (target->color.transfer) { + snprintf(buf, sizeof(buf), "Auto (%s)", transfers[target->color.transfer]); + transfers[PL_COLOR_TRC_UNKNOWN] = buf; + } + + nk_label(nk, "Transfer:", NK_TEXT_LEFT); + p->force_trc = nk_combo(nk, transfers, PL_COLOR_TRC_COUNT, p->force_trc, + 16, nk_vec2(nk_widget_width(nk), 200)); + + nk_layout_row_dynamic(nk, 24, 2); + nk_checkbox_label(nk, "Override HDR levels", &p->force_hdr_enable); + + // Ensure these values are always legal by going through + // pl_color_space_infer + nk_layout_row_dynamic(nk, 24, 2); + struct pl_color_space fix = target->color; + apply_csp_overrides(p, &fix); + pl_color_space_infer(&fix); + + fix.hdr.min_luma *= 1000; // better value range + nk_property_float(nk, "White point (cd/m²)", + 10.0, &fix.hdr.max_luma, 10000.0, + fix.hdr.max_luma / 100, fix.hdr.max_luma / 1000); + nk_property_float(nk, "Black point (mcd/m²)", + PL_COLOR_HDR_BLACK * 1000, &fix.hdr.min_luma, + 100.0 * 1000, 5, 2); + fix.hdr.min_luma /= 1000; + pl_color_space_infer(&fix); + p->force_hdr = fix.hdr; + + struct pl_color_repr *trepr = &p->force_repr; + nk_layout_row(nk, NK_DYNAMIC, 24, 2, (float[]){ 0.3, 0.7 }); + + const char *systems[PL_COLOR_SYSTEM_COUNT] = { + [PL_COLOR_SYSTEM_UNKNOWN] = "Auto (unknown)", + [PL_COLOR_SYSTEM_BT_601] = "ITU-R Rec. BT.601 (SD)", + [PL_COLOR_SYSTEM_BT_709] = "ITU-R Rec. BT.709 (HD)", + [PL_COLOR_SYSTEM_SMPTE_240M] = "SMPTE-240M", + [PL_COLOR_SYSTEM_BT_2020_NC] = "ITU-R Rec. BT.2020 (non-constant luminance)", + [PL_COLOR_SYSTEM_BT_2020_C] = "ITU-R Rec. BT.2020 (constant luminance)", + [PL_COLOR_SYSTEM_BT_2100_PQ] = "ITU-R Rec. BT.2100 ICtCp PQ variant", + [PL_COLOR_SYSTEM_BT_2100_HLG] = "ITU-R Rec. BT.2100 ICtCp HLG variant", + [PL_COLOR_SYSTEM_DOLBYVISION] = "Dolby Vision (invalid for output)", + [PL_COLOR_SYSTEM_YCGCO] = "YCgCo (derived from RGB)", + [PL_COLOR_SYSTEM_RGB] = "Red, Green and Blue", + [PL_COLOR_SYSTEM_XYZ] = "Digital Cinema Distribution Master (XYZ)", + }; + + if (target->repr.sys) { + snprintf(buf, sizeof(buf), "Auto (%s)", systems[target->repr.sys]); + systems[PL_COLOR_SYSTEM_UNKNOWN] = buf; + } + + nk_label(nk, "System:", NK_TEXT_LEFT); + trepr->sys = nk_combo(nk, systems, PL_COLOR_SYSTEM_COUNT, trepr->sys, + 16, nk_vec2(nk_widget_width(nk), 200)); + if (trepr->sys == PL_COLOR_SYSTEM_DOLBYVISION) + trepr->sys = PL_COLOR_SYSTEM_UNKNOWN; + + const char *levels[PL_COLOR_LEVELS_COUNT] = { + [PL_COLOR_LEVELS_UNKNOWN] = "Auto (unknown)", + [PL_COLOR_LEVELS_LIMITED] = "Limited/TV range, e.g. 16-235", + [PL_COLOR_LEVELS_FULL] = "Full/PC range, e.g. 
0-255", + }; + + if (target->repr.levels) { + snprintf(buf, sizeof(buf), "Auto (%s)", levels[target->repr.levels]); + levels[PL_COLOR_LEVELS_UNKNOWN] = buf; + } + + nk_label(nk, "Levels:", NK_TEXT_LEFT); + trepr->levels = nk_combo(nk, levels, PL_COLOR_LEVELS_COUNT, trepr->levels, + 16, nk_vec2(nk_widget_width(nk), 200)); + + const char *alphas[PL_ALPHA_MODE_COUNT] = { + [PL_ALPHA_UNKNOWN] = "Auto (unknown, or no alpha)", + [PL_ALPHA_INDEPENDENT] = "Independent alpha channel", + [PL_ALPHA_PREMULTIPLIED] = "Premultiplied alpha channel", + }; + + if (target->repr.alpha) { + snprintf(buf, sizeof(buf), "Auto (%s)", alphas[target->repr.alpha]); + alphas[PL_ALPHA_UNKNOWN] = buf; + } + + nk_label(nk, "Alpha:", NK_TEXT_LEFT); + trepr->alpha = nk_combo(nk, alphas, PL_ALPHA_MODE_COUNT, trepr->alpha, + 16, nk_vec2(nk_widget_width(nk), 200)); + + const struct pl_bit_encoding *bits = &target->repr.bits; + nk_label(nk, "Bit depth:", NK_TEXT_LEFT); + auto_property_int(nk, bits->color_depth, 0, + &trepr->bits.color_depth, 16, 1, 0); + + if (bits->color_depth != bits->sample_depth) { + nk_label(nk, "Sample bit depth:", NK_TEXT_LEFT); + auto_property_int(nk, bits->sample_depth, 0, + &trepr->bits.sample_depth, 16, 1, 0); + } else { + // Adjust these two fields in unison + trepr->bits.sample_depth = trepr->bits.color_depth; + } + + if (bits->bit_shift) { + nk_label(nk, "Bit shift:", NK_TEXT_LEFT); + auto_property_int(nk, bits->bit_shift, 0, + &trepr->bits.bit_shift, 16, 1, 0); + } else { + trepr->bits.bit_shift = 0; + } + + nk_layout_row_dynamic(nk, 24, 1); + nk_checkbox_label(nk, "Forward input color space to display", &p->colorspace_hint); + + if (p->colorspace_hint && !p->force_hdr_enable) { + nk_checkbox_label(nk, "Forward dynamic brightness changes to display", + &p->colorspace_hint_dynamic); + } + + nk_layout_row_dynamic(nk, 50, 1); + if (ui_widget_hover(nk, "Drop ICC profile here...") && dropped_file) { + struct pl_icc_profile profile; + int ret = av_file_map(dropped_file, (uint8_t **) &profile.data, + &profile.len, 0, NULL); + if (ret < 0) { + fprintf(stderr, "Failed opening '%s': %s\n", dropped_file, + av_err2str(ret)); + } else { + free(p->icc_name); + pl_icc_profile_compute_signature(&profile); + pl_icc_update(p->log, &p->icc, &profile, pl_icc_params( + .force_bpc = p->force_bpc, + .max_luma = p->use_icc_luma ? 0 : PL_COLOR_SDR_WHITE, + )); + av_file_unmap((void *) profile.data, profile.len); + if (p->icc) + p->icc_name = strdup(PL_BASENAME((char *) dropped_file)); + } + } + + if (p->icc) { + nk_layout_row_dynamic(nk, 24, 2); + nk_labelf(nk, NK_TEXT_LEFT, "Loaded: %s", + p->icc_name ? 
p->icc_name : "(unknown)"); + reset_icc |= nk_button_label(nk, "Reset ICC"); + nk_checkbox_label(nk, "Force BPC", &p->force_bpc); + nk_checkbox_label(nk, "Use detected luminance", &p->use_icc_luma); + } + + // Apply the reset last to prevent the UI from flashing for a frame + if (reset) { + p->force_repr = (struct pl_color_repr) {0}; + p->force_prim = PL_COLOR_PRIM_UNKNOWN; + p->force_trc = PL_COLOR_TRC_UNKNOWN; + p->force_hdr = (struct pl_hdr_metadata) {0}; + p->force_hdr_enable = false; + } + + if (reset_icc && p->icc) { + pl_icc_close(&p->icc); + free(p->icc_name); + p->icc_name = NULL; + } + + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Custom shaders", NK_MINIMIZED)) { + + nk_layout_row_dynamic(nk, 50, 1); + if (ui_widget_hover(nk, "Drop .hook/.glsl files here...") && dropped_file) { + uint8_t *buf; + size_t size; + int ret = av_file_map(dropped_file, &buf, &size, 0, NULL); + if (ret < 0) { + fprintf(stderr, "Failed opening '%s': %s\n", dropped_file, + av_err2str(ret)); + } else { + const struct pl_hook *hook; + hook = pl_mpv_user_shader_parse(p->win->gpu, (char *) buf, size); + av_file_unmap(buf, size); + add_hook(p, hook, dropped_file); + } + } + + const float px = 24.0; + nk_layout_row_template_begin(nk, px); + nk_layout_row_template_push_static(nk, px); + nk_layout_row_template_push_static(nk, px); + nk_layout_row_template_push_static(nk, px); + nk_layout_row_template_push_dynamic(nk); + nk_layout_row_template_end(nk); + for (int i = 0; i < p->shader_num; i++) { + + if (i == 0) { + nk_label(nk, "·", NK_TEXT_CENTERED); + } else if (nk_button_symbol(nk, NK_SYMBOL_TRIANGLE_UP)) { + const struct pl_hook *prev_hook = p->shader_hooks[i - 1]; + char *prev_path = p->shader_paths[i - 1]; + p->shader_hooks[i - 1] = p->shader_hooks[i]; + p->shader_paths[i - 1] = p->shader_paths[i]; + p->shader_hooks[i] = prev_hook; + p->shader_paths[i] = prev_path; + } + + if (i == p->shader_num - 1) { + nk_label(nk, "·", NK_TEXT_CENTERED); + } else if (nk_button_symbol(nk, NK_SYMBOL_TRIANGLE_DOWN)) { + const struct pl_hook *next_hook = p->shader_hooks[i + 1]; + char *next_path = p->shader_paths[i + 1]; + p->shader_hooks[i + 1] = p->shader_hooks[i]; + p->shader_paths[i + 1] = p->shader_paths[i]; + p->shader_hooks[i] = next_hook; + p->shader_paths[i] = next_path; + } + + if (nk_button_symbol(nk, NK_SYMBOL_X)) { + pl_mpv_user_shader_destroy(&p->shader_hooks[i]); + free(p->shader_paths[i]); + p->shader_num--; + memmove(&p->shader_hooks[i], &p->shader_hooks[i+1], + (p->shader_num - i) * sizeof(void *)); + memmove(&p->shader_paths[i], &p->shader_paths[i+1], + (p->shader_num - i) * sizeof(char *)); + if (i == p->shader_num) + break; + } + + if (p->shader_hooks[i]->num_parameters == 0) { + nk_label(nk, p->shader_paths[i], NK_TEXT_LEFT); + continue; + } + + if (nk_combo_begin_label(nk, p->shader_paths[i], nk_vec2(nk_widget_width(nk), 500))) { + nk_layout_row_dynamic(nk, 32, 1); + for (int j = 0; j < p->shader_hooks[i]->num_parameters; j++) { + const struct pl_hook_par *hp = &p->shader_hooks[i]->parameters[j]; + const char *name = hp->description ? 
hp->description : hp->name; + switch (hp->type) { + case PL_VAR_FLOAT: + nk_property_float(nk, name, hp->minimum.f, + &hp->data->f, hp->maximum.f, + hp->data->f / 100.0f, + hp->data->f / 1000.0f); + break; + case PL_VAR_SINT: + nk_property_int(nk, name, hp->minimum.i, + &hp->data->i, hp->maximum.i, + 1, 1.0f); + break; + case PL_VAR_UINT: { + int min = FFMIN(hp->minimum.u, INT_MAX); + int max = FFMIN(hp->maximum.u, INT_MAX); + int val = FFMIN(hp->data->u, INT_MAX); + nk_property_int(nk, name, min, &val, max, 1, 1); + hp->data->u = val; + break; + } + default: abort(); + } + } + nk_combo_end(nk); + } + } + + par->hooks = p->shader_hooks; + par->num_hooks = p->shader_num; + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Debug", NK_MINIMIZED)) { + nk_layout_row_dynamic(nk, 24, 1); + nk_checkbox_label(nk, "Preserve mixing cache", &par->preserve_mixing_cache); + nk_checkbox_label(nk, "Bypass mixing cache", &par->skip_caching_single_frame); + nk_checkbox_label(nk, "Show all scaler presets", &p->advanced_scalers); + nk_checkbox_label(nk, "Disable linear scaling", &par->disable_linear_scaling); + nk_checkbox_label(nk, "Disable built-in scalers", &par->disable_builtin_scalers); + nk_checkbox_label(nk, "Correct subpixel offsets", &par->correct_subpixel_offsets); + nk_checkbox_label(nk, "Force-enable dither", &par->force_dither); + nk_checkbox_label(nk, "Disable gamma-aware dither", &par->disable_dither_gamma_correction); + nk_checkbox_label(nk, "Disable FBOs / advanced rendering", &par->disable_fbos); + nk_checkbox_label(nk, "Force low-bit depth FBOs", &par->force_low_bit_depth_fbos); + nk_checkbox_label(nk, "Disable constant hard-coding", &par->dynamic_constants); + + if (nk_check_label(nk, "Ignore Dolby Vision metadata", p->ignore_dovi) != p->ignore_dovi) { + // Flush the renderer cache on changes, since this can + // drastically alter the subjective appearance of the stream + pl_renderer_flush_cache(p->renderer); + p->ignore_dovi = !p->ignore_dovi; + } + + nk_layout_row_dynamic(nk, 24, 2); + + double prev_fps = p->fps; + bool fps_changed = nk_checkbox_label(nk, "Override display FPS", &p->fps_override); + nk_property_float(nk, "FPS", 10.0, &p->fps, 240.0, 5, 0.1); + if (fps_changed || p->fps != prev_fps) + p->stats.pts_interval = p->stats.vsync_interval = (struct timing) {0}; + + if (nk_button_label(nk, "Flush renderer cache")) + pl_renderer_flush_cache(p->renderer); + if (nk_button_label(nk, "Recreate renderer")) { + pl_renderer_destroy(&p->renderer); + p->renderer = pl_renderer_create(p->log, p->win->gpu); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Shader passes / GPU timing", NK_MINIMIZED)) { + nk_layout_row_dynamic(nk, 26, 1); + nk_label(nk, "Full frames:", NK_TEXT_LEFT); + for (int i = 0; i < p->num_frame_passes; i++) + draw_shader_pass(nk, &p->frame_info[i]); + + nk_layout_row_dynamic(nk, 26, 1); + nk_label(nk, "Output blending:", NK_TEXT_LEFT); + for (int j = 0; j < MAX_BLEND_FRAMES; j++) { + for (int i = 0; i < p->num_blend_passes[j]; i++) + draw_shader_pass(nk, &p->blend_info[j][i]); + } + + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Frame statistics / CPU timing", NK_MINIMIZED)) { + nk_layout_row_dynamic(nk, 24, 2); + nk_label(nk, "Current PTS:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%.3f", p->stats.current_pts); + nk_label(nk, "Estimated FPS:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%.3f", pl_queue_estimate_fps(p->queue)); + nk_label(nk, "Estimated vsync rate:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%.3f", 
pl_queue_estimate_vps(p->queue)); + nk_label(nk, "Frames rendered:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%"PRIu32, p->stats.rendered); + nk_label(nk, "Decoded frames", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%"PRIu32, atomic_load(&p->stats.decoded)); + nk_label(nk, "Dropped frames:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%"PRIu32, p->stats.dropped); + nk_label(nk, "Missed timestamps:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%"PRIu32" (%.3f ms)", + p->stats.missed, p->stats.missed_ms); + nk_label(nk, "Times stalled:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%"PRIu32" (%.3f ms)", + p->stats.stalled, p->stats.stalled_ms); + draw_timing(nk, "Acquire FBO:", &p->stats.acquire); + draw_timing(nk, "Update queue:", &p->stats.update); + draw_timing(nk, "Render frame:", &p->stats.render); + draw_timing(nk, "Draw interface:", &p->stats.draw_ui); + draw_timing(nk, "Voluntary sleep:", &p->stats.sleep); + draw_timing(nk, "Submit frame:", &p->stats.submit); + draw_timing(nk, "Swap buffers:", &p->stats.swap); + draw_timing(nk, "Vsync interval:", &p->stats.vsync_interval); + draw_timing(nk, "PTS interval:", &p->stats.pts_interval); + + if (nk_button_label(nk, "Reset statistics")) + memset(&p->stats, 0, sizeof(p->stats)); + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Settings dump", NK_MINIMIZED)) { + + nk_layout_row_dynamic(nk, 24, 2); + if (nk_button_label(nk, "Copy to clipboard")) + window_set_clipboard(p->win, pl_options_save(opts)); + if (nk_button_label(nk, "Load from clipboard")) + pl_options_load(opts, window_get_clipboard(p->win)); + + nk_layout_row_dynamic(nk, 24, 1); + pl_options_iterate(opts, draw_opt_data, nk); + nk_tree_pop(nk); + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Cache statistics", NK_MINIMIZED)) { + nk_layout_row_dynamic(nk, 24, 2); + nk_label(nk, "Cached objects:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%d", pl_cache_objects(p->cache)); + nk_label(nk, "Total size:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%zu", pl_cache_size(p->cache)); + nk_label(nk, "Maximum total size:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%zu", p->cache->params.max_total_size); + nk_label(nk, "Maximum object size:", NK_TEXT_LEFT); + nk_labelf(nk, NK_TEXT_LEFT, "%zu", p->cache->params.max_object_size); + + if (nk_button_label(nk, "Clear cache")) + pl_cache_reset(p->cache); + if (nk_button_label(nk, "Save cache")) { + FILE *file = fopen(p->cache_file, "wb"); + if (file) { + pl_cache_save_file(p->cache, file); + fclose(file); + } + } + + if (nk_tree_push(nk, NK_TREE_NODE, "Object list", NK_MINIMIZED)) { + nk_layout_row_dynamic(nk, 24, 1); + pl_cache_iterate(p->cache, draw_cache_line, nk); + nk_tree_pop(nk); + } + + nk_tree_pop(nk); + } + + nk_tree_pop(nk); + } + } + nk_end(nk); +} + +#else +void update_settings(struct plplay *p, const struct pl_frame *target) { } +#endif // HAVE_NUKLEAR diff --git a/demos/ui.c b/demos/ui.c new file mode 100644 index 0000000..6cdc7c6 --- /dev/null +++ b/demos/ui.c @@ -0,0 +1,221 @@ +// License: CC0 / Public Domain + +#define NK_IMPLEMENTATION +#include "ui.h" + +#include <libplacebo/dispatch.h> +#include <libplacebo/shaders/custom.h> + +struct ui_vertex { + float pos[2]; + float coord[2]; + uint8_t color[4]; +}; + +#define NUM_VERTEX_ATTRIBS 3 + +struct ui { + pl_gpu gpu; + pl_dispatch dp; + struct nk_context nk; + struct nk_font_atlas atlas; + struct nk_buffer cmds, verts, idx; + pl_tex font_tex; + struct pl_vertex_attrib attribs_pl[NUM_VERTEX_ATTRIBS]; + struct nk_draw_vertex_layout_element 
attribs_nk[NUM_VERTEX_ATTRIBS+1]; + struct nk_convert_config convert_cfg; +}; + +struct ui *ui_create(pl_gpu gpu) +{ + struct ui *ui = malloc(sizeof(struct ui)); + if (!ui) + return NULL; + + *ui = (struct ui) { + .gpu = gpu, + .dp = pl_dispatch_create(gpu->log, gpu), + .attribs_pl = { + { + .name = "pos", + .offset = offsetof(struct ui_vertex, pos), + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + }, { + .name = "coord", + .offset = offsetof(struct ui_vertex, coord), + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + }, { + .name = "vcolor", + .offset = offsetof(struct ui_vertex, color), + .fmt = pl_find_named_fmt(gpu, "rgba8"), + } + }, + .attribs_nk = { + {NK_VERTEX_POSITION, NK_FORMAT_FLOAT, offsetof(struct ui_vertex, pos)}, + {NK_VERTEX_TEXCOORD, NK_FORMAT_FLOAT, offsetof(struct ui_vertex, coord)}, + {NK_VERTEX_COLOR, NK_FORMAT_R8G8B8A8, offsetof(struct ui_vertex, color)}, + {NK_VERTEX_LAYOUT_END} + }, + .convert_cfg = { + .vertex_layout = ui->attribs_nk, + .vertex_size = sizeof(struct ui_vertex), + .vertex_alignment = NK_ALIGNOF(struct ui_vertex), + .shape_AA = NK_ANTI_ALIASING_ON, + .line_AA = NK_ANTI_ALIASING_ON, + .circle_segment_count = 22, + .curve_segment_count = 22, + .arc_segment_count = 22, + .global_alpha = 1.0f, + }, + }; + + // Initialize font atlas using built-in font + nk_font_atlas_init_default(&ui->atlas); + nk_font_atlas_begin(&ui->atlas); + struct nk_font *font = nk_font_atlas_add_default(&ui->atlas, 20, NULL); + struct pl_tex_params tparams = { + .format = pl_find_named_fmt(gpu, "r8"), + .sampleable = true, + .initial_data = nk_font_atlas_bake(&ui->atlas, &tparams.w, &tparams.h, + NK_FONT_ATLAS_ALPHA8), + .debug_tag = PL_DEBUG_TAG, + }; + ui->font_tex = pl_tex_create(gpu, &tparams); + nk_font_atlas_end(&ui->atlas, nk_handle_ptr((void *) ui->font_tex), + &ui->convert_cfg.tex_null); + nk_font_atlas_cleanup(&ui->atlas); + + if (!ui->font_tex) + goto error; + + // Initialize nuklear state + if (!nk_init_default(&ui->nk, &font->handle)) { + fprintf(stderr, "NK: failed initializing UI!\n"); + goto error; + } + + nk_buffer_init_default(&ui->cmds); + nk_buffer_init_default(&ui->verts); + nk_buffer_init_default(&ui->idx); + + return ui; + +error: + ui_destroy(&ui); + return NULL; +} + +void ui_destroy(struct ui **ptr) +{ + struct ui *ui = *ptr; + if (!ui) + return; + + nk_buffer_free(&ui->cmds); + nk_buffer_free(&ui->verts); + nk_buffer_free(&ui->idx); + nk_free(&ui->nk); + nk_font_atlas_clear(&ui->atlas); + pl_tex_destroy(ui->gpu, &ui->font_tex); + pl_dispatch_destroy(&ui->dp); + + free(ui); + *ptr = NULL; +} + +void ui_update_input(struct ui *ui, const struct window *win) +{ + int x, y; + window_get_cursor(win, &x, &y); + nk_input_begin(&ui->nk); + nk_input_motion(&ui->nk, x, y); + nk_input_button(&ui->nk, NK_BUTTON_LEFT, x, y, window_get_button(win, BTN_LEFT)); + nk_input_button(&ui->nk, NK_BUTTON_RIGHT, x, y, window_get_button(win, BTN_RIGHT)); + nk_input_button(&ui->nk, NK_BUTTON_MIDDLE, x, y, window_get_button(win, BTN_MIDDLE)); + struct nk_vec2 scroll; + window_get_scroll(win, &scroll.x, &scroll.y); + nk_input_scroll(&ui->nk, scroll); + nk_input_end(&ui->nk); +} + +struct nk_context *ui_get_context(struct ui *ui) +{ + return &ui->nk; +} + +bool ui_draw(struct ui *ui, const struct pl_swapchain_frame *frame) +{ + if (nk_convert(&ui->nk, &ui->cmds, &ui->verts, &ui->idx, &ui->convert_cfg) != NK_CONVERT_SUCCESS) { + fprintf(stderr, "NK: failed converting draw commands!\n"); + return false; + } + + const struct nk_draw_command *cmd = NULL; + const uint8_t *vertices = 
nk_buffer_memory(&ui->verts); + const nk_draw_index *indices = nk_buffer_memory(&ui->idx); + nk_draw_foreach(cmd, &ui->nk, &ui->cmds) { + if (!cmd->elem_count) + continue; + + pl_shader sh = pl_dispatch_begin(ui->dp); + pl_shader_custom(sh, &(struct pl_custom_shader) { + .description = "nuklear UI", + .body = "color = textureLod(ui_tex, coord, 0.0).r * vcolor;", + .output = PL_SHADER_SIG_COLOR, + .num_descriptors = 1, + .descriptors = &(struct pl_shader_desc) { + .desc = { + .name = "ui_tex", + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = cmd->texture.ptr, + .sample_mode = PL_TEX_SAMPLE_NEAREST, + }, + }, + }); + + struct pl_color_repr repr = frame->color_repr; + pl_shader_color_map_ex(sh, NULL, pl_color_map_args( + .src = pl_color_space_srgb, + .dst = frame->color_space, + )); + pl_shader_encode_color(sh, &repr); + + bool ok = pl_dispatch_vertex(ui->dp, pl_dispatch_vertex_params( + .shader = &sh, + .target = frame->fbo, + .blend_params = &pl_alpha_overlay, + .scissors = { + .x0 = cmd->clip_rect.x, + .y0 = cmd->clip_rect.y, + .x1 = cmd->clip_rect.x + cmd->clip_rect.w, + .y1 = cmd->clip_rect.y + cmd->clip_rect.h, + }, + .vertex_attribs = ui->attribs_pl, + .num_vertex_attribs = NUM_VERTEX_ATTRIBS, + .vertex_stride = sizeof(struct ui_vertex), + .vertex_position_idx = 0, + .vertex_coords = PL_COORDS_ABSOLUTE, + .vertex_flipped = frame->flipped, + .vertex_type = PL_PRIM_TRIANGLE_LIST, + .vertex_count = cmd->elem_count, + .vertex_data = vertices, + .index_data = indices, + .index_fmt = PL_INDEX_UINT32, + )); + + if (!ok) { + fprintf(stderr, "placebo: failed rendering UI!\n"); + return false; + } + + indices += cmd->elem_count; + } + + nk_clear(&ui->nk); + nk_buffer_clear(&ui->cmds); + nk_buffer_clear(&ui->verts); + nk_buffer_clear(&ui->idx); + return true; +} diff --git a/demos/ui.h b/demos/ui.h new file mode 100644 index 0000000..9344e68 --- /dev/null +++ b/demos/ui.h @@ -0,0 +1,59 @@ +// License: CC0 / Public Domain +#pragma once + +#define NK_INCLUDE_FIXED_TYPES +#define NK_INCLUDE_DEFAULT_ALLOCATOR +#define NK_INCLUDE_STANDARD_IO +#define NK_INCLUDE_STANDARD_BOOL +#define NK_INCLUDE_STANDARD_VARARGS +#define NK_INCLUDE_VERTEX_BUFFER_OUTPUT +#define NK_INCLUDE_FONT_BAKING +#define NK_INCLUDE_DEFAULT_FONT +#define NK_BUTTON_TRIGGER_ON_RELEASE +#define NK_UINT_DRAW_INDEX +#include <nuklear.h> + +#include "common.h" +#include "window.h" + +struct ui; + +struct ui *ui_create(pl_gpu gpu); +void ui_destroy(struct ui **ui); + +// Update/Logic/Draw cycle +void ui_update_input(struct ui *ui, const struct window *window); +struct nk_context *ui_get_context(struct ui *ui); +bool ui_draw(struct ui *ui, const struct pl_swapchain_frame *frame); + +// Helper function to draw a custom widget for drag&drop operations, returns +// true if the widget is hovered +static inline bool ui_widget_hover(struct nk_context *nk, const char *label) +{ + struct nk_rect bounds; + if (!nk_widget(&bounds, nk)) + return false; + + struct nk_command_buffer *canvas = nk_window_get_canvas(nk); + bool hover = nk_input_is_mouse_hovering_rect(&nk->input, bounds); + + float h, s, v; + nk_color_hsv_f(&h, &s, &v, nk->style.window.background); + struct nk_color background = nk_hsv_f(h, s, v + (hover ? 
0.1f : -0.02f)); + struct nk_color border = nk_hsv_f(h, s, v + 0.20f); + nk_fill_rect(canvas, bounds, 0.0f, background); + nk_stroke_rect(canvas, bounds, 0.0f, 2.0f, border); + + const float pad = 10.0f; + struct nk_rect text = { + .x = bounds.x + pad, + .y = bounds.y + pad, + .w = bounds.w - 2 * pad, + .h = bounds.h - 2 * pad, + }; + + nk_draw_text(canvas, text, label, nk_strlen(label), nk->style.font, + background, nk->style.text.color); + + return hover; +} diff --git a/demos/utils.c b/demos/utils.c new file mode 100644 index 0000000..7c95d00 --- /dev/null +++ b/demos/utils.c @@ -0,0 +1,49 @@ +// License: CC0 / Public Domain + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "utils.h" +#include "../src/os.h" + +#ifdef PL_HAVE_WIN32 +#include <shlobj.h> +#else +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <pwd.h> +#endif + +const char *get_cache_dir(char (*buf)[512]) +{ + // Check if XDG_CACHE_HOME is set for Linux/BSD + const char* xdg_cache_home = getenv("XDG_CACHE_HOME"); + if (xdg_cache_home) + return xdg_cache_home; + +#ifdef _WIN32 + const char* local_app_data = getenv("LOCALAPPDATA"); + if (local_app_data) + return local_app_data; +#endif + +#ifdef __APPLE__ + struct passwd* pw = getpwuid(getuid()); + if (pw) { + int ret = snprintf(*buf, sizeof(*buf), "%s/%s", pw->pw_dir, "Library/Caches"); + if (ret > 0 && ret < sizeof(*buf)) + return *buf; + } +#endif + + const char* home = getenv("HOME"); + if (home) { + int ret = snprintf(*buf, sizeof(*buf), "%s/.cache", home); + if (ret > 0 && ret < sizeof(*buf)) + return *buf; + } + + return NULL; +} diff --git a/demos/utils.h b/demos/utils.h new file mode 100644 index 0000000..e6650c3 --- /dev/null +++ b/demos/utils.h @@ -0,0 +1,5 @@ +// License: CC0 / Public Domain +#pragma once +#include "common.h" + +const char *get_cache_dir(char (*buf)[512]); diff --git a/demos/video-filtering.c b/demos/video-filtering.c new file mode 100644 index 0000000..5881c28 --- /dev/null +++ b/demos/video-filtering.c @@ -0,0 +1,871 @@ +/* Presented are two hypothetical scenarios of how one might use libplacebo + * as something like an FFmpeg or mpv video filter. We examine two example + * APIs (loosely modeled after real video filtering APIs) and how each style + * would like to use libplacebo. + * + * For sake of a simple example, let's assume this is a debanding filter. + * For those of you too lazy to compile/run this file but still want to see + * results, these are from my machine (RX 5700 XT + 1950X, as of 2020-05-25): + * + * RADV+ACO: + * api1: 10000 frames in 16.328440 s => 1.632844 ms/frame (612.43 fps) + * render: 0.113524 ms, upload: 0.127551 ms, download: 0.146097 ms + * api2: 10000 frames in 5.335634 s => 0.533563 ms/frame (1874.19 fps) + * render: 0.064378 ms, upload: 0.000000 ms, download: 0.189719 ms + * + * AMDVLK: + * api1: 10000 frames in 14.921859 s => 1.492186 ms/frame (670.16 fps) + * render: 0.110603 ms, upload: 0.114412 ms, download: 0.115375 ms + * api2: 10000 frames in 4.667386 s => 0.466739 ms/frame (2142.53 fps) + * render: 0.030781 ms, upload: 0.000000 ms, download: 0.075237 ms + * + * You can see that AMDVLK is still better at doing texture streaming than + * RADV - this is because as of writing RADV still does not support + * asynchronous texture queues / DMA engine transfers. 
If we disable the + * `async_transfer` option with AMDVLK we get this: + * + * api1: 10000 frames in 16.087723 s => 1.608772 ms/frame (621.59 fps) + * render: 0.111154 ms, upload: 0.122476 ms, download: 0.133162 ms + * api2: 10000 frames in 6.344959 s => 0.634496 ms/frame (1576.05 fps) + * render: 0.031307 ms, upload: 0.000000 ms, download: 0.083520 ms + * + * License: CC0 / Public Domain + */ + +#include <assert.h> +#include <stdlib.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> + +#include "common.h" +#include "pl_clock.h" +#include "pl_thread.h" + +#ifdef _WIN32 +#include <windows.h> +#endif + +#include <libplacebo/dispatch.h> +#include <libplacebo/shaders/sampling.h> +#include <libplacebo/utils/upload.h> +#include <libplacebo/vulkan.h> + +/////////////////////// +/// API definitions /// +/////////////////////// + +// Stuff that would be common to each API + +void *init(void); +void uninit(void *priv); + +struct format { + // For simplicity let's make a few assumptions here, since configuring the + // texture format is not the point of this example. (In practice you can + // go nuts with the `utils/upload.h` helpers) + // + // - All formats contain unsigned integers only + // - All components have the same size in bits + // - All components are in the "canonical" order + // - All formats have power of two sizes only (2 or 4 components, not 3) + // - All plane strides are a multiple of the pixel size + int num_comps; + int bitdepth; +}; + +struct plane { + int subx, suby; // subsampling shift + struct format fmt; + size_t stride; + void *data; +}; + +#define MAX_PLANES 4 + +struct image { + int width, height; + int num_planes; + struct plane planes[MAX_PLANES]; + + // For API #2, the associated mapped buffer (if any) + struct api2_buf *associated_buf; +}; + + +// Example API design #1: synchronous, blocking, double-copy (bad!) +// +// In this API, `api1_filter` must immediately return with the new data. +// This prevents parallelism on the GPU and should be avoided if possible, +// but sometimes that's what you have to work with. So this is what it +// would look like. +// +// Also, let's assume this API design reconfigures the filter chain (using +// a blank `proxy` image every time the image format or dimensions change, +// and doesn't expect us to fail due to format mismatches or resource +// exhaustion afterwards. + +bool api1_reconfig(void *priv, const struct image *proxy); +bool api1_filter(void *priv, struct image *dst, struct image *src); + + +// Example API design #2: asynchronous, streaming, queued, zero-copy (good!) +// +// In this API, `api2_process` will run by the calling code every so often +// (e.g. when new data is available or expected). This function has access +// to non-blocking functions `get_image` and `put_image` that interface +// with the video filtering engine's internal queueing system. +// +// This API is also designed to feed multiple frames ahead of time, i.e. +// it will feed us as many frames as it can while we're still returning +// `API2_WANT_MORE`. To drain the filter chain, it would continue running +// the process function until `API2_HAVE_MORE` is no longer present +// in the output. +// +// This API is also designed to do zero-copy where possible. When it wants +// to create a data buffer of a given size, it will call our function +// `api2_alloc` which will return a buffer that we can process directly. +// We can use this to do zero-copy uploading to the GPU, by creating +// host-visible persistently mapped buffers. 
In order to prevent the video +// filtering system from re-using our buffers while copies are happening, we +// use special functions `image_lock` and `image_unlock` to increase a +// refcount on the image's backing storage. (As is typical of such APIs) +// +// Finally, this API is designed to be fully dynamic: The image parameters +// could change at any time, and we must be equipped to handle that. + +enum api2_status { + // Negative values are used to signal error conditions + API2_ERR_FMT = -2, // incompatible / unsupported format + API2_ERR_UNKNOWN = -1, // some other error happened + API2_OK = 0, // no error, no status - everything's good + + // Positive values represent a mask of status conditions + API2_WANT_MORE = (1 << 0), // we want more frames, please feed some more! + API2_HAVE_MORE = (1 << 1), // we have more frames but they're not ready +}; + +enum api2_status api2_process(void *priv); + +// Functions for creating persistently mapped buffers +struct api2_buf { + void *data; + size_t size; + void *priv; +}; + +bool api2_alloc(void *priv, size_t size, struct api2_buf *out); +void api2_free(void *priv, const struct api2_buf *buf); + +// These functions are provided by the API. The exact details of how images +// are enqueued, dequeued and locked are not really important here, so just +// do something unrealistic but simple to demonstrate with. +struct image *get_image(void); +void put_image(struct image *img); +void image_lock(struct image *img); +void image_unlock(struct image *img); + + +///////////////////////////////// +/// libplacebo implementation /// +///////////////////////////////// + + +// For API #2: +#define PARALLELISM 8 + +struct entry { + pl_buf buf; // to stream the download + pl_tex tex_in[MAX_PLANES]; + pl_tex tex_out[MAX_PLANES]; + struct image image; + + // For entries that are associated with a held image, so we can unlock them + // as soon as possible + struct image *held_image; + pl_buf held_buf; +}; + +// For both APIs: +struct priv { + pl_log log; + pl_vulkan vk; + pl_gpu gpu; + pl_dispatch dp; + pl_shader_obj dither_state; + + // Timer objects + pl_timer render_timer; + pl_timer upload_timer; + pl_timer download_timer; + uint64_t render_sum; + uint64_t upload_sum; + uint64_t download_sum; + int render_count; + int upload_count; + int download_count; + + // API #1: A simple pair of input and output textures + pl_tex tex_in[MAX_PLANES]; + pl_tex tex_out[MAX_PLANES]; + + // API #2: A ring buffer of textures/buffers for streaming + int idx_in; // points the next free entry + int idx_out; // points to the first entry still in progress + struct entry entries[PARALLELISM]; +}; + +void *init(void) { + struct priv *p = calloc(1, sizeof(struct priv)); + if (!p) + return NULL; + + p->log = pl_log_create(PL_API_VER, pl_log_params( + .log_cb = pl_log_simple, + .log_level = PL_LOG_WARN, + )); + + p->vk = pl_vulkan_create(p->log, pl_vulkan_params( + // Note: This is for API #2. In API #1 you could just pass params=NULL + // and it wouldn't really matter much. 
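+ // These options let transfers and compute run on the GPU's dedicated async queue families, and queue_count requests enough queues to keep all PARALLELISM entries in flight at once.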
+ .async_transfer = true, + .async_compute = true, + .queue_count = PARALLELISM, + )); + + if (!p->vk) { + fprintf(stderr, "Failed creating vulkan context\n"); + goto error; + } + + // Give this a shorter name for convenience + p->gpu = p->vk->gpu; + + p->dp = pl_dispatch_create(p->log, p->gpu); + if (!p->dp) { + fprintf(stderr, "Failed creating shader dispatch object\n"); + goto error; + } + + p->render_timer = pl_timer_create(p->gpu); + p->upload_timer = pl_timer_create(p->gpu); + p->download_timer = pl_timer_create(p->gpu); + + return p; + +error: + uninit(p); + return NULL; +} + +void uninit(void *priv) +{ + struct priv *p = priv; + + // API #1 + for (int i = 0; i < MAX_PLANES; i++) { + pl_tex_destroy(p->gpu, &p->tex_in[i]); + pl_tex_destroy(p->gpu, &p->tex_out[i]); + } + + // API #2 + for (int i = 0; i < PARALLELISM; i++) { + pl_buf_destroy(p->gpu, &p->entries[i].buf); + for (int j = 0; j < MAX_PLANES; j++) { + pl_tex_destroy(p->gpu, &p->entries[i].tex_in[j]); + pl_tex_destroy(p->gpu, &p->entries[i].tex_out[j]); + } + if (p->entries[i].held_image) + image_unlock(p->entries[i].held_image); + } + + pl_timer_destroy(p->gpu, &p->render_timer); + pl_timer_destroy(p->gpu, &p->upload_timer); + pl_timer_destroy(p->gpu, &p->download_timer); + + pl_shader_obj_destroy(&p->dither_state); + pl_dispatch_destroy(&p->dp); + pl_vulkan_destroy(&p->vk); + pl_log_destroy(&p->log); + + free(p); +} + +// Helper function to set up the `pl_plane_data` struct from the image params +static void setup_plane_data(const struct image *img, + struct pl_plane_data out[MAX_PLANES]) +{ + for (int i = 0; i < img->num_planes; i++) { + const struct plane *plane = &img->planes[i]; + + out[i] = (struct pl_plane_data) { + .type = PL_FMT_UNORM, + .width = img->width >> plane->subx, + .height = img->height >> plane->suby, + .pixel_stride = plane->fmt.num_comps * plane->fmt.bitdepth / 8, + .row_stride = plane->stride, + .pixels = plane->data, + }; + + // For API 2 (direct rendering) + if (img->associated_buf) { + pl_buf buf = img->associated_buf->priv; + out[i].pixels = NULL; + out[i].buf = buf; + out[i].buf_offset = (uintptr_t) plane->data - (uintptr_t) buf->data; + } + + for (int c = 0; c < plane->fmt.num_comps; c++) { + out[i].component_size[c] = plane->fmt.bitdepth; + out[i].component_pad[c] = 0; + out[i].component_map[c] = c; + } + } +} + +static bool do_plane(struct priv *p, pl_tex dst, pl_tex src) +{ + int new_depth = dst->params.format->component_depth[0]; + + // Do some debanding, and then also make sure to dither to the new depth + // so that our debanded gradients are actually preserved well + pl_shader sh = pl_dispatch_begin(p->dp); + pl_shader_deband(sh, pl_sample_src( .tex = src ), NULL); + pl_shader_dither(sh, new_depth, &p->dither_state, NULL); + return pl_dispatch_finish(p->dp, pl_dispatch_params( + .shader = &sh, + .target = dst, + .timer = p->render_timer, + )); +} + +static void check_timers(struct priv *p) +{ + uint64_t ret; + + while ((ret = pl_timer_query(p->gpu, p->render_timer))) { + p->render_sum += ret; + p->render_count++; + } + + while ((ret = pl_timer_query(p->gpu, p->upload_timer))) { + p->upload_sum += ret; + p->upload_count++; + } + + while ((ret = pl_timer_query(p->gpu, p->download_timer))) { + p->download_sum += ret; + p->download_count++; + } +} + +// API #1 implementation: +// +// In this design, we will create all GPU resources inside `reconfig`, based on +// the texture format configured from the proxy image. This will avoid failing +// later on due to e.g. 
resource exhaustion or texture format mismatch, and +// thereby falls within the intended semantics of this style of API. + +bool api1_reconfig(void *priv, const struct image *proxy) +{ + struct priv *p = priv; + struct pl_plane_data data[MAX_PLANES]; + setup_plane_data(proxy, data); + + for (int i = 0; i < proxy->num_planes; i++) { + pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[i]); + if (!fmt) { + fprintf(stderr, "Failed configuring filter: no good texture format!\n"); + return false; + } + + bool ok = true; + ok &= pl_tex_recreate(p->gpu, &p->tex_in[i], pl_tex_params( + .w = data[i].width, + .h = data[i].height, + .format = fmt, + .sampleable = true, + .host_writable = true, + )); + + ok &= pl_tex_recreate(p->gpu, &p->tex_out[i], pl_tex_params( + .w = data[i].width, + .h = data[i].height, + .format = fmt, + .renderable = true, + .host_readable = true, + )); + + if (!ok) { + fprintf(stderr, "Failed creating GPU textures!\n"); + return false; + } + } + + return true; +} + +bool api1_filter(void *priv, struct image *dst, struct image *src) +{ + struct priv *p = priv; + struct pl_plane_data data[MAX_PLANES]; + setup_plane_data(src, data); + + // Upload planes + for (int i = 0; i < src->num_planes; i++) { + bool ok = pl_tex_upload(p->gpu, pl_tex_transfer_params( + .tex = p->tex_in[i], + .row_pitch = data[i].row_stride, + .ptr = src->planes[i].data, + .timer = p->upload_timer, + )); + + if (!ok) { + fprintf(stderr, "Failed uploading data to the GPU!\n"); + return false; + } + } + + // Process planes + for (int i = 0; i < src->num_planes; i++) { + if (!do_plane(p, p->tex_out[i], p->tex_in[i])) { + fprintf(stderr, "Failed processing planes!\n"); + return false; + } + } + + // Download planes + for (int i = 0; i < src->num_planes; i++) { + bool ok = pl_tex_download(p->gpu, pl_tex_transfer_params( + .tex = p->tex_out[i], + .row_pitch = dst->planes[i].stride, + .ptr = dst->planes[i].data, + .timer = p->download_timer, + )); + + if (!ok) { + fprintf(stderr, "Failed downloading data from the GPU!\n"); + return false; + } + } + + check_timers(p); + return true; +} + + +// API #2 implementation: +// +// In this implementation we maintain a queue (implemented as ring buffer) +// of "work entries", which are isolated structs that hold independent GPU +// resources - so that the GPU has no cross-entry dependencies on any of the +// textures or other resources. (Side note: It still has a dependency on the +// dither state, but this is just a shared LUT anyway) + +// Align up to the nearest multiple of a power of two +#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1)) + +static enum api2_status submit_work(struct priv *p, struct entry *e, + struct image *img) +{ + // If the image comes from a mapped buffer, we have to take a lock + // while our upload is in progress + if (img->associated_buf) { + assert(!e->held_image); + image_lock(img); + e->held_image = img; + e->held_buf = img->associated_buf->priv; + } + + // Upload this image's data + struct pl_plane_data data[MAX_PLANES]; + setup_plane_data(img, data); + + for (int i = 0; i < img->num_planes; i++) { + pl_fmt fmt = pl_plane_find_fmt(p->gpu, NULL, &data[i]); + if (!fmt) + return API2_ERR_FMT; + + // FIXME: can we plumb a `pl_timer` in here somehow? 
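+ // (pl_upload_plane() is a convenience helper that picks a suitable
+ // texture format, (re)creates e->tex_in[i] if needed and uploads the
+ // plane data in one call; when data[i].buf is set - the mapped-buffer
+ // path above - the upload is sourced directly from that buffer.)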
+ if (!pl_upload_plane(p->gpu, NULL, &e->tex_in[i], &data[i])) + return API2_ERR_UNKNOWN; + + // Re-create the target FBO as well with this format if necessary + bool ok = pl_tex_recreate(p->gpu, &e->tex_out[i], pl_tex_params( + .w = data[i].width, + .h = data[i].height, + .format = fmt, + .renderable = true, + .host_readable = true, + )); + if (!ok) + return API2_ERR_UNKNOWN; + } + + // Dispatch the work for this image + for (int i = 0; i < img->num_planes; i++) { + if (!do_plane(p, e->tex_out[i], e->tex_in[i])) + return API2_ERR_UNKNOWN; + } + + // Set up the resulting `struct image` that will hold our target + // data. We just copy the format etc. from the source image + memcpy(&e->image, img, sizeof(struct image)); + + size_t offset[MAX_PLANES], stride[MAX_PLANES], total_size = 0; + for (int i = 0; i < img->num_planes; i++) { + // For performance, we want to make sure we align the stride + // to a multiple of the GPU's preferred texture transfer stride + // (This is entirely optional) + stride[i] = ALIGN2(img->planes[i].stride, + p->gpu->limits.align_tex_xfer_pitch); + int height = img->height >> img->planes[i].suby; + + // Round up the offset to the nearest multiple of the optimal + // transfer alignment. (This is also entirely optional) + offset[i] = ALIGN2(total_size, p->gpu->limits.align_tex_xfer_offset); + total_size = offset[i] + stride[i] * height; + } + + // Dispatch the asynchronous download into a mapped buffer + bool ok = pl_buf_recreate(p->gpu, &e->buf, pl_buf_params( + .size = total_size, + .host_mapped = true, + )); + if (!ok) + return API2_ERR_UNKNOWN; + + for (int i = 0; i < img->num_planes; i++) { + ok = pl_tex_download(p->gpu, pl_tex_transfer_params( + .tex = e->tex_out[i], + .row_pitch = stride[i], + .buf = e->buf, + .buf_offset = offset[i], + .timer = p->download_timer, + )); + if (!ok) + return API2_ERR_UNKNOWN; + + // Update the output fields + e->image.planes[i].data = e->buf->data + offset[i]; + e->image.planes[i].stride = stride[i]; + } + + // Make sure this work starts processing in the background, and especially + // so we can move on to the next queue on the gPU + pl_gpu_flush(p->gpu); + return API2_OK; +} + +enum api2_status api2_process(void *priv) +{ + struct priv *p = priv; + enum api2_status ret = 0; + + // Opportunistically release any held images. We do this across the ring + // buffer, rather than doing this as part of the following loop, because + // we want to release images ahead-of-time (no FIFO constraints) + for (int i = 0; i < PARALLELISM; i++) { + struct entry *e = &p->entries[i]; + if (e->held_image && !pl_buf_poll(p->gpu, e->held_buf, 0)) { + // upload buffer is no longer in use, release it + image_unlock(e->held_image); + e->held_image = NULL; + e->held_buf = NULL; + } + } + + // Poll the status of existing entries and dequeue the ones that are done + while (p->idx_out != p->idx_in) { + struct entry *e = &p->entries[p->idx_out]; + if (pl_buf_poll(p->gpu, e->buf, 0)) + break; + + if (e->held_image) { + image_unlock(e->held_image); + e->held_image = NULL; + e->held_buf = NULL; + } + + // download buffer is no longer busy, dequeue the frame + put_image(&e->image); + p->idx_out = (p->idx_out + 1) % PARALLELISM; + } + + // Fill up the queue with more work + int last_free_idx = (p->idx_out ? 
p->idx_out : PARALLELISM) - 1; + while (p->idx_in != last_free_idx) { + struct image *img = get_image(); + if (!img) { + ret |= API2_WANT_MORE; + break; + } + + enum api2_status err = submit_work(p, &p->entries[p->idx_in], img); + if (err < 0) + return err; + + p->idx_in = (p->idx_in + 1) % PARALLELISM; + } + + if (p->idx_out != p->idx_in) + ret |= API2_HAVE_MORE; + + return ret; +} + +bool api2_alloc(void *priv, size_t size, struct api2_buf *out) +{ + struct priv *p = priv; + if (!p->gpu->limits.buf_transfer || size > p->gpu->limits.max_mapped_size) + return false; + + pl_buf buf = pl_buf_create(p->gpu, pl_buf_params( + .size = size, + .host_mapped = true, + )); + + if (!buf) + return false; + + *out = (struct api2_buf) { + .data = buf->data, + .size = size, + .priv = (void *) buf, + }; + return true; +} + +void api2_free(void *priv, const struct api2_buf *buf) +{ + struct priv *p = priv; + pl_buf plbuf = buf->priv; + pl_buf_destroy(p->gpu, &plbuf); +} + + +//////////////////////////////////// +/// Proof of Concept / Benchmark /// +//////////////////////////////////// + +#define FRAMES 10000 + +// Let's say we're processing a 1920x1080 4:2:0 8-bit NV12 video, arbitrarily +// with a stride aligned to 256 bytes. (For no particular reason) +#define TEXELSZ sizeof(uint8_t) +#define WIDTH 1920 +#define HEIGHT 1080 +#define STRIDE (ALIGN2(WIDTH, 256) * TEXELSZ) +// Subsampled planes +#define SWIDTH (WIDTH >> 1) +#define SHEIGHT (HEIGHT >> 1) +#define SSTRIDE (ALIGN2(SWIDTH, 256) * TEXELSZ) +// Plane offsets / sizes +#define SIZE0 (HEIGHT * STRIDE) +#define SIZE1 (2 * SHEIGHT * SSTRIDE) +#define OFFSET0 0 +#define OFFSET1 SIZE0 +#define BUFSIZE (OFFSET1 + SIZE1) + +// Skeleton of an example image +static const struct image example_image = { + .width = WIDTH, + .height = HEIGHT, + .num_planes = 2, + .planes = { + { + .subx = 0, + .suby = 0, + .stride = STRIDE, + .fmt = { + .num_comps = 1, + .bitdepth = 8 * TEXELSZ, + }, + }, { + .subx = 1, + .suby = 1, + .stride = SSTRIDE * 2, + .fmt = { + .num_comps = 2, + .bitdepth = 8 * TEXELSZ, + }, + }, + }, +}; + +// API #1: Nice and simple (but slow) +static void api1_example(void) +{ + struct priv *vf = init(); + if (!vf) + return; + + if (!api1_reconfig(vf, &example_image)) { + fprintf(stderr, "api1: Failed configuring video filter!\n"); + return; + } + + // Allocate two buffers to hold the example data, and fill the source + // buffer arbitrarily with a "simple" pattern. (Decoding the data into + // the buffer is not meant to be part of this benchmark) + uint8_t *srcbuf = malloc(BUFSIZE), + *dstbuf = malloc(BUFSIZE); + if (!srcbuf || !dstbuf) + goto done; + + for (size_t i = 0; i < BUFSIZE; i++) + srcbuf[i] = i; + + struct image src = example_image, dst = example_image; + src.planes[0].data = srcbuf + OFFSET0; + src.planes[1].data = srcbuf + OFFSET1; + dst.planes[0].data = dstbuf + OFFSET0; + dst.planes[1].data = dstbuf + OFFSET1; + + const pl_clock_t start = pl_clock_now(); + + // Process this dummy frame a bunch of times + unsigned frames = 0; + for (frames = 0; frames < FRAMES; frames++) { + if (!api1_filter(vf, &dst, &src)) { + fprintf(stderr, "api1: Failed filtering frame... 
aborting\n"); + break; + } + } + + const pl_clock_t stop = pl_clock_now(); + const float secs = pl_clock_diff(stop, start); + + printf("api1: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n", + frames, secs, 1000 * secs / frames, frames / secs); + + if (vf->render_count) { + printf(" render: %f ms, upload: %f ms, download: %f ms\n", + 1e-6 * vf->render_sum / vf->render_count, + vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0, + vf->download_count ? (1e-6 * vf->download_sum / vf->download_count) : 0.0); + } + +done: + free(srcbuf); + free(dstbuf); + uninit(vf); +} + + +// API #2: Pretend we have some fancy pool of images. +#define POOLSIZE (PARALLELISM + 1) + +static struct api2_buf buffers[POOLSIZE] = {0}; +static struct image images[POOLSIZE] = {0}; +static int refcount[POOLSIZE] = {0}; +static unsigned api2_frames_in = 0; +static unsigned api2_frames_out = 0; + +static void api2_example(void) +{ + struct priv *vf = init(); + if (!vf) + return; + + // Set up a bunch of dummy images + for (int i = 0; i < POOLSIZE; i++) { + uint8_t *data; + images[i] = example_image; + if (api2_alloc(vf, BUFSIZE, &buffers[i])) { + data = buffers[i].data; + images[i].associated_buf = &buffers[i]; + } else { + // Fall back in case mapped buffers are unsupported + fprintf(stderr, "warning: falling back to malloc, may be slow\n"); + data = malloc(BUFSIZE); + } + // Fill with some "data" (like in API #1) + for (size_t n = 0; n < BUFSIZE; n++) + data[i] = n; + images[i].planes[0].data = data + OFFSET0; + images[i].planes[1].data = data + OFFSET1; + } + + const pl_clock_t start = pl_clock_now(); + + // Just keep driving the event loop regardless of the return status + // until we reach the critical number of frames. (Good enough for this PoC) + while (api2_frames_out < FRAMES) { + enum api2_status ret = api2_process(vf); + if (ret < 0) { + fprintf(stderr, "api2: Failed processing... aborting\n"); + break; + } + + // Sleep a short time (100us) to prevent busy waiting the CPU + pl_thread_sleep(1e-4); + check_timers(vf); + } + + const pl_clock_t stop = pl_clock_now(); + const float secs = pl_clock_diff(stop, start); + printf("api2: %4u frames in %1.6f s => %2.6f ms/frame (%5.2f fps)\n", + api2_frames_out, secs, 1000 * secs / api2_frames_out, + api2_frames_out / secs); + + if (vf->render_count) { + printf(" render: %f ms, upload: %f ms, download: %f ms\n", + 1e-6 * vf->render_sum / vf->render_count, + vf->upload_count ? (1e-6 * vf->upload_sum / vf->upload_count) : 0.0, + vf->download_count ? 
(1e-6 * vf->download_sum / vf->download_count) : 0.0); + } + + for (int i = 0; i < POOLSIZE; i++) { + if (images[i].associated_buf) { + api2_free(vf, images[i].associated_buf); + } else { + // This is what we originally malloc'd + free(images[i].planes[0].data); + } + } + + uninit(vf); +} + +struct image *get_image(void) +{ + if (api2_frames_in == FRAMES) + return NULL; // simulate EOF, to avoid queueing up "extra" work + + // if we can find a free (unlocked) image, give it that + for (int i = 0; i < POOLSIZE; i++) { + if (refcount[i] == 0) { + api2_frames_in++; + return &images[i]; + } + } + + return NULL; // no free image available +} + +void put_image(struct image *img) +{ + (void)img; + api2_frames_out++; +} + +void image_lock(struct image *img) +{ + int index = img - images; // cheat, for lack of having actual image management + refcount[index]++; +} + +void image_unlock(struct image *img) +{ + int index = img - images; + refcount[index]--; +} + +int main(void) +{ + printf("Running benchmarks...\n"); + api1_example(); + api2_example(); + return 0; +} diff --git a/demos/window.c b/demos/window.c new file mode 100644 index 0000000..cccffa3 --- /dev/null +++ b/demos/window.c @@ -0,0 +1,123 @@ +// License: CC0 / Public Domain + +#include <string.h> + +#include "common.h" +#include "window.h" + +#ifdef _WIN32 +#include <windows.h> +#include <timeapi.h> +#endif + +extern const struct window_impl win_impl_glfw_vk; +extern const struct window_impl win_impl_glfw_gl; +extern const struct window_impl win_impl_glfw_d3d11; +extern const struct window_impl win_impl_sdl_vk; +extern const struct window_impl win_impl_sdl_gl; + +static const struct window_impl *win_impls[] = { +#ifdef HAVE_GLFW_VULKAN + &win_impl_glfw_vk, +#endif +#ifdef HAVE_GLFW_OPENGL + &win_impl_glfw_gl, +#endif +#ifdef HAVE_GLFW_D3D11 + &win_impl_glfw_d3d11, +#endif +#ifdef HAVE_SDL_VULKAN + &win_impl_sdl_vk, +#endif +#ifdef HAVE_SDL_OPENGL + &win_impl_sdl_gl, +#endif + NULL +}; + +struct window *window_create(pl_log log, const struct window_params *params) +{ + for (const struct window_impl **impl = win_impls; *impl; impl++) { + if (params->forced_impl && strcmp((*impl)->tag, params->forced_impl) != 0) + continue; + + printf("Attempting to initialize API: %s\n", (*impl)->name); + struct window *win = (*impl)->create(log, params); + if (win) { +#ifdef _WIN32 + if (timeBeginPeriod(1) != TIMERR_NOERROR) + fprintf(stderr, "timeBeginPeriod failed!\n"); +#endif + return win; + } + } + + if (params->forced_impl) + fprintf(stderr, "'%s' windowing system not compiled or supported!\n", params->forced_impl); + else + fprintf(stderr, "No windowing system / graphical API compiled or supported!\n"); + + exit(1); +} + +void window_destroy(struct window **win) +{ + if (!*win) + return; + + (*win)->impl->destroy(win); + +#ifdef _WIN32 + timeEndPeriod(1); +#endif +} + +void window_poll(struct window *win, bool block) +{ + return win->impl->poll(win, block); +} + +void window_get_cursor(const struct window *win, int *x, int *y) +{ + return win->impl->get_cursor(win, x, y); +} + +void window_get_scroll(const struct window *win, float *dx, float *dy) +{ + return win->impl->get_scroll(win, dx, dy); +} + +bool window_get_button(const struct window *win, enum button btn) +{ + return win->impl->get_button(win, btn); +} + +bool window_get_key(const struct window *win, enum key key) +{ + return win->impl->get_key(win, key); +} + +char *window_get_file(const struct window *win) +{ + return win->impl->get_file(win); +} + +bool window_toggle_fullscreen(const 
struct window *win, bool fullscreen) +{ + return win->impl->toggle_fullscreen(win, fullscreen); +} + +bool window_is_fullscreen(const struct window *win) +{ + return win->impl->is_fullscreen(win); +} + +const char *window_get_clipboard(const struct window *win) +{ + return win->impl->get_clipboard(win); +} + +void window_set_clipboard(const struct window *win, const char *text) +{ + win->impl->set_clipboard(win, text); +} diff --git a/demos/window.h b/demos/window.h new file mode 100644 index 0000000..8382860 --- /dev/null +++ b/demos/window.h @@ -0,0 +1,67 @@ +// License: CC0 / Public Domain +#pragma once + +#include <libplacebo/swapchain.h> + +struct window { + const struct window_impl *impl; + pl_swapchain swapchain; + pl_gpu gpu; + bool window_lost; +}; + +struct window_params { + const char *title; + int width; + int height; + const char *forced_impl; + + // initial color space + struct pl_swapchain_colors colors; + bool alpha; +}; + +struct window *window_create(pl_log log, const struct window_params *params); +void window_destroy(struct window **win); + +// Poll/wait for window events +void window_poll(struct window *win, bool block); + +// Input handling +enum button { + BTN_LEFT, + BTN_RIGHT, + BTN_MIDDLE, +}; + +enum key { + KEY_ESC, +}; + +void window_get_cursor(const struct window *win, int *x, int *y); +void window_get_scroll(const struct window *win, float *dx, float *dy); +bool window_get_button(const struct window *win, enum button); +bool window_get_key(const struct window *win, enum key); +char *window_get_file(const struct window *win); +bool window_toggle_fullscreen(const struct window *win, bool fullscreen); +bool window_is_fullscreen(const struct window *win); +const char *window_get_clipboard(const struct window *win); +void window_set_clipboard(const struct window *win, const char *text); + +// For implementations +struct window_impl { + const char *name; + const char *tag; + __typeof__(window_create) *create; + __typeof__(window_destroy) *destroy; + __typeof__(window_poll) *poll; + __typeof__(window_get_cursor) *get_cursor; + __typeof__(window_get_scroll) *get_scroll; + __typeof__(window_get_button) *get_button; + __typeof__(window_get_key) *get_key; + __typeof__(window_get_file) *get_file; + __typeof__(window_toggle_fullscreen) *toggle_fullscreen; + __typeof__(window_is_fullscreen) *is_fullscreen; + __typeof__(window_get_clipboard) *get_clipboard; + __typeof__(window_set_clipboard) *set_clipboard; +}; diff --git a/demos/window_glfw.c b/demos/window_glfw.c new file mode 100644 index 0000000..6100278 --- /dev/null +++ b/demos/window_glfw.c @@ -0,0 +1,536 @@ +// License: CC0 / Public Domain + +#if defined(USE_GL) + defined(USE_VK) + defined(USE_D3D11) != 1 +#error Specify exactly one of -DUSE_GL, -DUSE_VK or -DUSE_D3D11 when compiling! 
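+// (In this #if, each defined(...) evaluates to 1 or 0, so the sum counts how
+// many backends were requested on the compiler command line; the build only
+// proceeds when exactly one of them is selected.)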
+#endif + +#include <string.h> +#include <math.h> + +#include "common.h" +#include "window.h" + +#ifdef USE_VK +#define VK_NO_PROTOTYPES +#include <libplacebo/vulkan.h> +#define GLFW_INCLUDE_VULKAN +#define IMPL win_impl_glfw_vk +#define IMPL_NAME "GLFW (vulkan)" +#define IMPL_TAG "glfw-vk" +#endif + +#ifdef USE_GL +#include <libplacebo/opengl.h> +#define IMPL win_impl_glfw_gl +#define IMPL_NAME "GLFW (opengl)" +#define IMPL_TAG "glfw-gl" +#endif + +#ifdef USE_D3D11 +#include <libplacebo/d3d11.h> +#define IMPL win_impl_glfw_d3d11 +#define IMPL_NAME "GLFW (D3D11)" +#define IMPL_TAG "glfw-d3d11" +#endif + +#include <GLFW/glfw3.h> + +#if defined(USE_GL) && defined(HAVE_EGL) +#define GLFW_EXPOSE_NATIVE_EGL +#include <GLFW/glfw3native.h> +#endif + +#ifdef USE_D3D11 +#define GLFW_EXPOSE_NATIVE_WIN32 +#include <GLFW/glfw3native.h> +#endif + +#ifdef _WIN32 +#define strdup _strdup +#endif + +#ifdef NDEBUG +#define DEBUG false +#else +#define DEBUG true +#endif + +#define PL_ARRAY_SIZE(s) (sizeof(s) / sizeof((s)[0])) + +const struct window_impl IMPL; + +struct window_pos { + int x; + int y; + int w; + int h; +}; + +struct priv { + struct window w; + GLFWwindow *win; + +#ifdef USE_VK + VkSurfaceKHR surf; + pl_vulkan vk; + pl_vk_inst vk_inst; +#endif + +#ifdef USE_GL + pl_opengl gl; +#endif + +#ifdef USE_D3D11 + pl_d3d11 d3d11; +#endif + + float scroll_dx, scroll_dy; + char **files; + size_t files_num; + size_t files_size; + bool file_seen; + + struct window_pos windowed_pos; +}; + +static void err_cb(int code, const char *desc) +{ + fprintf(stderr, "GLFW err %d: %s\n", code, desc); +} + +static void close_cb(GLFWwindow *win) +{ + struct priv *p = glfwGetWindowUserPointer(win); + p->w.window_lost = true; +} + +static void resize_cb(GLFWwindow *win, int width, int height) +{ + struct priv *p = glfwGetWindowUserPointer(win); + if (!pl_swapchain_resize(p->w.swapchain, &width, &height)) { + fprintf(stderr, "libplacebo: Failed resizing swapchain? Exiting...\n"); + p->w.window_lost = true; + } +} + +static void scroll_cb(GLFWwindow *win, double dx, double dy) +{ + struct priv *p = glfwGetWindowUserPointer(win); + p->scroll_dx += dx; + p->scroll_dy += dy; +} + +static void drop_cb(GLFWwindow *win, int num, const char *files[]) +{ + struct priv *p = glfwGetWindowUserPointer(win); + + for (int i = 0; i < num; i++) { + if (p->files_num == p->files_size) { + size_t new_size = p->files_size ? p->files_size * 2 : 16; + char **new_files = realloc(p->files, new_size * sizeof(char *)); + if (!new_files) + return; + p->files = new_files; + p->files_size = new_size; + } + + char *file = strdup(files[i]); + if (!file) + return; + + p->files[p->files_num++] = file; + } +} + +#ifdef USE_GL +static bool make_current(void *priv) +{ + GLFWwindow *win = priv; + glfwMakeContextCurrent(win); + return true; +} + +static void release_current(void *priv) +{ + glfwMakeContextCurrent(NULL); +} +#endif + +#ifdef USE_VK +static VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL get_vk_proc_addr(VkInstance instance, const char* pName) +{ + return (PFN_vkVoidFunction) glfwGetInstanceProcAddress(instance, pName); +} +#endif + +static struct window *glfw_create(pl_log log, const struct window_params *params) +{ + struct priv *p = calloc(1, sizeof(struct priv)); + if (!p) + return NULL; + + p->w.impl = &IMPL; + if (!glfwInit()) { + fprintf(stderr, "GLFW: Failed initializing?\n"); + goto error; + } + + glfwSetErrorCallback(&err_cb); + +#ifdef USE_VK + if (!glfwVulkanSupported()) { + fprintf(stderr, "GLFW: No vulkan support! 
Perhaps recompile with -DUSE_GL\n"); + goto error; + } + + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); +#endif // USE_VK + +#ifdef USE_D3D11 + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); +#endif // USE_D3D11 + +#ifdef USE_GL + struct { + int api; + int major, minor; + int glsl_ver; + int profile; + } gl_vers[] = { + { GLFW_OPENGL_API, 4, 6, 460, GLFW_OPENGL_CORE_PROFILE }, + { GLFW_OPENGL_API, 4, 5, 450, GLFW_OPENGL_CORE_PROFILE }, + { GLFW_OPENGL_API, 4, 4, 440, GLFW_OPENGL_CORE_PROFILE }, + { GLFW_OPENGL_API, 4, 0, 400, GLFW_OPENGL_CORE_PROFILE }, + { GLFW_OPENGL_API, 3, 3, 330, GLFW_OPENGL_CORE_PROFILE }, + { GLFW_OPENGL_API, 3, 2, 150, GLFW_OPENGL_CORE_PROFILE }, + { GLFW_OPENGL_ES_API, 3, 2, 320, }, + { GLFW_OPENGL_API, 3, 1, 140, }, + { GLFW_OPENGL_ES_API, 3, 1, 310, }, + { GLFW_OPENGL_API, 3, 0, 130, }, + { GLFW_OPENGL_ES_API, 3, 0, 300, }, + { GLFW_OPENGL_ES_API, 2, 0, 100, }, + { GLFW_OPENGL_API, 2, 1, 120, }, + { GLFW_OPENGL_API, 2, 0, 110, }, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(gl_vers); i++) { + glfwWindowHint(GLFW_CLIENT_API, gl_vers[i].api); +#ifdef HAVE_EGL + glfwWindowHint(GLFW_CONTEXT_CREATION_API, GLFW_EGL_CONTEXT_API); +#endif + glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, gl_vers[i].major); + glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, gl_vers[i].minor); + glfwWindowHint(GLFW_OPENGL_PROFILE, gl_vers[i].profile); +#ifdef __APPLE__ + if (gl_vers[i].profile == GLFW_OPENGL_CORE_PROFILE) + glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE); +#endif + +#endif // USE_GL + + if (params->alpha) + glfwWindowHint(GLFW_TRANSPARENT_FRAMEBUFFER, GLFW_TRUE); + + printf("Creating %dx%d window%s...\n", params->width, params->height, + params->alpha ? " (with alpha)" : ""); + + p->win = glfwCreateWindow(params->width, params->height, params->title, NULL, NULL); + +#ifdef USE_GL + if (p->win) + break; + } +#endif // USE_GL + + if (!p->win) { + fprintf(stderr, "GLFW: Failed creating window\n"); + goto error; + } + + // Set up GLFW event callbacks + glfwSetWindowUserPointer(p->win, p); + glfwSetFramebufferSizeCallback(p->win, resize_cb); + glfwSetWindowCloseCallback(p->win, close_cb); + glfwSetScrollCallback(p->win, scroll_cb); + glfwSetDropCallback(p->win, drop_cb); + +#ifdef USE_VK + VkResult err; + + uint32_t num; + p->vk_inst = pl_vk_inst_create(log, pl_vk_inst_params( + .get_proc_addr = get_vk_proc_addr, + .debug = DEBUG, + .extensions = glfwGetRequiredInstanceExtensions(&num), + .num_extensions = num, + )); + + if (!p->vk_inst) { + fprintf(stderr, "libplacebo: Failed creating vulkan instance\n"); + goto error; + } + + err = glfwCreateWindowSurface(p->vk_inst->instance, p->win, NULL, &p->surf); + if (err != VK_SUCCESS) { + fprintf(stderr, "GLFW: Failed creating vulkan surface\n"); + goto error; + } + + p->vk = pl_vulkan_create(log, pl_vulkan_params( + .instance = p->vk_inst->instance, + .get_proc_addr = p->vk_inst->get_proc_addr, + .surface = p->surf, + .allow_software = true, + )); + if (!p->vk) { + fprintf(stderr, "libplacebo: Failed creating vulkan device\n"); + goto error; + } + + p->w.swapchain = pl_vulkan_create_swapchain(p->vk, pl_vulkan_swapchain_params( + .surface = p->surf, + .present_mode = VK_PRESENT_MODE_FIFO_KHR, + )); + + if (!p->w.swapchain) { + fprintf(stderr, "libplacebo: Failed creating vulkan swapchain\n"); + goto error; + } + + p->w.gpu = p->vk->gpu; +#endif // USE_VK + +#ifdef USE_GL + p->gl = pl_opengl_create(log, pl_opengl_params( + .allow_software = true, + .debug = DEBUG, +#ifdef HAVE_EGL + .egl_display = glfwGetEGLDisplay(), + .egl_context = 
glfwGetEGLContext(p->win), +#endif + .make_current = make_current, + .release_current = release_current, + .get_proc_addr = glfwGetProcAddress, + .priv = p->win, + )); + if (!p->gl) { + fprintf(stderr, "libplacebo: Failed creating opengl device\n"); + goto error; + } + + p->w.swapchain = pl_opengl_create_swapchain(p->gl, pl_opengl_swapchain_params( + .swap_buffers = (void (*)(void *)) glfwSwapBuffers, + .priv = p->win, + )); + + if (!p->w.swapchain) { + fprintf(stderr, "libplacebo: Failed creating opengl swapchain\n"); + goto error; + } + + p->w.gpu = p->gl->gpu; +#endif // USE_GL + +#ifdef USE_D3D11 + p->d3d11 = pl_d3d11_create(log, pl_d3d11_params( .debug = DEBUG )); + if (!p->d3d11) { + fprintf(stderr, "libplacebo: Failed creating D3D11 device\n"); + goto error; + } + + p->w.swapchain = pl_d3d11_create_swapchain(p->d3d11, pl_d3d11_swapchain_params( + .window = glfwGetWin32Window(p->win), + )); + if (!p->w.swapchain) { + fprintf(stderr, "libplacebo: Failed creating D3D11 swapchain\n"); + goto error; + } + + p->w.gpu = p->d3d11->gpu; +#endif // USE_D3D11 + + glfwGetWindowSize(p->win, &p->windowed_pos.w, &p->windowed_pos.h); + glfwGetWindowPos(p->win, &p->windowed_pos.x, &p->windowed_pos.y); + + int w, h; + glfwGetFramebufferSize(p->win, &w, &h); + pl_swapchain_colorspace_hint(p->w.swapchain, ¶ms->colors); + if (!pl_swapchain_resize(p->w.swapchain, &w, &h)) { + fprintf(stderr, "libplacebo: Failed initializing swapchain\n"); + goto error; + } + + return &p->w; + +error: + window_destroy((struct window **) &p); + return NULL; +} + +static void glfw_destroy(struct window **window) +{ + struct priv *p = (struct priv *) *window; + if (!p) + return; + + pl_swapchain_destroy(&p->w.swapchain); + +#ifdef USE_VK + pl_vulkan_destroy(&p->vk); + if (p->surf) { + PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR = (PFN_vkDestroySurfaceKHR) + p->vk_inst->get_proc_addr(p->vk_inst->instance, "vkDestroySurfaceKHR"); + vkDestroySurfaceKHR(p->vk_inst->instance, p->surf, NULL); + } + pl_vk_inst_destroy(&p->vk_inst); +#endif + +#ifdef USE_GL + pl_opengl_destroy(&p->gl); +#endif + +#ifdef USE_D3D11 + pl_d3d11_destroy(&p->d3d11); +#endif + + for (int i = 0; i < p->files_num; i++) + free(p->files[i]); + free(p->files); + + glfwTerminate(); + free(p); + *window = NULL; +} + +static void glfw_poll(struct window *window, bool block) +{ + if (block) { + glfwWaitEvents(); + } else { + glfwPollEvents(); + } +} + +static void glfw_get_cursor(const struct window *window, int *x, int *y) +{ + struct priv *p = (struct priv *) window; + double dx, dy; + int fw, fh, ww, wh; + glfwGetCursorPos(p->win, &dx, &dy); + glfwGetFramebufferSize(p->win, &fw, &fh); + glfwGetWindowSize(p->win, &ww, &wh); + *x = floor(dx * fw / ww); + *y = floor(dy * fh / wh); +} + +static bool glfw_get_button(const struct window *window, enum button btn) +{ + static const int button_map[] = { + [BTN_LEFT] = GLFW_MOUSE_BUTTON_LEFT, + [BTN_RIGHT] = GLFW_MOUSE_BUTTON_RIGHT, + [BTN_MIDDLE] = GLFW_MOUSE_BUTTON_MIDDLE, + }; + + struct priv *p = (struct priv *) window; + return glfwGetMouseButton(p->win, button_map[btn]) == GLFW_PRESS; +} + +static bool glfw_get_key(const struct window *window, enum key key) +{ + static const int key_map[] = { + [KEY_ESC] = GLFW_KEY_ESCAPE, + }; + + struct priv *p = (struct priv *) window; + return glfwGetKey(p->win, key_map[key]) == GLFW_PRESS; +} + +static void glfw_get_scroll(const struct window *window, float *dx, float *dy) +{ + struct priv *p = (struct priv *) window; + *dx = p->scroll_dx; + *dy = p->scroll_dy; + p->scroll_dx = 
p->scroll_dy = 0.0; +} + +static char *glfw_get_file(const struct window *window) +{ + struct priv *p = (struct priv *) window; + if (p->file_seen) { + assert(p->files_num); + free(p->files[0]); + memmove(&p->files[0], &p->files[1], --p->files_num * sizeof(char *)); + p->file_seen = false; + } + + if (!p->files_num) + return NULL; + + p->file_seen = true; + return p->files[0]; +} + +static bool glfw_is_fullscreen(const struct window *window) { + const struct priv *p = (const struct priv *) window; + return glfwGetWindowMonitor(p->win); +} + +static bool glfw_toggle_fullscreen(const struct window *window, bool fullscreen) +{ + struct priv *p = (struct priv *) window; + bool window_fullscreen = glfw_is_fullscreen(window); + + if (window_fullscreen == fullscreen) + return true; + + if (window_fullscreen) { + glfwSetWindowMonitor(p->win, NULL, p->windowed_pos.x, p->windowed_pos.y, + p->windowed_pos.w, p->windowed_pos.h, GLFW_DONT_CARE); + return true; + } + + // For simplicity sake use primary monitor + GLFWmonitor *monitor = glfwGetPrimaryMonitor(); + if (!monitor) + return false; + + const GLFWvidmode *mode = glfwGetVideoMode(monitor); + if (!mode) + return false; + + glfwGetWindowPos(p->win, &p->windowed_pos.x, &p->windowed_pos.y); + glfwGetWindowSize(p->win, &p->windowed_pos.w, &p->windowed_pos.h); + glfwSetWindowMonitor(p->win, monitor, 0, 0, mode->width, mode->height, + mode->refreshRate); + + return true; +} + +static const char *glfw_get_clipboard(const struct window *window) +{ + struct priv *p = (struct priv *) window; + return glfwGetClipboardString(p->win); +} + +static void glfw_set_clipboard(const struct window *window, const char *text) +{ + struct priv *p = (struct priv *) window; + glfwSetClipboardString(p->win, text); +} + +const struct window_impl IMPL = { + .name = IMPL_NAME, + .tag = IMPL_TAG, + .create = glfw_create, + .destroy = glfw_destroy, + .poll = glfw_poll, + .get_cursor = glfw_get_cursor, + .get_button = glfw_get_button, + .get_key = glfw_get_key, + .get_scroll = glfw_get_scroll, + .get_file = glfw_get_file, + .toggle_fullscreen = glfw_toggle_fullscreen, + .is_fullscreen = glfw_is_fullscreen, + .get_clipboard = glfw_get_clipboard, + .set_clipboard = glfw_set_clipboard, +}; diff --git a/demos/window_sdl.c b/demos/window_sdl.c new file mode 100644 index 0000000..1fd22ce --- /dev/null +++ b/demos/window_sdl.c @@ -0,0 +1,404 @@ +// License: CC0 / Public Domain + +#if !defined(USE_GL) && !defined(USE_VK) || defined(USE_GL) && defined(USE_VK) +#error Specify exactly one of -DUSE_GL or -DUSE_VK when compiling! 
+#endif + +#include <SDL.h> + +#include "common.h" +#include "window.h" + +#ifdef USE_VK +#define VK_NO_PROTOTYPES +#include <libplacebo/vulkan.h> +#include <SDL_vulkan.h> +#define WINFLAG_API SDL_WINDOW_VULKAN +#define IMPL win_impl_sdl_vk +#define IMPL_NAME "SDL2 (vulkan)" +#define IMPL_TAG "sdl2-vk" +#endif + +#ifdef USE_GL +#include <libplacebo/opengl.h> +#define WINFLAG_API SDL_WINDOW_OPENGL +#define IMPL win_impl_sdl_gl +#define IMPL_NAME "SDL2 (opengl)" +#define IMPL_TAG "sdl2-gl" +#endif + +#ifdef NDEBUG +#define DEBUG false +#else +#define DEBUG true +#endif + +const struct window_impl IMPL; + +struct priv { + struct window w; + SDL_Window *win; + +#ifdef USE_VK + VkSurfaceKHR surf; + pl_vulkan vk; + pl_vk_inst vk_inst; +#endif + +#ifdef USE_GL + SDL_GLContext gl_ctx; + pl_opengl gl; +#endif + + int scroll_dx, scroll_dy; + char **files; + size_t files_num; + size_t files_size; + bool file_seen; + char *clip_text; +}; + +#ifdef USE_GL +static bool make_current(void *priv) +{ + struct priv *p = priv; + return SDL_GL_MakeCurrent(p->win, p->gl_ctx) == 0; +} + +static void release_current(void *priv) +{ + struct priv *p = priv; + SDL_GL_MakeCurrent(p->win, NULL); +} +#endif + +static struct window *sdl_create(pl_log log, const struct window_params *params) +{ + struct priv *p = calloc(1, sizeof(struct priv)); + if (!p) + return NULL; + + p->w.impl = &IMPL; + if (SDL_Init(SDL_INIT_VIDEO) < 0) { + fprintf(stderr, "SDL2: Failed initializing: %s\n", SDL_GetError()); + goto error; + } + + uint32_t sdl_flags = SDL_WINDOW_SHOWN | SDL_WINDOW_RESIZABLE | WINFLAG_API; + p->win = SDL_CreateWindow(params->title, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, + params->width, params->height, sdl_flags); + if (!p->win) { + fprintf(stderr, "SDL2: Failed creating window: %s\n", SDL_GetError()); + goto error; + } + + int w, h; + +#ifdef USE_VK + + unsigned int num = 0; + if (!SDL_Vulkan_GetInstanceExtensions(p->win, &num, NULL)) { + fprintf(stderr, "SDL2: Failed enumerating vulkan extensions: %s\n", SDL_GetError()); + goto error; + } + + const char **exts = malloc(num * sizeof(const char *)); + SDL_Vulkan_GetInstanceExtensions(p->win, &num, exts); + + p->vk_inst = pl_vk_inst_create(log, pl_vk_inst_params( + .get_proc_addr = SDL_Vulkan_GetVkGetInstanceProcAddr(), + .debug = DEBUG, + .extensions = exts, + .num_extensions = num, + )); + free(exts); + if (!p->vk_inst) { + fprintf(stderr, "libplacebo: Failed creating vulkan instance!\n"); + goto error; + } + + if (!SDL_Vulkan_CreateSurface(p->win, p->vk_inst->instance, &p->surf)) { + fprintf(stderr, "SDL2: Failed creating surface: %s\n", SDL_GetError()); + goto error; + } + + p->vk = pl_vulkan_create(log, pl_vulkan_params( + .instance = p->vk_inst->instance, + .get_proc_addr = p->vk_inst->get_proc_addr, + .surface = p->surf, + .allow_software = true, + )); + if (!p->vk) { + fprintf(stderr, "libplacebo: Failed creating vulkan device\n"); + goto error; + } + + p->w.swapchain = pl_vulkan_create_swapchain(p->vk, pl_vulkan_swapchain_params( + .surface = p->surf, + .present_mode = VK_PRESENT_MODE_FIFO_KHR, + )); + + if (!p->w.swapchain) { + fprintf(stderr, "libplacebo: Failed creating vulkan swapchain\n"); + goto error; + } + + p->w.gpu = p->vk->gpu; + + SDL_Vulkan_GetDrawableSize(p->win, &w, &h); +#endif // USE_VK + +#ifdef USE_GL + p->gl_ctx = SDL_GL_CreateContext(p->win); + if (!p->gl_ctx) { + fprintf(stderr, "SDL2: Failed creating GL context: %s\n", SDL_GetError()); + goto error; + } + + p->gl = pl_opengl_create(log, pl_opengl_params( + .allow_software = 
true, + .debug = DEBUG, + .make_current = make_current, + .release_current = release_current, + .get_proc_addr = (void *) SDL_GL_GetProcAddress, + .priv = p, + )); + if (!p->gl) { + fprintf(stderr, "libplacebo: Failed creating opengl device\n"); + goto error; + } + + p->w.swapchain = pl_opengl_create_swapchain(p->gl, pl_opengl_swapchain_params( + .swap_buffers = (void (*)(void *)) SDL_GL_SwapWindow, + .priv = p->win, + )); + + if (!p->w.swapchain) { + fprintf(stderr, "libplacebo: Failed creating opengl swapchain\n"); + goto error; + } + + p->w.gpu = p->gl->gpu; + + SDL_GL_GetDrawableSize(p->win, &w, &h); +#endif // USE_GL + + pl_swapchain_colorspace_hint(p->w.swapchain, ¶ms->colors); + if (!pl_swapchain_resize(p->w.swapchain, &w, &h)) { + fprintf(stderr, "libplacebo: Failed initializing swapchain\n"); + goto error; + } + + return &p->w; + +error: + window_destroy((struct window **) &p); + return NULL; +} + +static void sdl_destroy(struct window **window) +{ + struct priv *p = (struct priv *) *window; + if (!p) + return; + + pl_swapchain_destroy(&p->w.swapchain); + +#ifdef USE_VK + pl_vulkan_destroy(&p->vk); + if (p->surf) { + PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR = (PFN_vkDestroySurfaceKHR) + p->vk_inst->get_proc_addr(p->vk_inst->instance, "vkDestroySurfaceKHR"); + vkDestroySurfaceKHR(p->vk_inst->instance, p->surf, NULL); + } + pl_vk_inst_destroy(&p->vk_inst); +#endif + +#ifdef USE_GL + pl_opengl_destroy(&p->gl); + SDL_GL_DeleteContext(p->gl_ctx); +#endif + + for (int i = 0; i < p->files_num; i++) + SDL_free(p->files[i]); + free(p->files); + + SDL_free(p->clip_text); + SDL_DestroyWindow(p->win); + SDL_Quit(); + free(p); + *window = NULL; +} + +static inline void handle_event(struct priv *p, SDL_Event *event) +{ + switch (event->type) { + case SDL_QUIT: + p->w.window_lost = true; + return; + + case SDL_WINDOWEVENT: + if (event->window.windowID != SDL_GetWindowID(p->win)) + return; + + if (event->window.event == SDL_WINDOWEVENT_SIZE_CHANGED) { + int width = event->window.data1, height = event->window.data2; + if (!pl_swapchain_resize(p->w.swapchain, &width, &height)) { + fprintf(stderr, "libplacebo: Failed resizing swapchain? Exiting...\n"); + p->w.window_lost = true; + } + } + return; + + case SDL_MOUSEWHEEL: + p->scroll_dx += event->wheel.x; + p->scroll_dy += event->wheel.y; + return; + + case SDL_DROPFILE: + if (p->files_num == p->files_size) { + size_t new_size = p->files_size ? p->files_size * 2 : 16; + char **new_files = realloc(p->files, new_size * sizeof(char *)); + if (!new_files) + return; + p->files = new_files; + p->files_size = new_size; + } + + p->files[p->files_num++] = event->drop.file; + return; + } +} + +static void sdl_poll(struct window *window, bool block) +{ + struct priv *p = (struct priv *) window; + SDL_Event event; + int ret; + + do { + ret = block ? 
SDL_WaitEvent(&event) : SDL_PollEvent(&event); + if (ret) + handle_event(p, &event); + + // Only block on the first iteration + block = false; + } while (ret); +} + +static void sdl_get_cursor(const struct window *window, int *x, int *y) +{ + SDL_GetMouseState(x, y); +} + +static bool sdl_get_button(const struct window *window, enum button btn) +{ + static const uint32_t button_mask[] = { + [BTN_LEFT] = SDL_BUTTON_LMASK, + [BTN_RIGHT] = SDL_BUTTON_RMASK, + [BTN_MIDDLE] = SDL_BUTTON_MMASK, + }; + + return SDL_GetMouseState(NULL, NULL) & button_mask[btn]; +} + +static bool sdl_get_key(const struct window *window, enum key key) +{ + static const size_t key_map[] = { + [KEY_ESC] = SDL_SCANCODE_ESCAPE, + }; + + return SDL_GetKeyboardState(NULL)[key_map[key]]; +} + +static void sdl_get_scroll(const struct window *window, float *dx, float *dy) +{ + struct priv *p = (struct priv *) window; + *dx = p->scroll_dx; + *dy = p->scroll_dy; + p->scroll_dx = p->scroll_dy = 0; +} + +static char *sdl_get_file(const struct window *window) +{ + struct priv *p = (struct priv *) window; + if (p->file_seen) { + assert(p->files_num); + SDL_free(p->files[0]); + memmove(&p->files[0], &p->files[1], --p->files_num * sizeof(char *)); + p->file_seen = false; + } + + if (!p->files_num) + return NULL; + + p->file_seen = true; + return p->files[0]; +} + +static bool sdl_is_fullscreen(const struct window *window) +{ + const struct priv *p = (const struct priv *) window; + return SDL_GetWindowFlags(p->win) & SDL_WINDOW_FULLSCREEN; +} + +static bool sdl_toggle_fullscreen(const struct window *window, bool fullscreen) +{ + struct priv *p = (struct priv *) window; + bool window_fullscreen = sdl_is_fullscreen(window); + + if (window_fullscreen == fullscreen) + return true; + + SDL_DisplayMode mode; + if (SDL_GetDesktopDisplayMode(0, &mode)) + { + fprintf(stderr, "SDL2: Failed to get display mode: %s\n", SDL_GetError()); + SDL_ClearError(); + return false; + } + + if (SDL_SetWindowDisplayMode(p->win, &mode)) + { + fprintf(stderr, "SDL2: Failed to set window display mode: %s\n", SDL_GetError()); + SDL_ClearError(); + return false; + } + + if (SDL_SetWindowFullscreen(p->win, fullscreen ? 
SDL_WINDOW_FULLSCREEN : 0)) { + fprintf(stderr, "SDL2: SetWindowFullscreen failed: %s\n", SDL_GetError()); + SDL_ClearError(); + return false; + } + + return true; +} + +static const char *sdl_get_clipboard(const struct window *window) +{ + struct priv *p = (struct priv *) window; + SDL_free(p->clip_text); + return p->clip_text = SDL_GetClipboardText(); +} + +static void sdl_set_clipboard(const struct window *window, const char *text) +{ + SDL_SetClipboardText(text); +} + +const struct window_impl IMPL = { + .name = IMPL_NAME, + .tag = IMPL_TAG, + .create = sdl_create, + .destroy = sdl_destroy, + .poll = sdl_poll, + .get_cursor = sdl_get_cursor, + .get_button = sdl_get_button, + .get_key = sdl_get_key, + .get_scroll = sdl_get_scroll, + .get_file = sdl_get_file, + .toggle_fullscreen = sdl_toggle_fullscreen, + .is_fullscreen = sdl_is_fullscreen, + .get_clipboard = sdl_get_clipboard, + .set_clipboard = sdl_set_clipboard, +}; diff --git a/docs/CNAME b/docs/CNAME new file mode 100644 index 0000000..3be539d --- /dev/null +++ b/docs/CNAME @@ -0,0 +1 @@ +libplacebo.org diff --git a/docs/basic-rendering.md b/docs/basic-rendering.md new file mode 100644 index 0000000..09a1f6b --- /dev/null +++ b/docs/basic-rendering.md @@ -0,0 +1,432 @@ +# Basic windowing / output example + +We will demonstrate the basics of the libplacebo GPU output API with a worked +example. The goal is to show a simple color on screen. + +## Creating a `pl_log` + +Almost all major entry-points into libplacebo require providing a log +callback (or `NULL` to disable logging). This is abstracted into the `pl_log` +object type, which we can create with +`pl_log_create`: + +``` c linenums="1" +#include <libplacebo/log.h> + +pl_log pllog; + +int main() +{ + pllog = pl_log_create(PL_API_VER, pl_log_params( + .log_cb = pl_log_color, + .log_level = PL_LOG_INFO, + )); + + // ... + + pl_log_destroy(&pllog); + return 0; +} +``` + +!!! note "Compiling" + + You can compile this example with: + + ``` bash + $ gcc example.c -o example `pkg-config --cflags --libs libplacebo` + ``` + +The parameter `PL_API_VER` has no special significance and is merely included +for historical reasons. Aside from that, this snippet introduces a number of +core concepts of the libplacebo API: + +### Parameter structs + +For extensibility, almost all libplacebo calls take a pointer to a `const +struct pl_*_params`, into which all extensible parameters go. For convenience, +libplacebo provides macros which create anonymous params structs on the stack +(and also fill in default parameters). Note that this only works for C99 and +above, users of C89 and C++ must initialize parameter structs manually. + +Under the hood, `pl_log_params(...)` just translates to `&((struct +pl_log_params) { /* default params */, ... })`. This style of API allows +libplacebo to effectively simulate optional named parameters. + +!!! note "On default parameters" + + Wherever possible, parameters are designed in such a way that `{0}` gives + you a minimal parameter structure, with default behavior and no optional + features enabled. This is done for forwards compatibility - as new + features are introduced, old struct initializers will simply opt out of + them. + +### Destructors + +All libplacebo objects must be destroyed manually using the corresponding +`pl_*_destroy` call, which takes a pointer to the variable the object is +stored in. The resulting variable is written to `NULL`. This helps prevent +use-after-free bugs. + +!!! 
note "NULL" + + As a general rule, all libplacebo destructors are safe to call on + variables containing `NULL`. So, users need not explicitly `NULL`-test + before calling destructors on variables. + +## Creating a window + +While libplacebo can work in isolation, to render images offline, for the sake +of this guide we want to provide something graphical on-screen. As such, we +need to create some sort of window. Libplacebo provides no built-in mechanism +for this, it assumes the API user will already have a windowing system +in-place. + +Complete examples (based on GLFW and SDL) can be found [in the libplacebo +demos](https://code.videolan.org/videolan/libplacebo/-/tree/master/demos). But +for now, we will focus on getting a very simple window on-screen using GLFW: + +``` c linenums="1" hl_lines="3 5 6 7 9 17 18 20 21 22 24 25 26 28 29" +// ... + +#include <GLFW/glfw3.h> + +const char * const title = "libplacebo demo"; +int width = 800; +int height = 600; + +GLFWwindow *window; + +int main() +{ + pllog = pl_log_create(PL_API_VER, pl_log_params( + .log_level = PL_LOG_INFO, + )); + + if (!glfwInit()) + return 1; + + window = glfwCreateWindow(width, height, title, NULL, NULL); + if (!window) + return 1; + + while (!glfwWindowShouldClose(window)) { + glfwWaitEvents(); + } + + glfwDestroyWindow(window); + glfwTerminate(); + pl_log_destroy(&pllog); + return 0; +} +``` + +!!! note "Compiling" + + We now also need to include the glfw3 library to compile this example. + + ``` bash + $ gcc example.c -o example `pkg-config --cflags --libs glfw3 libplacebo` + ``` + +## Creating the `pl_gpu` + +All GPU operations are abstracted into an internal `pl_gpu` object, which +serves as the primary entry-point to any sort of GPU interaction. This object +cannot be created directly, but must be obtained from some graphical API: +currently there are Vulkan, OpenGL or D3D11. A `pl_gpu` can be accessed from +an API-specific object like `pl_vulkan`, `pl_opengl` and `pl_d3d11`. + +In this guide, for simplicity, we will be using OpenGL, simply because that's +what GLFW initializes by default. + +``` c linenums="1" hl_lines="3 5-6 15-23 29 36-45" +// ... + +pl_opengl opengl; + +static bool make_current(void *priv); +static void release_current(void *priv); + +int main() +{ + // ... + window = glfwCreateWindow(width, height, title, NULL, NULL); + if (!window) + return 1; + + opengl = pl_opengl_create(pllog, pl_opengl_params( + .get_proc_addr = glfwGetProcAddress, + .allow_software = true, // allow software rasterers + .debug = true, // enable error reporting + .make_current = make_current, // (1) + .release_current = release_current, + )); + if (!opengl) + return 2; + + while (!glfwWindowShouldClose(window)) { + glfwWaitEvents(); + } + + pl_opengl_destroy(&opengl); + glfwDestroyWindow(window); + glfwTerminate(); + pl_log_destroy(&pllog); + return 0; +} + +static bool make_current(void *priv) +{ + glfwMakeContextCurrent(window); + return true; +} + +static void release_current(void *priv) +{ + glfwMakeContextCurrent(NULL); +} +``` + +1. Setting this allows the resulting `pl_gpu` to be thread-safe, which + enables asynchronous transfers to be used. The alternative is to simply + call `glfwMakeContextCurrent` once after creating the window. + + This method of making the context current is generally preferred, + however, so we've demonstrated it here for completeness' sake. 
+ +## Creating a swapchain + +All access to window-based rendering commands are abstracted into an object +known as a "swapchain" (from Vulkan terminology), including the default +backbuffers on D3D11 and OpenGL. If we want to present something to screen, +we need to first create a `pl_swapchain`. + +We can use this swapchain to perform the equivalent of `gl*SwapBuffers`: + +``` c linenums="1" hl_lines="2 4-9 17-22 24-27 30-31 34" +// ... +pl_swapchain swchain; + +static void resize_cb(GLFWwindow *win, int new_w, int new_h) +{ + width = new_w; + height = new_h; + pl_swapchain_resize(swchain, &width, &height); +} + +int main() +{ + // ... + if (!opengl) + return 2; + + swchain = pl_opengl_create_swapchain(opengl, pl_opengl_swapchain_params( + .swap_buffers = (void (*)(void *)) glfwSwapBuffers, + .priv = window, + )); + if (!swchain) + return 2; + + // (2) + if (!pl_swapchain_resize(swchain, &width, &height)) + return 2; + glfwSetFramebufferSizeCallback(window, resize_cb); + + while (!glfwWindowShouldClose(window)) { + pl_swapchain_swap_buffers(swchain); + glfwPollEvents(); // (1) + } + + pl_swapchain_destroy(&swchain); + pl_opengl_destroy(&opengl); + glfwDestroyWindow(window); + glfwTerminate(); + pl_log_destroy(&pllog); + return 0; +} +``` + +1. We change this from `glfwWaitEvents` to `glfwPollEvents` because + we now want to re-run our main loop once per vsync, rather than only when + new events arrive. The `pl_swapchain_swap_buffers` call will ensure + that this does not execute too quickly. + +2. The swapchain needs to be resized to fit the size of the window, which in + GLFW is handled by listening to a callback. In addition to setting this + callback, we also need to inform the swapchain of the initial window size. + + Note that the `pl_swapchain_resize` function handles both resize requests + and size queries - hence, the actual swapchain size is returned back to + the passed variables. + +## Getting pixels on the screen + +With a swapchain in hand, we're now equipped to start drawing pixels to the +screen: + +``` c linenums="1" hl_lines="3-8 15-20" +// ... + +static void render_frame(struct pl_swapchain_frame frame) +{ + pl_gpu gpu = opengl->gpu; + + pl_tex_clear(gpu, frame.fbo, (float[4]){ 1.0, 0.5, 0.0, 1.0 }); +} + +int main() +{ + // ... + + while (!glfwWindowShouldClose(window)) { + struct pl_swapchain_frame frame; + while (!pl_swapchain_start_frame(swchain, &frame)) + glfwWaitEvents(); // (1) + render_frame(frame); + if (!pl_swapchain_submit_frame(swchain)) + break; // (2) + + pl_swapchain_swap_buffers(swchain); + glfwPollEvents(); + } + + // ... +} +``` + +1. If `pl_swapchain_start_frame` fails, it typically means the window is + hidden, minimized or blocked. This is not a fatal condition, and as such + we simply want to process window events until we can resume rendering. + +2. If `pl_swapchain_submit_frame` fails, it typically means the window has + been lost, and further rendering commands are not expected to succeed. + As such, in this case, we simply terminate the example program. + +Our main render loop has changed into a combination of +`pl_swapchain_start_frame`, rendering, and `pl_swapchain_submit_frame`. To +start with, we simply use the `pl_tex_clear` function to blit a constant +orange color to the framebuffer. + +### Interlude: Rendering commands + +The previous code snippet represented our first foray into the `pl_gpu` API. +For more detail on this API, see the [GPU API](#TODO) section. 
But as a +general rule of thumb, all `pl_gpu`-level operations are thread safe, +asynchronous (except when returning something to the CPU), and internally +refcounted (so you can destroy all objects as soon as you no longer need the +reference). + +In the example loop, `pl_swapchain_swap_buffers` is the only operation that +actually flushes commands to the GPU. You can force an early flush with +`pl_gpu_flush()` or `pl_gpu_finish()`, but other than that, commands will +"queue" internally and complete asynchronously at some unknown point in time, +until forward progress is needed (e.g. `pl_tex_download`). + +## Conclusion + +We have demonstrated how to create a window, how to initialize the libplacebo +API, create a GPU instance based on OpenGL, and how to write a basic rendering +loop that blits a single color to the framebuffer. + +Here is a complete transcript of the example we built in this section: + +??? example "Basic rendering" + ``` c linenums="1" + #include <GLFW/glfw3.h> + + #include <libplacebo/log.h> + #include <libplacebo/opengl.h> + #include <libplacebo/gpu.h> + + const char * const title = "libplacebo demo"; + int width = 800; + int height = 600; + + GLFWwindow *window; + + pl_log pllog; + pl_opengl opengl; + pl_swapchain swchain; + + static bool make_current(void *priv); + static void release_current(void *priv); + + static void resize_cb(GLFWwindow *win, int new_w, int new_h) + { + width = new_w; + height = new_h; + pl_swapchain_resize(swchain, &width, &height); + } + + static void render_frame(struct pl_swapchain_frame frame) + { + pl_gpu gpu = opengl->gpu; + + pl_tex_clear(gpu, frame.fbo, (float[4]){ 1.0, 0.5, 0.0, 1.0 }); + } + + int main() + { + pllog = pl_log_create(PL_API_VER, pl_log_params( + .log_cb = pl_log_color, + .log_level = PL_LOG_INFO, + )); + + if (!glfwInit()) + return 1; + + window = glfwCreateWindow(width, height, title, NULL, NULL); + if (!window) + return 1; + + opengl = pl_opengl_create(pllog, pl_opengl_params( + .get_proc_addr = glfwGetProcAddress, + .allow_software = true, // allow software rasterers + .debug = true, // enable error reporting + .make_current = make_current, + .release_current = release_current, + )); + + swchain = pl_opengl_create_swapchain(opengl, pl_opengl_swapchain_params( + .swap_buffers = (void (*)(void *)) glfwSwapBuffers, + .priv = window, + )); + if (!swchain) + return 2; + + if (!pl_swapchain_resize(swchain, &width, &height)) + return 2; + glfwSetFramebufferSizeCallback(window, resize_cb); + + while (!glfwWindowShouldClose(window)) { + struct pl_swapchain_frame frame; + while (!pl_swapchain_start_frame(swchain, &frame)) + glfwWaitEvents(); + render_frame(frame); + if (!pl_swapchain_submit_frame(swchain)) + break; + + pl_swapchain_swap_buffers(swchain); + glfwPollEvents(); + } + + pl_swapchain_destroy(&swchain); + pl_opengl_destroy(&opengl); + glfwDestroyWindow(window); + glfwTerminate(); + pl_log_destroy(&pllog); + return 0; + } + + static bool make_current(void *priv) + { + glfwMakeContextCurrent(window); + return true; + } + + static void release_current(void *priv) + { + glfwMakeContextCurrent(NULL); + } + ``` diff --git a/docs/custom-shaders.md b/docs/custom-shaders.md new file mode 100644 index 0000000..c6dc107 --- /dev/null +++ b/docs/custom-shaders.md @@ -0,0 +1,729 @@ +# Custom Shaders (mpv .hook syntax) + +libplacebo supports the same [custom shader syntax used by +mpv](https://mpv.io/manual/master/#options-glsl-shader), with some important +changes. This document will serve as a complete reference for this syntax. 
+ +## Overview + +In general, user shaders are divided into distinct *blocks*. Each block can +define a shader, a texture, a buffer, or a tunable parameter. Each block +starts with a collection of header directives, which are lines starting with +the syntax `//!`. + +As an example, here is a simple shader that simply inverts the video signal: + +``` glsl linenums="1" +//!HOOK LUMA +//!HOOK RGB +//!BIND HOOKED + +vec4 hook() +{ + vec4 color = HOOKED_texOff(0); + color.rgb = vec3(1.0) - color.rgb; + return color; +} +``` + +This shader defines one block - a shader block which hooks into the two +texture stages `LUMA` and `RGB`, binds the hooked texture, inverts the value +of the `rgb` channels, and then returns the modified color. + +### Expressions + +In a few contexts, shader directives accept arithmetic expressions, denoted by +`<expr>` in the listing below. For historical reasons, all expressions are +given in [reverse polish notation +(RPN)](https://en.wikipedia.org/wiki/Reverse_Polish_notation), and the only +value type is a floating point number. The following value types and +arithmetic operations are available: + +* `1.234`: Literal float constant, evaluates to itself. +* `NAME.w`, `NAME.width`: Evaluates to the width of a texture with name `NAME`. +* `NAME.h`, `NAME.height`: Evaluates to the height of a texture with name `NAME`. +* `PAR`: Evaluates to the value of a tunable shader parameter with name `PAR`. +* `+`: Evaluates to `X+Y`. +* `-`: Evaluates to `X-Y`. +* `*`: Evaluates to `X*Y`. +* `/`: Evaluates to `X/Y`. +* `%`: Evaluates to `fmod(X, Y)`. +* `>`: Evaluates to `(X > Y) ? 1.0 : 0.0`. +* `<`: Evaluates to `(X < Y) ? 1.0 : 0.0`. +* `=`: Evaluates to `fuzzy_eq(X, Y) ? 1.0 : 0.0`, with some tolerance to + allow for floating point inaccuracy. (Around 1 ppm) +* `!`: Evaluates to `X ? 0.0 : 1.0`. + +Note that `+` and `*` can be used as suitable replacements for the otherwise +absent boolean logic expressions (`||` and `&&`). + +## Shaders + +Shaders are the default block type, and have no special syntax to indicate +their presence. Shader stages contain raw GLSL code that will be +(conditionally) executed. This GLSL snippet must define a single function +`vec4 hook()`, or `void hook()` for compute shaders. + +During the execution of any shader, the following global variables are made +available: + +* `int frame`: A raw counter tracking the number of executions of this shader + stage. +* `float random`: A pseudo-random float uniformly distributed in the range + `[0,1)`. +* `vec2 input_size`: The nominal size (in pixels) of the original input image. +* `vec2 target_size`: The nominal size (in pixels) of the output rectangle. +* `vec2 tex_offset`: The nominal offset (in pixels), of the original input crop. +* `vec4 linearize(vec4 color)`: Linearize the input color according to the + image's tagged gamma function. +* `vec4 delinearize(vec4 color)`: Opposite counterpart to `linearize`. + +Shader stages accept the following directives: + +### `HOOK <texture>` + +A `HOOK` directive determines when a shader stage is run. During internal +processing, libplacebo goes over a number of pre-defined *hook points* at set +points in the processing pipeline. It is only possible to intercept the image, +and run custom shaders, at these fixed hook points. 
+ +Here is a current list of hook points: + +* `RGB`: Input plane containing RGB values +* `LUMA`: Input plane containing a Y value +* `CHROMA`: Input plane containing chroma values (one or both) +* `ALPHA`: Input plane containing a single alpha value +* `XYZ`: Input plane containing XYZ values +* `CHROMA_SCALED`: Chroma plane, after merging and upscaling to luma size +* `ALPHA_SCALED`: Alpha plane, after upscaling to luma size +* `NATIVE`: Merged input planes, before any sort of color conversion (as-is) +* `MAIN`: After conversion to RGB, before linearization/scaling +* `LINEAR`: After conversion to linear light (for scaling purposes) +* `SIGMOID`: After conversion to sigmoidized light (for scaling purposes) +* `PREKERNEL`: Immediately before the execution of the main scaler kernel +* `POSTKERNEL`: Immediately after the execution of the main scaler kernel +* `SCALED`: After scaling, in either linear or non-linear light RGB +* `PREOUTPUT`: After color conversion to target colorspace, before alpha blending +* `OUTPUT`: After alpha blending, before dithering and final output pass + +!!! warning "`MAINPRESUB`" + In mpv, `MAIN` and `MAINPRESUB` are separate shader stages, because the + mpv option `--blend-subtitles=video` allows rendering overlays directly + onto the pre-scaled video stage. libplacebo does not support this feature, + and as such, the `MAINPRESUB` shader stage does not exist. It is still + valid to refer to this name in shaders, but it is handled identically to + `MAIN`. + +It's possible for a hook point to never fire. For example, `SIGMOID` will not +fire when downscaling, as sigmoidization only happens when upscaling. +Similarly, `LUMA`/`CHROMA` will not fire on an RGB video and vice versa. + +A single shader stage may hook multiple hook points simultaneously, for +example, to cover both `LUMA` and `RGB` cases with the same logic. (See the +example shader in the introduction) + +### `BIND <texture>` + +The `BIND` directive makes a texture available for use in the shader. This can +be any of the previously named hook points, a custom texture define by a +`TEXTURE` block, a custom texture saved by a `SAVE` directive, or the special +value `HOOKED` which allows binding whatever texture hook dispatched this +shader stage. + +A bound texture will define the following GLSL functions (as macros): + +* `sampler2D NAME_raw`: A reference to the raw texture sampler itself. +* `vec2 NAME_pos`: The texel coordinates of the current pixel. +* `vec2 NAME_map(ivec2 id)`: A function that maps from `gl_GlobalInvocationID` + to texel coordinates. (Compute shaders) +* `vec2 NAME_size`: The size (in pixels) of the texture. +* `vec2 NAME_pt`: Convenience macro for `1.0 / NAME_size`. The size of a + single pixel (in texel coordinates). +* `vec2 NAME_off`: The sample offset of the texture. Basically, the pixel + coordinates of the top-left corner of the sampled area. +* `float NAME_mul`: The coefficient that must be multiplied into sampled + values in order to rescale them to `[0,1]`. +* `vec4 NAME_tex(vec2 pos)`: A wrapper around `NAME_mul * textureLod(NAME_raw, + pos, 0.0)`. +* `vec4 NAME_texOff(vec2 offset)`: A wrapper around `NAME_tex(NAME_pos + NAME_pt * offset)`. + This can be used to easily access adjacent pixels, e.g. `NAME_texOff(-1,2)` + samples a pixel one to the left and two to the bottom of the current + location. +* `vec4 NAME_gather(vec2 pos, int c)`: A wrapper around + `NAME_mul * textureGather(pos, c)`, with appropriate scaling. (Only when + supported[^ifdef]) + +!!! 
note "Rotation matrix" + For compatibility with mpv, we also define a `mat2 NAME_rot` which is + simply equal to a 2x2 identity matrix. libplacebo never rotates input + planes - all rotation happens during the final output to the display. + +[^ifdef]: Because these are macros, their presence can be tested for using + `#ifdef` inside the GLSL preprocessor. + +This same directive can also be used to bind buffer blocks (i.e. +uniform/storage buffers), as defined by the [`BUFFER` directive](#buffer-name). + +### `SAVE <texture>` + +By default, after execution of a shader stage, the resulting output is +captured back into the same hooked texture that triggered the shader. This +behavior can be overridden using the explicit `SAVE` directive. For example, +a shader might need access to a low-res version of the luma input texture in +order to process chroma: + +``` glsl linenums="1" +//!HOOK CHROMA +//!BIND CHROMA +//!BIND LUMA +//!SAVE LUMA_LOWRES +//!WIDTH CHROMA.w +//!HEIGHT CHROMA.h + +vec4 hook() +{ + return LUMA_texOff(0); +} +``` + +This shader binds both luma and chroma and resizes the luma plane down to the +size of the chroma plane, saving the result as a new texture `LUMA_LOWRES`. In +general, you can pick any name you want, here. + +### `DESC <description>` + +This purely informative directive simply gives the shader stage a name. This +is the name that will be reported to the shader stage and execution time +metrics. + +### `OFFSET <xo yo | ALIGN>` + +This directive indicates a pixel shift (offset) introduced by this pass. These +pixel offsets will be accumulated and corrected automatically as part of plane +alignment / main scaling. + +A special value of `ALIGN` will attempt to counteract any existing offset of +the hooked texture by aligning it with reference plane (i.e. luma). This can +be used to e.g. introduce custom chroma scaling in a way that doesn't break +chroma subtexel offsets. + +An example: + +``` glsl linenums="1" +//!HOOK LUMA +//!BIND HOOKED +//!OFFSET 100.5 100.5 + +vec4 hook() +{ + // Constant offset by N pixels towards the bottom right + return HOOKED_texOff(-vec2(100.5)); +} +``` + +This (slightly silly) shader simply shifts the entire sampled region to the +bottom right by 100.5 pixels, and propagates this shift to the main scaler +using the `OFFSET` directive. As such, the end result of this is that there is +no visible shift of the overall image, but some detail (~100 pixels) near the +bottom-right border is lost due to falling outside the bounds of the texture. + +### `WIDTH <expr>`, `HEIGHT <expr>` + +These directives can be used to override the dimensions of the resulting +texture. Note that not all textures can be resized this way. Currently, only +`RGB`, `LUMA`, `CHROMA`, `XYZ`, `NATIVE` and `MAIN` are resizable. Trying to +save a texture with an incompatible size to any other shader stage will result +in an error. + +### `WHEN <expr>` + +This directive takes an expression that can be used to make shader stages +conditionally executed. If this evaluates to 0, the shader stage will be +skipped. + +Example: + +``` glsl linenums="1" +//!PARAM strength +//!TYPE float +//!MINIMUM 0 +1.0 + +//!HOOK MAIN +//!BIND HOOKED +//!WHEN intensity 0 > +//!DESC do something based on 'intensity' +... +``` + +This example defines a shader stage that only conditionally executes itself +if the value of the `intensity` shader parameter is non-zero. + +### `COMPONENTS <num>` + +This directive overrides the number of components present in a texture. 
+For example, if you want to extract a one-dimensional feature map from the +otherwise 3 or 4 dimensional `MAIN` texture, you can use this directive to +save on memory bandwidth and consumption by having libplacebo only allocate a +one-component texture to store the feature map in: + +``` glsl linenums="1" +//!HOOK MAIN +//!BIND HOOKED +//!SAVE featuremap +//!COMPONENTS 1 +``` + +### `COMPUTE <bw> <bh> [<tw> <th>]` + +This directive specifies that the shader should be treated as a compute +shader, with the block size `bw` and `bh`. The compute shader will be +dispatched with however many blocks are necessary to completely tile over the +output. Within each block, there will be `tw*th` threads, forming a single +work group. In other words: `tw` and `th` specify the work group size, which +can be different from the block size. So for example, a compute shader with +`bw = bh = 32` and `tw = th = 8` running on a `500x500` texture would dispatch +`16x16` blocks (rounded up), each with `8x8` threads. + +Instead of defining a `vec4 hook()`, compute shaders must define a `void +hook()` which results directly to the output texture, a `writeonly image2D +out_image` made available to the shader stage. + +For example, here is a shader executing a single-pass 41x41 convolution +(average blur) on the luma plane, using a compute shader to share sampling +work between adjacent threads in a work group: + +``` glsl linenums="1" +//!HOOK LUMA +//!BIND HOOKED +//!COMPUTE 32 32 +//!DESC avg convolution + +// Kernel size, 41x41 as an example +const ivec2 ksize = ivec2(41, 41); +const ivec2 offset = ksize / 2; + +// We need to load extra source texels to account for padding due to kernel +// overhang +const ivec2 isize = ivec2(gl_WorkGroupSize) + ksize - 1; + +shared float inp[isize.y][isize.x]; + +void hook() +{ + // load texels into shmem + ivec2 base = ivec2(gl_WorkGroupID) * ivec2(gl_WorkGroupSize); + for (uint y = gl_LocalInvocationID.y; y < isize.y; y += gl_WorkGroupSize.y) { + for (uint x = gl_LocalInvocationID.x; x < isize.x; x += gl_WorkGroupSize.x) + inp[y][x] = texelFetch(HOOKED_raw, base + ivec2(x,y) - offset, 0).x; + } + + // synchronize threads + barrier(); + + // do convolution + float sum; + for (uint y = 0; y < ksize.y; y++) { + for (uint x = 0; x < ksize.x; x++) + sum += inp[gl_LocalInvocationID.y+y][gl_LocalInvocationID.x+x]; + } + + vec4 color = vec4(HOOKED_mul * sum / (ksize.x * ksize.y), 0, 0, 1); + imageStore(out_image, ivec2(gl_GlobalInvocationID), color); +} +``` + +## Textures + +Custom textures can be defined and made available to shader stages using +`TEXTURE` blocks. These can be used to provide e.g. LUTs or pre-trained +weights. + +The data for a texture is provided as a raw hexadecimal string encoding the +in-memory representation of a texture, according to its given texture format, +for example: + +``` glsl linenums="1" +//!TEXTURE COLORS +//!SIZE 3 3 +//!FORMAT rgba32f +//!FILTER NEAREST +//!BORDER REPEAT +0000803f000000000000000000000000000000000000803f00000000000000000000000 +0000000000000803f00000000000000000000803f0000803f000000000000803f000000 +000000803f000000000000803f0000803f00000000000000009a99993e9a99993e9a999 +93e000000009a99193F9A99193f9a99193f000000000000803f0000803f0000803f0000 +0000 +``` + +Texture blocks accept the following directives: + +### `TEXTURE <name>` + +This must be the first directive in a texture block, and marks it as such. The +name given is the name that the texture will be referred to (via `BIND` +directives). 
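For illustration, a (hypothetical) shader stage could bind the `COLORS` texture defined above alongside the hooked texture and sample it through the usual `NAME_tex`/`NAME_pos` macros; the hook point and blend factor below are made up purely for the sake of the example:

``` glsl
//!HOOK MAIN
//!BIND HOOKED
//!BIND COLORS
//!DESC tint using the custom COLORS texture (illustrative)

vec4 hook()
{
    vec4 color = HOOKED_texOff(0);
    // NEAREST/REPEAT sampling of the 3x3 LUT defined by the TEXTURE block
    vec4 tint = COLORS_tex(HOOKED_pos);
    color.rgb = mix(color.rgb, tint.rgb, 0.1);
    return color;
}
```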
### `SIZE <width> [<height> [<depth>]]`

This directive gives the size of the texture, as integers. For example,
`//!SIZE 512 512` marks a 512x512 texture block. Textures can be 1D, 2D or 3D
depending on the number of coordinates specified.

### `FORMAT <fmt>`

This directive specifies the texture format. A complete list of known texture
formats is exposed as part of the `pl_gpu` struct metadata; they follow the
format convention `rgba8`, `rg16hf`, `rgba32f`, `r64i` and so on.

### `FILTER <LINEAR | NEAREST>`

This directive specifies the texture magnification/minification filter.

### `BORDER <CLAMP | REPEAT | MIRROR>`

This directive specifies the border clamping method of the texture.

### `STORAGE`

If present, this directive marks the texture as a storage image. It will still
be initialized with the given initial values, but rather than being bound as a
read-only and immutable `sampler2D`, it is bound as a `readwrite coherent
image2D`. Such textures can be used to, for example, store persistent state
across invocations of the shader.

## Buffers

Custom uniform / storage shader buffer blocks can be defined using `BUFFER`
directives.

The (initial) data for a buffer is provided as a raw hexadecimal string
encoding the in-memory representation of a buffer in the corresponding GLSL
packing layout (std140 or std430 for uniform and storage blocks,
respectively):

``` glsl linenums="1"
//!BUFFER buf_uniform
//!VAR float foo
//!VAR float bar
0000000000000000

//!BUFFER buf_storage
//!VAR vec2 bat
//!VAR int big[32];
//!STORAGE
```

Buffer blocks accept the following directives:

### `BUFFER <name>`

This must be the first directive in a buffer block, and marks it as such. The
name given is mostly cosmetic, as individual variables can be accessed
directly using the names given in the corresponding `VAR` directives.

### `STORAGE`

If present, this directive marks the buffer as a (readwrite coherent) shader
storage block, instead of a readonly uniform buffer block. Such storage blocks
can be used to track and evolve state across invocations of this shader.

Storage blocks may also be initialized with default data, but this is
optional. They can also be initialized as part of the first shader execution
(e.g. by testing for `frame == 0`).

### `VAR <type> <name>`

This directive appends a new variable to the shader block, with GLSL type
`<type>` and shader name `<name>`. For example, `VAR float foo` introduces a
`float foo;` member into the buffer block, and `VAR mat4 transform` introduces
a `mat4 transform;` member.

It is also possible to introduce array variables, using `[N]` as part of the
variable name.

## Tunable parameters

Finally, the `PARAM` directive allows introducing tunable shader parameters,
which are exposed programmatically as part of the C API (`pl_hook`).[^mpv]

[^mpv]: In mpv using `--vo=gpu-next`, these can be set using the
    [`--glsl-shader-opts` option](https://mpv.io/manual/master/#options-glsl-shader-opts).

The default value of a parameter is given as the block body, for example:

``` glsl linenums="1"
//!PARAM contrast
//!DESC Gain to apply to image brightness
//!TYPE float
//!MINIMUM 0.0
//!MAXIMUM 100.0
1.0
```

Parameters accept the following directives:

### `PARAM <name>`

This must be the first directive in a parameter block, and marks it as such.
The name given is the name that will be used to refer to this parameter in
GLSL code.
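As a usage sketch (not part of the original reference), a stage could consume the `contrast` parameter defined above directly as a GLSL identifier, and combine it with a `WHEN` expression so the stage is skipped whenever it would be a no-op:

``` glsl
//!HOOK MAIN
//!BIND HOOKED
//!WHEN contrast 1.0 = !
//!DESC apply the 'contrast' parameter (illustrative)

vec4 hook()
{
    vec4 color = HOOKED_texOff(0);
    // tunable parameters are available as plain GLSL identifiers
    color.rgb *= contrast;
    return color;
}
```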
+ +### `DESC <description>` + +This directive can be used to provide a friendlier description of the shader +parameter, exposed as part of the C API to end users. + +### `MINIMUM <value>`, `MAXIMUM <value>` + +Provides the minimum/maximum value bound of this parameter. If absent, no +minimum/maximum is enforced. + +### `TYPE [ENUM] <DEFINE | [DYNAMIC | CONSTANT] <type>>` + +This gives the type of the parameter, which determines what type of values it +can hold and how it will be made available to the shader. `<type>` must be +a scalar GLSL numeric type, such as `int`, `float` or `uint`. + +If a type is `ENUM`, it is treated as an enumeration type. To use this, `type` +must either be `int` or `DEFINE`. Instead of providing a single default value, +the param body should be a list of all possible enumeration values (as separate +lines). These names will be made available inside the shader body (as a +`#define`), as well as inside RPN expressions (e.g. `WHEN`). The qualifiers +`MINIMUM` and `MAXIMUM` are ignored for `ENUM` parameters, with the value +range instead being set implicitly from the list of options. + +The optional qualifiers `DYNAMIC` or `CONSTANT` mark the parameter as +dynamically changing and compile-time constant, respectively. A `DYNAMIC` +variable is assumed to change frequently, and will be grouped with other +frequently-changing input parameters. A `CONSTANT` parameter will be +introduced as a compile-time constant into the shader header, which means thy +can be used in e.g. constant expressions such as array sizes.[^spec] + +[^spec]: On supported platforms, these are implemented using specialization + constants, which can be updated at run-time without requiring a full shader + recompilation. + +Finally, the special type `TYPE DEFINE` marks a variable as a preprocessor +define, which can be used inside `#if` preprocessor expressions. For example: + +``` glsl linenums="1" +//!PARAM taps +//!DESC Smoothing taps +//!TYPE DEFINE +//!MINIMUM 0 +//!MAXIMUM 5 +2 + +//!HOOK LUMA +//!BIND HOOKED +const uint row_size = 2 * taps + 1; +const float weights[row_size] = { +#if taps == 0 + 1.0, +#endif + +#if taps == 1 + 0.10650697891920, + 0.78698604216159, + 0.10650697891920, +#endif + +#if taps == 2 + 0.05448868454964, + 0.24420134200323, + 0.40261994689424, + 0.24420134200323, + 0.05448868454964, +#endif + + // ... +}; +``` + +An example of an enum parameter: + +``` glsl linenums="1" +//!PARAM csp +//!DESC Colorspace +//!TYPE ENUM int +BT709 +BT2020 +DCIP3 + +//!HOOK MAIN +//!BIND HOOKED +const mat3 matrices[3] = { + mat3(...), // BT709 + mat3(...), // BT2020 + mat3(...), // DCIP3 +}; + +#define MAT matrices[csp] +// ... 
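// (illustrative) the enum names are also visible to RPN header expressions,
// e.g. a `//!WHEN csp BT709 = !` line would skip this stage when csp == BT709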
+``` + +## Full example + +A collection of full examples can be found in the [mpv user shaders +wiki](https://github.com/mpv-player/mpv/wiki/User-Scripts#user-shaders), but +here is an example of a parametrized Gaussian smoothed film grain compute +shader: + +``` glsl linenums="1" +//!PARAM intensity +//!DESC Film grain intensity +//!TYPE float +//!MINIMUM 0 +0.1 + +//!PARAM taps +//!DESC Film grain smoothing taps +//!TYPE DEFINE +//!MINIMUM 0 +//!MAXIMUM 5 +2 + +//!HOOK LUMA +//!BIND HOOKED +//!DESC Apply gaussian smoothed film grain +//!WHEN intensity 0 > +//!COMPUTE 32 32 + +const uint row_size = 2 * taps + 1; +const float weights[row_size] = { +#if taps == 0 + 1.0, +#endif + +#if taps == 1 + 0.10650697891920, + 0.78698604216159, + 0.10650697891920, +#endif + +#if taps == 2 + 0.05448868454964, + 0.24420134200323, + 0.40261994689424, + 0.24420134200323, + 0.05448868454964, +#endif + +#if taps == 3 + 0.03663284536919, + 0.11128075847888, + 0.21674532140370, + 0.27068214949642, + 0.21674532140370, + 0.11128075847888, + 0.03663284536919, +#endif + +#if taps == 4 + 0.02763055063889, + 0.06628224528636, + 0.12383153680577, + 0.18017382291138, + 0.20416368871516, + 0.18017382291138, + 0.12383153680577, + 0.06628224528636, + 0.02763055063889, +#endif + +#if taps == 5 + 0.02219054849244, + 0.04558899978527, + 0.07981140824009, + 0.11906462996609, + 0.15136080967773, + 0.16396720767670, + 0.15136080967773, + 0.11906462996609, + 0.07981140824009, + 0.04558899978527, + 0.02219054849244, +#endif +}; + +const uvec2 isize = uvec2(gl_WorkGroupSize) + uvec2(2 * taps); +shared float grain[isize.y][isize.x]; + +// PRNG +float permute(float x) +{ + x = (34.0 * x + 1.0) * x; + return fract(x * 1.0/289.0) * 289.0; +} + +float seed(uvec2 pos) +{ + const float phi = 1.61803398874989; + vec3 m = vec3(fract(phi * vec2(pos)), random) + vec3(1.0); + return permute(permute(m.x) + m.y) + m.z; +} + +float rand(inout float state) +{ + state = permute(state); + return fract(state * 1.0/41.0); +} + +// Turns uniform white noise into gaussian white noise by passing it +// through an approximation of the gaussian quantile function +float rand_gaussian(inout float state) { + const float a0 = 0.151015505647689; + const float a1 = -0.5303572634357367; + const float a2 = 1.365020122861334; + const float b0 = 0.132089632343748; + const float b1 = -0.7607324991323768; + + float p = 0.95 * rand(state) + 0.025; + float q = p - 0.5; + float r = q * q; + + float g = q * (a2 + (a1 * r + a0) / (r*r + b1*r + b0)); + g *= 0.255121822830526; // normalize to [-1,1) + return g; +} + +void hook() +{ + // generate grain in `grain` + uint num_threads = gl_WorkGroupSize.x * gl_WorkGroupSize.y; + for (uint i = gl_LocalInvocationIndex; i < isize.y * isize.x; i += num_threads) { + uvec2 pos = uvec2(i % isize.y, i / isize.y); + float state = seed(gl_WorkGroupID.xy * gl_WorkGroupSize.xy + pos); + grain[pos.y][pos.x] = rand_gaussian(state); + } + + // make writes visible + barrier(); + + // convolve horizontally + for (uint y = gl_LocalInvocationID.y; y < isize.y; y += gl_WorkGroupSize.y) { + float hsum = 0; + for (uint x = 0; x < row_size; x++) { + float g = grain[y][gl_LocalInvocationID.x + x]; + hsum += weights[x] * g; + } + + // update grain LUT + grain[y][gl_LocalInvocationID.x + taps] = hsum; + } + + barrier(); + + // convolve vertically + float vsum = 0.0; + for (uint y = 0; y < row_size; y++) { + float g = grain[gl_LocalInvocationID.y + y][gl_LocalInvocationID.x + taps]; + vsum += weights[y] * g; + } + + vec4 color = HOOKED_tex(HOOKED_pos); 
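    // add the final smoothed grain, scaled by the 'intensity' parameter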
+ color.rgb += vec3(intensity * vsum); + imageStore(out_image, ivec2(gl_GlobalInvocationID), color); +} +``` diff --git a/docs/glsl.md b/docs/glsl.md new file mode 100644 index 0000000..543e3a4 --- /dev/null +++ b/docs/glsl.md @@ -0,0 +1,501 @@ +# GLSL shader system + +## Overall design + +Shaders in libplacebo are all written in GLSL, and built up incrementally, on +demand. Generally, all shaders for each frame are generated *per frame*. So +functions like `pl_shader_color_map` etc. are run anew for every frame. This +makes the renderer very stateless and allows us to directly embed relevant +constants, uniforms etc. as part of the same code that generates the actual +GLSL shader. + +To avoid this from becoming wasteful, libplacebo uses an internal string +building abstraction +([`pl_str_builder`](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/pl_string.h#L263)). +Rather than building up a string directly, a `pl_str_builder` is like a list of +string building functions/callbacks to execute in order to generate the actual +shader. Combined with an efficient `pl_str_builder_hash`, this allows us to +avoid the bulk of the string templating work for already-cached shaders. + +## Legacy API + +For the vast majority of libplacebo's history, the main entry-point into the +shader building mechanism was the `GLSL()` macro ([and +variants](#shader-sections-glsl-glslh-glslf)), which works like a +`printf`-append: + +```c linenums="1" +void pl_shader_extract_features(pl_shader sh, struct pl_color_space csp) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + sh_describe(sh, "feature extraction"); + pl_shader_linearize(sh, &csp); + GLSL("// pl_shader_extract_features \n" + "{ \n" + "vec3 lms = %f * "$" * color.rgb; \n" + "lms = pow(max(lms, 0.0), vec3(%f)); \n" + "lms = (vec3(%f) + %f * lms) \n" + " / (vec3(1.0) + %f * lms); \n" + "lms = pow(lms, vec3(%f)); \n" + "float I = dot(vec3(%f, %f, %f), lms); \n" + "color = vec4(I, 0.0, 0.0, 1.0); \n" + "} \n", + PL_COLOR_SDR_WHITE / 10000, + SH_MAT3(pl_ipt_rgb2lms(pl_raw_primaries_get(csp.primaries))), + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2, + pl_ipt_lms2ipt.m[0][0], pl_ipt_lms2ipt.m[0][1], pl_ipt_lms2ipt.m[0][2]); +} +``` + +The special macro `$` is a stand-in for an *identifier* (`ident_t`), which is +the internal type used to pass references to loaded uniforms, descriptors and +so on: + +```c +typedef unsigned short ident_t; +#define $ "_%hx" +#define NULL_IDENT 0u + +// ... + +ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val); +#define SH_MAT3(val) sh_var_mat3(sh, "mat", val) +``` + +In general, constants in libplacebo are divided into three categories: + +### Literal shader constants + +These are values that are expected to change very infrequently (or never), or +for which we want to generate a different shader variant per value. Such values +should be directly formatted as numbers into the shader text: `%d`, `%f` and so +on. This is commonly used for array sizes, constants that depend only on +hardware limits, constants that never change (but which have a friendly name, +like `PQ_C2` above), and so on. + +As an example, the debanding iterations weights are hard-coded like this, +because the debanding shader is expected to change as a result of a different +number of iterations anyway: + +```c linenums="1" +// For each iteration, compute the average at a given distance and +// pick it instead of the color if the difference is below the threshold. 
+for (int i = 1; i <= params->iterations; i++) { + GLSL(// Compute a random angle and distance + "d = "$".xy * vec2(%d.0 * "$", %f); \n" // (1) + "d = d.x * vec2(cos(d.y), sin(d.y)); \n" + // Sample at quarter-turn intervals around the source pixel + "avg = T(0.0); \n" + "avg += GET(+d.x, +d.y); \n" + "avg += GET(-d.x, +d.y); \n" + "avg += GET(-d.x, -d.y); \n" + "avg += GET(+d.x, -d.y); \n" + "avg *= 0.25; \n" + // Compare the (normalized) average against the pixel + "diff = abs(res - avg); \n" + "bound = T("$" / %d.0); \n", + prng, i, radius, M_PI * 2, + threshold, i); + + if (num_comps > 1) { + GLSL("res = mix(avg, res, greaterThan(diff, bound)); \n"); + } else { + GLSL("res = mix(avg, res, diff > bound); \n"); + } +} +``` + +1. The `%d.0` here corresponds to the iteration index `i`, while the `%f` + corresponds to the fixed constant `M_PI * 2`. + +### Specializable shader constants + +These are used for tunable parameters that are expected to change infrequently +during normal playback. These constitute by far the biggest category, and most +parameters coming from the various `_params` structs should be loaded like +this. + +They are loaded using the `sh_const_*()` functions, which generate a +specialization constant on supported platforms, falling back to a literal +shader `#define` otherwise. For anoymous parameters, you can use the +short-hands `SH_FLOAT`, `SH_INT` etc.: + +```c +ident_t sh_const_int(pl_shader sh, const char *name, int val); +ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val); +ident_t sh_const_float(pl_shader sh, const char *name, float val); +#define SH_INT(val) sh_const_int(sh, "const", val) +#define SH_UINT(val) sh_const_uint(sh, "const", val) +#define SH_FLOAT(val) sh_const_float(sh, "const", val) +``` + +Here is an example of them in action: + +```c linenums="1" +void pl_shader_sigmoidize(pl_shader sh, const struct pl_sigmoid_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + params = PL_DEF(params, &pl_sigmoid_default_params); + float center = PL_DEF(params->center, 0.75); + float slope = PL_DEF(params->slope, 6.5); + + // This function needs to go through (0,0) and (1,1), so we compute the + // values at 1 and 0, and then scale/shift them, respectively. + float offset = 1.0 / (1 + expf(slope * center)); + float scale = 1.0 / (1 + expf(slope * (center - 1))) - offset; + + GLSL("// pl_shader_sigmoidize \n" + "color = clamp(color, 0.0, 1.0); \n" + "color = vec4("$") - vec4("$") * \n" + " log(vec4(1.0) / (color * vec4("$") + vec4("$")) \n" + " - vec4(1.0)); \n", + SH_FLOAT(center), SH_FLOAT(1.0 / slope), + SH_FLOAT(scale), SH_FLOAT(offset)); +} +``` + +The advantage of this type of shader constant is that they will be +transparently replaced by dynamic uniforms whenever +`pl_render_params.dynamic_constants` is true, which allows the renderer to +respond more instantly to changes in the parameters (e.g. as a result of a user +dragging a slider around). During "normal" playback, they will then be +"promoted" to actual shader constants to prevent them from taking up registers. + +### Dynamic variables + +For anything else, e.g. 
variables which are expected to change very frequently, +you can use the generic `sh_var()` mechanism, which sends constants either as +elements of a uniform buffer, or directly as push constants: + +```c +ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic); +ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic); +ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic); +#define SH_INT_DYN(val) sh_var_int(sh, "const", val, true) +#define SH_UINT_DYN(val) sh_var_uint(sh, "const", val, true) +#define SH_FLOAT_DYN(val) sh_var_float(sh, "const", val, true) +``` + +These are used primarily when a variable is expected to change very frequently, +e.g. as a result of randomness, or for constants which depend on dynamically +computed, source-dependent variables (e.g. input frame characteristics): + +```c linenums="1" +if (params->show_clipping) { + const float eps = 1e-6f; + GLSL("bool clip_hi, clip_lo; \n" + "clip_hi = any(greaterThan(color.rgb, vec3("$"))); \n" + "clip_lo = any(lessThan(color.rgb, vec3("$"))); \n" + "clip_hi = clip_hi || ipt.x > "$"; \n" + "clip_lo = clip_lo || ipt.x < "$"; \n", + SH_FLOAT_DYN(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_max) + eps), + SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_min) - eps), + SH_FLOAT_DYN(tone.input_max + eps), + SH_FLOAT(tone.input_min - eps)); +} +``` + +### Shader sections (GLSL, GLSLH, GLSLF) + +Shader macros come in three main flavors, depending on where the resulting text +should be formatted: + +- `GLSL`: Expanded in the scope of the current `main` function, + and is related to code directly processing the current pixel value. +- `GLSLH`: Printed to the 'header', before the first function, but after + variables, uniforms etc. This is used for global definitions, helper + functions, shared memory variables, and so on. +- `GLSLF`: Printed to the `footer`, which is always at the end of the current + `main` function, but before returning to the caller / writing to the + framebuffer. Used to e.g. update SSBO state in preparation for the next + frame. + +Finally, there is a fourth category `GLSLP` (prelude), which is currently only +used internally to generate preambles during e.g. compute shader translation. + +## New #pragma GLSL macro + +Starting with libplacebo v6, the internal shader system has been augmented by a +custom macro preprocessor, which is designed to ease the boilerplate of writing +shaders (and also strip redundant whitespace from generated shaders). The code +for this is found in the +[tools/glsl_preproc](https://code.videolan.org/videolan/libplacebo/-/tree/master/tools/glsl_preproc) +directory. + +In a nutshell, this allows us to embed GLSL snippets directly as `#pragma GLSL` +macros (resp. `#pragma GLSLH`, `#pragma GLSLF`): + +```c linenums="1" +bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + if (rx < 1 || ry < 1) { + PL_TRACE(sh, "Using fast bicubic sampling when downscaling. 
This " + "will most likely result in nasty aliasing!"); + } + + // Explanation of how bicubic scaling with only 4 texel fetches is done: + // http://www.mate.tue.nl/mate/pdfs/10318.pdf + // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines' + + sh_describe(sh, "bicubic"); +#pragma GLSL /* pl_shader_sample_bicubic */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 frac = fract(pos * size + vec2(0.5)); \ + vec2 frac2 = frac * frac; \ + vec2 inv = vec2(1.0) - frac; \ + vec2 inv2 = inv * inv; \ + /* compute basis spline */ \ + vec2 w0 = 1.0/6.0 * inv2 * inv; \ + vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \ + vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \ + vec2 w3 = 1.0/6.0 * frac2 * frac; \ + vec4 g = vec4(w0 + w1, w2 + w3); \ + vec4 h = vec4(w1, w3) / g + inv.xyxy; \ + h.xy -= vec2(2.0); \ + /* sample four corners, then interpolate */ \ + vec4 p = pos.xyxy + $pt.xyxy * h; \ + vec4 c00 = textureLod($tex, p.xy, 0.0); \ + vec4 c01 = textureLod($tex, p.xw, 0.0); \ + vec4 c0 = mix(c01, c00, g.y); \ + vec4 c10 = textureLod($tex, p.zy, 0.0); \ + vec4 c11 = textureLod($tex, p.zw, 0.0); \ + vec4 c1 = mix(c11, c10, g.y); \ + color = ${float:scale} * mix(c1, c0, g.x); \ + } + + return true; +} +``` + +This gets transformed, by the GLSL macro preprocessor, into an optimized shader +template invocation like the following: + +```c linenums="1" +{ + // ... + sh_describe(sh, "bicubic"); + const struct __attribute__((__packed__)) { + ident_t pos; + ident_t tex; + ident_t pt; + ident_t scale; + } _glsl_330_args = { + .pos = pos, + .tex = tex, + .pt = pt, + .scale = sh_const_float(sh, "scale", scale), + }; + size_t _glsl_330_fn(void *, pl_str *, const uint8_t *); + pl_str_builder_append(sh->buffers[SH_BUF_BODY], _glsl_330_fn, + &_glsl_330_args, sizeof(_glsl_330_args)); + // ... +} + +size_t _glsl_330_fn(void *alloc, pl_str *buf, const uint8_t *ptr) +{ + struct __attribute__((__packed__)) { + ident_t pos; + ident_t tex; + ident_t pt; + ident_t scale; + } vars; + memcpy(&vars, ptr, sizeof(vars)); + + pl_str_append_asprintf_c(alloc, buf, + "/* pl_shader_sample_bicubic */\n" + " vec4 color;\n" + " {\n" + " vec2 pos = /*pos*/_%hx;\n" + " vec2 size = vec2(textureSize(/*tex*/_%hx, 0));\n" + " vec2 frac = fract(pos * size + vec2(0.5));\n" + " vec2 frac2 = frac * frac;\n" + " vec2 inv = vec2(1.0) - frac;\n" + " vec2 inv2 = inv * inv;\n" + " /* compute basis spline */\n" + " vec2 w0 = 1.0/6.0 * inv2 * inv;\n" + " vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac);\n" + " vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv);\n" + " vec2 w3 = 1.0/6.0 * frac2 * frac;\n" + " vec4 g = vec4(w0 + w1, w2 + w3);\n" + " vec4 h = vec4(w1, w3) / g + inv.xyxy;\n" + " h.xy -= vec2(2.0);\n" + " /* sample four corners, then interpolate */\n" + " vec4 p = pos.xyxy + /*pt*/_%hx.xyxy * h;\n" + " vec4 c00 = textureLod(/*tex*/_%hx, p.xy, 0.0);\n" + " vec4 c01 = textureLod(/*tex*/_%hx, p.xw, 0.0);\n" + " vec4 c0 = mix(c01, c00, g.y);\n" + " vec4 c10 = textureLod(/*tex*/_%hx, p.zy, 0.0);\n" + " vec4 c11 = textureLod(/*tex*/_%hx, p.zw, 0.0);\n" + " vec4 c1 = mix(c11, c10, g.y);\n" + " color = /*scale*/_%hx * mix(c1, c0, g.x);\n" + " }\n", + vars.pos, + vars.tex, + vars.pt, + vars.tex, + vars.tex, + vars.tex, + vars.tex, + vars.scale + ); + + return sizeof(vars); +} +``` + +To support this style of shader programming, special syntax was invented: + +### Shader variables + +Instead of being formatted with `"$"`, `%f` etc. 
and supplied in a big list,
printf style, GLSL macros may directly embed shader variables:

```c
ident_t pos, tex = sh_bind(sh, texture, ..., &pos, ...);
#pragma GLSL vec4 color = texture($tex, $pos);
```

The simplest possible shader variable is just `$name`, which corresponds to
any variable of type `ident_t`. More complicated expressions are also possible:

```glsl
#define RAND3 ${sh_prng(sh, false, NULL)}
color.rgb += ${float:params->noise} * RAND3;
```

In the expression `${float:params->noise}`, the `float:` prefix here transforms
the shader variable into the equivalent of `SH_FLOAT()` in the legacy API,
that is, a generic float (specialization) constant. Other possible types are:

```glsl
TYPE i = ${ident: sh_desc(...)};
float f = ${float: M_PI};
int i = ${int: params->width};
uint u = ${uint: sizeof(ssbo)};
```

In addition to a type specifier, the optional qualifiers `dynamic` and `const`
will modify the variable, turning it into (respectively) a dynamically loaded
uniform (`SH_FLOAT_DYN` etc.), or a hard-coded shader literal (`%d`, `%f`
etc.):

```glsl
const float base = ${const float: M_LOG10E};
int seed = ${dynamic int: rand()};
```

For sampling from component masks, the special types `swizzle` and
`(u|i)vecType` can be used to generate the appropriate texture swizzle and
corresponding vector type:

```glsl
${vecType: comp_mask} tmp = color.${swizzle: comp_mask};
```

### Macro directives

Lines beginning with `@` are not included in the GLSL as-is, but instead parsed
as macro directives, to control the code flow inside the macro expansion:

#### @if / @else

Standard-purpose conditional. Example:

```glsl
float alpha = ...;
@if (repr.alpha == PL_ALPHA_INDEPENDENT)
    color.a *= alpha;
@else
    color.rgba *= alpha;
```

The condition is evaluated outside the macro (in the enclosing scope) and
the resulting boolean variable is directly passed to the template.

An `@if` block can also enclose multiple lines:

```glsl
@if (threshold > 0) {
    float thresh = ${float:threshold};
    coeff = mix(coeff, vec2(0.0), lessThan(coeff, vec2(thresh)));
    coeff = mix(coeff, vec2(1.0), greaterThan(coeff, vec2(1.0 - thresh)));
@}
```

#### @for

This can be used to generate (unrolled) loops:

```glsl
int offset = ${const int: params->kernel_width / 2};
float sum = 0.0;
@for (x < params->kernel_width)
    sum += textureLodOffset($luma, $pos, 0.0, int(@x - offset)).r;
```

This introduces a local variable, `@x`, which expands to an integer containing
the current loop index. Loop indices always start at 0. Valid terminating
conditions include `<` and `<=`, and the loop stop condition is also evaluated
as an integer. (A sketch of the resulting expansion is shown at the end of this
subsection.)

Alternatively, this can be used to iterate over a bitmask (as commonly used for
e.g. components in a color mask):

```glsl
float weight = /* ... */;
vec4 color = textureLod($tex, $pos, 0.0);
@for (c : params->component_mask)
    sum[@c] += weight * color[@c];
```

Finally, to combine loops with conditionals, the special syntax `@if @(cond)`
may be used to evaluate expressions inside the template loop:

```glsl
@for (i < 10) {
    float weight = /* ... */;
    @if @(i < 5)
        weight = -weight;
    sum += weight * texture(...);
@}
```

In this case, the `@if` conditional may only reference local (loop) variables.
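To make the expansion concrete: assuming (hypothetically) that `params->kernel_width` evaluates to 3, and ignoring the identifier mangling applied to `$luma` and `$pos`, the kernel-width loop shown earlier would unroll to roughly the following GLSL:

```glsl
float sum = 0.0;
sum += textureLodOffset(luma, pos, 0.0, int(0 - offset)).r;
sum += textureLodOffset(luma, pos, 0.0, int(1 - offset)).r;
sum += textureLodOffset(luma, pos, 0.0, int(2 - offset)).r;
```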
+ +#### @switch / @case + +This corresponds fairly straightforwardly to a normal switch/case from C: + +```glsl +@switch (color->transfer) { +@case PL_COLOR_TRC_SRGB: + color.rgb = mix(color.rgb * 1.0/12.92, + pow((color.rgb + vec3(0.055)) / 1.055, vec3(2.4)), + lessThan(vec3(0.04045), color.rgb)); + @break; +@case PL_COLOR_TRC_GAMMA18: + color.rgb = pow(color.rgb, vec3(1.8)); + @break; +@case PL_COLOR_TRC_GAMMA20: + color.rgb = pow(color.rgb, vec3(2.0)); + @break; +@case PL_COLOR_TRC_GAMMA22: + color.rgb = pow(color.rgb, vec3(2.2)); + @break; +/* ... */ +@} +``` + +The switch body is always evaluated as an `unsigned int`. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..9122afe --- /dev/null +++ b/docs/index.md @@ -0,0 +1,36 @@ +# Introduction + +## Overview + +This document will serve as an introduction to and usage example for the +[libplacebo](https://code.videolan.org/videolan/libplacebo) API. This is not +intended as a full API reference, for that you should see the repository of +[header +files](https://code.videolan.org/videolan/libplacebo/-/tree/master/src/include/libplacebo), +which are written to be (hopefully) understandable as-is. + +libplacebo exposes large parts of its internal abstractions publicly. This +guide will take the general approach of starting as high level as possible and +diving into the details in later chapters. + +A full listing of currently available APIs and their corresponding header +files can be seen +[here](https://code.videolan.org/videolan/libplacebo#api-overview). + +## Getting Started + +To get started using libplacebo, you need to install it (and its development +headers) somehow onto your system. On most distributions, this should be as +simple as installing the corresponding `libplacebo-devel` package, or the +appropriate variants. + +You can see a fill list of libplacebo packages and their names [on +repology](https://repology.org/project/libplacebo/versions). + +!!! note "API versions" + + This document is targeting the "v4 API" overhaul, and as such, examples + provided will generally fail to compile on libplacebo versions below v4.x. + +Alternatively, you can install it from the source code. For that, see the +build instructions [located here](https://code.videolan.org/videolan/libplacebo#installing). diff --git a/docs/options.md b/docs/options.md new file mode 100644 index 0000000..decba48 --- /dev/null +++ b/docs/options.md @@ -0,0 +1,978 @@ +# Options + +The following provides an overview of all options available via the built-in +`pl_options` system. + +## Global preset + +### `preset=<default|fast|high_quality>` + +Override all options from all sections by the values from the given +preset. The following presets are available: + +- `default`: Default settings, tuned to provide a balance of performance and + quality. Should be fine on almost all systems. +- `fast`: Disable all advanced rendering, equivalent to passing `no` to every + option. Increases performance on very slow / old integrated GPUs. +- `high_quality`: Reset all structs to their `high_quality` presets (where + available), set the upscaler to `ewa_lanczossharp`, and enable `deband=yes`. + Suitable for use on machines with a discrete GPU. + +## Scaling + +### `upscaler=<filter>` + +Sets the filter used for upscaling. Defaults to `lanczos`. Pass `upscaler=help` +to see a full list of filters. 
The most relevant options, roughly ordered from
fastest to slowest:

- `none`: No filter, only use basic GPU texture sampling
- `nearest`: Nearest-neighbour (box) sampling (very fast)
- `bilinear`: Bilinear sampling (very fast)
- `oversample`: Aspect-ratio preserving nearest neighbour sampling (very fast)
- `bicubic`: Bicubic interpolation (fast)
- `gaussian`: Gaussian smoothing (fast)
- `catmull_rom`: Catmull-Rom cubic spline
- `lanczos`: Lanczos reconstruction
- `ewa_lanczos`: EWA Lanczos ("Jinc") reconstruction (slow)
- `ewa_lanczossharp`: Sharpened version of `ewa_lanczos` (slow)
- `ewa_lanczos4sharpest`: Very sharp version of `ewa_lanczos`, with
  anti-ringing (very slow)

### `downscaler=<filter>`

Sets the filter used for downscaling. Defaults to `hermite`. Pass
`downscaler=help` to see a full list of filters. The most relevant options,
roughly ordered from fastest to slowest:

- `none`: Use the same filter as specified for `upscaler`
- `box`: Box averaging (very fast)
- `hermite`: Hermite-weighted averaging (fast)
- `bilinear`: Bilinear (triangle) averaging (fast)
- `bicubic`: Bicubic interpolation (fast)
- `gaussian`: Gaussian smoothing (fast)
- `catmull_rom`: Catmull-Rom cubic spline
- `mitchell`: Mitchell-Netravali cubic spline
- `lanczos`: Lanczos reconstruction

### `plane_upscaler=<filter>`, `plane_downscaler=<filter>`

Override the filter used for upscaling/downscaling planes, e.g. chroma/alpha.
If set to `none`, use the same setting as `upscaler` and `downscaler`,
respectively. Defaults to `none` for both.

### `frame_mixer=<filter>`

Sets the filter used for frame mixing (temporal interpolation). Defaults to
`oversample`. Pass `frame_mixer=help` to see a full list of filters. The most
relevant options, roughly ordered from fastest to slowest:

- `none`: Disable frame mixing, show nearest frame to target PTS
- `oversample`: Oversampling, only mix "edge" frames while preserving FPS
- `hermite`: Hermite-weighted frame mixing
- `linear`: Linear frame mixing
- `cubic`: Cubic B-spline frame mixing

### `antiringing_strength=<0.0..1.0>`

Antiringing strength to use for all filters. A value of `0.0` disables
antiringing, and a value of `1.0` enables full-strength antiringing. Defaults
to `0.0`.

!!! note
    Specific filter presets may override this option.

### Custom scalers

Custom filter kernels can be created by setting the filter to `custom`, in
addition to setting the respective options, replacing `<scaler>` by the
corresponding scaler (`upscaler`, `downscaler`, etc.)

#### `<scaler>_preset=<filter>`

Overrides the value of all options in this section by their default values from
the given filter preset.

#### `<scaler>_kernel=<kernel>`, `<scaler>_window=<kernel>`

Choose the filter kernel and window function, respectively. Pass `help` to
get a full list of filter kernels. Defaults to `none`.

#### `<scaler>_radius=<0.0..16.0>`

Override the filter kernel radius. Has no effect if the filter kernel
is not resizeable. Defaults to `0.0`, meaning "no override".

#### `<scaler>_clamp=<0.0..1.0>`

Represents an extra weighting/clamping coefficient for negative weights. A
value of `0.0` represents no clamping. A value of `1.0` represents full
clamping, i.e. all negative lobes will be removed. Defaults to `0.0`.

#### `<scaler>_blur=<0.0..100.0>`

Additional blur coefficient. This effectively stretches the kernel, without
changing the effective radius of the filter.
Setting this to a value of +`0.0` is equivalent to disabling it. Values significantly below `1.0` may +seriously degrade the visual output, and should be used with care. Defaults to +`0.0`. + +#### `<scaler>_taper=<0.0..1.0>` + +Additional taper coefficient. This essentially flattens the function's center. +The values within `[-taper, taper]` will return `1.0`, with the actual function +being squished into the remainder of `[taper, radius]`. Defaults to `0.0`. + +#### `<scaler>_antiring=<0.0..1.0>` + +Antiringing override for this filter. Defaults to `0.0`, which infers the value +from `antiringing_strength`. + +#### `<scaler>_param1`, `<scaler>_param2` `<scaler>_wparam1`, `<scaler>_wparam2` + +Parameters for the respective filter function. Ignored if not tunable. Defaults +to `0.0`. + +#### `<scaler>_polar=<yes|no>` + +If true, this filter is a polar/2D filter (EWA), instead of a separable/1D +(orthogonal) filter. Defaults to `no`. + +## Debanding + +These options control the optional debanding step. Debanding can be used to +reduce the prevalence of quantization artefacts in low quality sources, but +can be heavy to compute on weaker devices. + +!!! note + This can also be used as a pure grain generator, by setting + `deband_iterations=0`. + +### `deband=<yes|no>` + +Enables debanding. Defaults to `no`. + +### `deband_preset=<default>` + +Overrides the value of all options in this section by their default values from +the given preset. + +### `deband_iterations=<0..16>` + +The number of debanding steps to perform per sample. Each +step reduces a bit more banding, but takes time to compute. +Note that the strength of each step falls off very quickly, +so high numbers (>4) are practically useless. Defaults to `1`. + +### `deband_threshold=<0.0..1000.0>` + +The debanding filter's cut-off threshold. Higher numbers +increase the debanding strength dramatically, but +progressively diminish image details. Defaults to `3.0`. + +### `deband_radius=<0.0..1000.0>` + +The debanding filter's initial radius. The radius increases +linearly for each iteration. A higher radius will find more +gradients, but a lower radius will smooth more aggressively. +Defaults to `16.0`. + +### `deband_grain=<0.0..1000.0>` + +Add some extra noise to the image. This significantly helps +cover up remaining quantization artifacts. Higher numbers add +more noise. Defaults to `4.0`, which is very mild. + +### `deband_grain_neutral_r, deband_grain_neutral_g, deband_grain_neutral_b` + +'Neutral' grain value for each channel being debanded. Grain +application will be modulated to avoid disturbing colors +close to this value. Set this to a value corresponding to +black in the relevant colorspace. + +!!! note + This is done automatically by `pl_renderer` and should not need to be + touched by the user. This is purely a debug option. + +## Sigmoidization + +These options control the sigmoidization parameters. Sigmoidization is an +optional step during upscaling which reduces the prominence of ringing +artifacts. + +### `sigmoid=<yes|no>` + +Enables sigmoidization. Defaults to `yes`. + +### `sigmoid_preset=<default>` + +Overrides the value of all options in this section by their default values from +the given preset. + +### `sigmoid_center=<0.0..1.0>` + +The center (bias) of the sigmoid curve. Defaults to `0.75`. + +### `sigmoid_slope=<1.0..20.0>` + +The slope (steepness) of the sigmoid curve. Defaults to `6.5`. 
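For illustration of the option syntax only (this is not one of the built-in presets, and the values are arbitrary picks within the documented ranges), a slightly stronger debanding setup together with explicit sigmoidization defaults could be expressed as:

```
deband=yes
deband_iterations=2
deband_threshold=4.0
deband_grain=6.0
sigmoid=yes
sigmoid_center=0.75
sigmoid_slope=6.5
```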
## Color adjustment

These options affect the decoding of the source color values, and can be used
to subjectively alter the appearance of the video.

### `color_adjustment=<yes|no>`

Enables color adjustment. Defaults to `yes`.

### `color_adjustment_preset=<neutral>`

Overrides the value of all options in this section by their default values from
the given preset.

### `brightness=<-1.0..1.0>`

Brightness boost. Adds a constant bias onto the source
luminance signal. `0.0` = neutral, `1.0` = solid white,
`-1.0` = solid black. Defaults to `0.0`.

### `contrast=<0.0..100.0>`

Contrast gain. Multiplies the source luminance signal by a
constant factor. `1.0` = neutral, `0.0` = solid black.
Defaults to `1.0`.

### `saturation=<0.0..100.0>`

Saturation gain. Multiplies the source chromaticity signal by
a constant factor. `1.0` = neutral, `0.0` = grayscale.
Defaults to `1.0`.

### `hue=<angle>`

Hue shift. Corresponds to a rotation of the UV subvector
around the neutral axis. Specified in radians. Defaults to
`0.0` (neutral).

### `gamma=<0.0..100.0>`

Gamma lift. Subjectively brightens or darkens the scene while
preserving overall contrast. `1.0` = neutral, `0.0` = solid
black. Defaults to `1.0`.

### `temperature=<-1.143..5.286>`

Color temperature shift. Relative to 6500 K, a value of `0.0` gives you 6500 K
(no change), a value of `-1.0` gives you 3000 K, and a value of `1.0` gives you
10000 K. Defaults to `0.0`.

## HDR peak detection

These options affect the HDR peak detection step. This can be used to greatly
improve the HDR tone-mapping process in the absence of dynamic video metadata,
but may be prohibitively slow on some devices (e.g. weaker integrated GPUs).

### `peak_detect=<yes|no>`

Enables HDR peak detection. Defaults to `yes`.

### `peak_detection_preset=<default|high_quality>`

Overrides the value of all options in this section by their default values from
the given preset. `high_quality` also enables frame histogram measurement.

### `peak_smoothing_period=<0.0..1000.0>`

Smoothing coefficient for the detected values. This controls the time parameter
(tau) of an IIR low pass filter. In other words, it represents the cutoff
period (= 1 / cutoff frequency) in frames. Variations on time scales shorter
than this period (i.e. frequencies above the cutoff) will be suppressed. This
helps block out annoying "sparkling" or "flickering" due to small variations in
frame-to-frame brightness. If left as `0.0`, this smoothing is completely
disabled. Defaults to `20.0`.

### `scene_threshold_low=<0.0..100.0>`, `scene_threshold_high=<0.0..100.0>`

In order to avoid reacting sluggishly to scene changes as a result of the
low-pass filter, we disable it when the difference between the current frame
brightness and the average frame brightness exceeds a given threshold
difference. But rather than a single hard cutoff, which would lead to weird
discontinuities on fades, we gradually disable it over a small window of
brightness ranges. These parameters control the lower and upper bounds of this
window, in units of 1% PQ.

Setting either one of these to 0.0 disables this logic. Defaults to `1.0` and
`3.0`, respectively.

### `peak_percentile=<0.0..100.0>`

Which percentile of the input image brightness histogram to consider as the
true peak of the scene. If this is set to `100` (or `0`), the brightest pixel
is measured. Otherwise, the top of the frequency distribution is progressively
cut off.
Setting this too low will cause clipping of very bright details, but
can improve the dynamic brightness range of scenes with very bright isolated
highlights.

Defaults to `100.0`. The `high_quality` preset instead sets this to `99.995`,
which is very conservative and should cause no major issues in typical content.

### `allow_delayed_peak=<yes|no>`

Allows the peak detection result to be delayed by up to a single frame, which
can sometimes improve throughput, at the cost of introducing the possibility of
1-frame flickers on transitions. Defaults to `no`.

## Color mapping

These options affect the way colors are transformed between color spaces,
including tone- and gamut-mapping where needed.

### `color_map=<yes|no>`

Enables the use of these color mapping settings. Defaults to `yes`.

!!! note
    Disabling this option does *not* disable color mapping; it just means "use
    the default options for everything".

### `color_map_preset=<default|high_quality>`

Overrides the value of all options in this section by their default values from
the given preset. `high_quality` also enables HDR contrast recovery.

### `gamut_mapping=<function>`

Gamut mapping function to use to handle out-of-gamut colors, including colors
which are out-of-gamut as a consequence of tone mapping. Defaults to
`perceptual`. The following options are available:

- `clip`: Performs no gamut-mapping, just hard clips out-of-range colors
  per-channel.
- `perceptual`: Performs a perceptually balanced (saturation) gamut mapping,
  using a soft knee function to preserve in-gamut colors, followed by a final
  softclip operation. This works bidirectionally, meaning it can both compress
  and expand the gamut. Behaves similarly to a blend of `saturation` and
  `softclip`.
- `softclip`: Performs a perceptually balanced gamut mapping using a soft knee
  function to roll off clipped regions, and a hue shifting function to preserve
  saturation.
- `relative`: Performs relative colorimetric clipping, while maintaining an
  exponential relationship between brightness and chromaticity.
- `saturation`: Performs simple RGB->RGB saturation mapping. The input R/G/B
  channels are mapped directly onto the output R/G/B channels. Will never clip,
  but will distort all hues and/or result in a faded look.
- `absolute`: Performs absolute colorimetric clipping. Like `relative`, but
  does not adapt the white point.
- `desaturate`: Performs constant-luminance colorimetric clipping, desaturating
  colors towards white until they're in-range.
- `darken`: Uniformly darkens the input slightly to prevent clipping on
  blown-out highlights, then clamps colorimetrically to the input gamut
  boundary, biased slightly to preserve chromaticity over luminance.
- `highlight`: Performs no gamut mapping, but simply highlights out-of-gamut
  pixels.
- `linear`: Linearly/uniformly desaturates the image in order to bring the
  entire image into the target gamut.

### Gamut mapping constants

These settings can be used to fine-tune the constants used for the various
gamut mapping algorithms.

#### `perceptual_deadzone=<0.0..1.0>`

(Relative) chromaticity protection zone for `perceptual` mapping. Defaults to
`0.30`.

#### `perceptual_strength=<0.0..1.0>`

Strength of the `perceptual` saturation mapping component. Defaults to `0.80`.

#### `colorimetric_gamma=<0.0..10.0>`

I vs C curve gamma to use for colorimetric clipping (`relative`, `absolute`
and `darken`). Defaults to `1.80`.
#### `softclip_knee=<0.0..1.0>`

Knee point to use for soft-clipping methods (`perceptual`, `softclip`).
Defaults to `0.70`.

#### `softclip_desat=<0.0..1.0>`

Desaturation strength for `softclip`. Defaults to `0.35`.

### `lut3d_size_I=<0..1024>`, `lut3d_size_C=<0..1024>`, `lut3d_size_h=<0..1024>`

Gamut mapping 3DLUT size. Setting a dimension to `0` picks the default value.
Defaults to `48`, `32` and `256`, respectively, for channels `I`, `C` and `h`.

### `lut3d_tricubic=<yes|no>`

Use higher quality, but slower, tricubic interpolation for gamut mapping
3DLUTs. May substantially improve the 3DLUT gamut mapping accuracy, in
particular at smaller 3DLUT sizes. Shouldn't have much effect at the default
size. Defaults to `no`.

### `gamut_expansion=<yes|no>`

If enabled, allows the gamut mapping function to expand the gamut, in cases
where the target gamut exceeds that of the source. If disabled, the source
gamut will never be enlarged, even when using a gamut mapping function capable
of bidirectional mapping. Defaults to `no`.

### `tone_mapping=<function>`

Tone mapping function to use for adapting between different luminance ranges,
including black point adaptation. Defaults to `spline`. The following functions
are available:

- `clip`: Performs no tone-mapping, just clips out-of-range colors. Retains
  perfect color accuracy for in-range colors but completely destroys
  out-of-range information. Does not perform any black point adaptation.
- `spline`: Simple spline consisting of two polynomials, joined by a single
  pivot point, which is tuned based on the source scene average brightness
  (taking into account dynamic metadata if available). This function can be
  used for both forward and inverse tone mapping.
- `st2094-40`: EETF from SMPTE ST 2094-40 Annex B, which uses the provided OOTF
  based on Bezier curves to perform tone-mapping. The OOTF used is adjusted
  based on the ratio between the targeted and actual display peak luminances.
  In the absence of HDR10+ metadata, falls back to a simple constant bezier
  curve.
- `st2094-10`: EETF from SMPTE ST 2094-10 Annex B.2, which takes into account
  the input signal average luminance in addition to the maximum/minimum.

    !!! warning
        This does *not* currently include the subjective gain/offset/gamma
        controls defined in Annex B.3. (Open an issue with a valid sample file
        if you want such parameters to be respected.)

- `bt2390`: EETF from the ITU-R Report BT.2390, a hermite spline roll-off with
  a linear segment.
- `bt2446a`: EETF from ITU-R Report BT.2446, method A. Can be used for both
  forward and inverse tone mapping.
- `reinhard`: Very simple non-linear curve. Named after Erik Reinhard.
- `mobius`: Generalization of the `reinhard` tone mapping algorithm to support
  an additional linear slope near black. The name is derived from its function
  shape `(ax+b)/(cx+d)`, which is known as a Möbius transformation. This
  function is considered legacy/low-quality, and should not be used.
- `hable`: Piece-wise, filmic tone-mapping algorithm developed by John Hable
  for use in Uncharted 2, inspired by a similar tone-mapping algorithm used by
  Kodak. Popularized by its use in video games with HDR rendering. Preserves
  both dark and bright details very well, but comes with the drawback of
  changing the average brightness quite significantly. This is sort of similar
  to `reinhard` with `reinhard_contrast=0.24`. This function is considered
  legacy/low-quality, and should not be used.
+- `gamma`: Fits a gamma (power) function to transfer between the source and
+  target color spaces, effectively resulting in a perceptual hard-knee joining
+  two roughly linear sections. This preserves details at all scales, but can
+  result in an image with a muted or dull appearance. This function
+  is considered legacy/low-quality and should not be used.
+- `linear`: Linearly stretches the input range to the output range, in PQ
+  space. This will preserve all details accurately, but results in a
+  significantly different average brightness. Can be used for inverse
+  tone-mapping in addition to regular tone-mapping.
+- `linearlight`: Like `linear`, but in linear light (instead of PQ). Works well
+  for small range adjustments but may cause severe darkening when
+  downconverting from e.g. 10k nits to SDR.
+
+### Tone-mapping constants
+
+These settings can be used to fine-tune the constants used for the various
+tone mapping algorithms.
+
+#### `knee_adaptation=<0.0..1.0>`
+
+Configures the knee point, as a ratio between the source average and target
+average (in PQ space). An adaptation of `1.0` always adapts the source scene
+average brightness to the (scaled) target average, while a value of `0.0` never
+modifies scene brightness.
+
+Affects all methods that use the ST2094 knee point determination (currently
+`spline`, `st2094-40` and `st2094-10`). Defaults to `0.4`.
+
+#### `knee_minimum=<0.0..0.5>`, `knee_maximum=<0.5..1.0>`
+
+Configures the knee point minimum and maximum, respectively, as a percentage of
+the PQ luminance range. Provides a hard limit on the knee point chosen by
+`knee_adaptation`. Defaults to `0.1` and `0.8`, respectively.
+
+#### `knee_default=<0.0..1.0>`
+
+Default knee point to use in the absence of source scene average metadata.
+Normally, this is ignored in favor of picking the knee point as the (relative)
+source scene average brightness level. Defaults to `0.4`.
+
+#### `knee_offset=<0.5..2.0>`
+
+Knee point offset (for `bt2390` only). Note that a value of `0.5` is the
+spec-defined default behavior, which differs from the libplacebo default of
+`1.0`.
+
+#### `slope_tuning=<0.0..10.0>`, `slope_offset=<0.0..1.0>`
+
+For the single-pivot polynomial (spline) function, this controls the
+coefficients used to tune the slope of the curve. This tuning is designed to
+make the slope closer to `1.0` when the difference in peaks is low, and closer
+to linear when the difference between peaks is high. Defaults to `1.5`, with
+offset `0.2`.
+
+#### `spline_contrast=<0.0..1.5>`
+
+Contrast setting for the `spline` function. Higher values make the curve
+steeper (closer to `clip`), preserving midtones at the cost of losing
+shadow/highlight details, while lower values make the curve shallower (closer
+to `linear`), preserving highlights at the cost of losing midtone contrast.
+Values above `1.0` are possible, resulting in an output with more contrast than
+the input. Defaults to `0.5`.
+
+#### `reinhard_contrast=<0.0..1.0>`
+
+For the `reinhard` function, this specifies the local contrast coefficient at
+the display peak. Essentially, a value of `0.5` implies that the reference
+white will be about half as bright as when clipping. Defaults to `0.5`.
+
+#### `linear_knee=<0.0..1.0>`
+
+For legacy functions (`mobius`, `gamma`) which operate on linear light, this
+directly sets the corresponding knee point. Defaults to `0.3`.
+
+#### `exposure=<0.0..10.0>`
+
+For linear methods (`linear`, `linearlight`), this controls the linear
+exposure/gain applied to the image. Defaults to `1.0`.
+
+### `inverse_tone_mapping=<yes|no>`
+
+If enabled, and supported by the given tone mapping function, will perform
+inverse tone mapping to expand the dynamic range of a signal. libplacebo is not
+liable for any HDR-induced eye damage. Defaults to `no`.
+
+### `tone_map_metadata=<any|none|hdr10|hdr10plus|cie_y>`
+
+Data source to use when tone-mapping. Setting this to a specific value allows
+overriding the default metadata preference logic. Defaults to `any`.
+
+### `tone_lut_size=<0..4096>`
+
+Tone mapping LUT size. Setting `0` picks the default size. Defaults to `256`.
+
+### `contrast_recovery=<0.0..2.0>`
+
+HDR contrast recovery strength. If set to a value above `0.0`, the source image
+will be divided into high-frequency and low-frequency components, and a portion
+of the high-frequency image is added back onto the tone-mapped output. May
+cause excessive ringing artifacts for some HDR sources, but can improve the
+subjective sharpness and detail left over in the image after tone-mapping.
+
+Defaults to `0.0`. The `high_quality` preset sets this to `0.3`, which is a
+fairly conservative value and should subtly enhance the image quality without
+creating too many obvious artefacts.
+
+### `contrast_smoothness=<1.0..32.0>`
+
+HDR contrast recovery lowpass kernel size. Increasing or decreasing this will
+affect the visual appearance substantially. Defaults to `3.5`.
+
+### Debug options
+
+Miscellaneous debugging and display options related to tone/gamut mapping.
+
+#### `force_tone_mapping_lut=<yes|no>`
+
+Force the use of a full tone-mapping LUT even for functions that have faster
+pure GLSL replacements (e.g. `clip`, `linear`, `saturation`). This is a debug
+option. Defaults to `no`.
+
+#### `visualize_lut=<yes|no>`
+
+Visualize the color mapping LUTs. Displays a (PQ-PQ) graph of the active
+tone-mapping LUT. The X axis shows PQ input values, the Y axis shows PQ output
+values. The tone-mapping curve is shown in green/yellow. Yellow means the
+brightness has been boosted from the source, while dark blue regions show where
+the brightness has been reduced. The extra colored regions and lines indicate
+various monitor limits, as well as a reference diagonal (neutral tone-mapping)
+and source scene average brightness information (if available). The background
+behind this shows a visualization of the gamut mapping 3DLUT, in IPT space.
+Iso-luminance, iso-chromaticity and iso-hue lines are highlighted (depending on
+the exact value of `visualize_theta`). Defaults to `no`.
+
+#### `visualize_lut_x0`, `visualize_lut_y0`, `visualize_lut_x1`, `visualize_lut_y1`
+
+Controls where to draw the LUT visualization, relative to the rendered video.
+Defaults to `0.0` for `x0`/`y0`, and `1.0` for `x1`/`y1`.
+
+#### `visualize_hue=<angle>`, `visualize_theta=<angle>`
+
+Controls the rotation of the gamut 3DLUT visualization. The `hue` parameter
+rotates the gamut through hue space (around the `I` axis), while the `theta`
+parameter vertically rotates the cross section (around the `C` axis), in
+radians. Defaults to `0.0` for both.
+
+#### `show_clipping=<yes|no>`
+
+Graphically highlight hard-clipped pixels during tone-mapping (i.e. pixels that
+exceed the claimed source luminance range). Defaults to `no`.
+
+## Dithering
+
+These options affect the way colors are dithered before output. Dithering is
+always required to avoid introducing banding artefacts as a result of
+quantization to a lower bit depth output texture.
+
+### `dither=<yes|no>`
+
+Enables dithering. Defaults to `yes`.
+
+### `dither_preset=<default>`
+
+Overrides the value of all options in this section by their default values from
+the given preset.
+
+### `dither_method=<method>`
+
+Chooses the dithering method to use. Defaults to `blue`. The following methods
+are available:
+
+- `blue`: Dither with blue noise. Very high quality, but requires the use of a
+  LUT.
+!!! warning
+    Computing a blue noise texture with a large size can be very slow; however,
+    this only needs to be performed once. Even so, using this with a
+    `dither_lut_size` greater than `6` is generally ill-advised.
+- `ordered_lut`: Dither with an ordered (bayer) dither matrix, using a LUT. Low
+  quality, and since this also uses a LUT, there's generally no advantage to
+  picking this instead of `blue`. It's mainly there for testing.
+- `ordered`: The same as `ordered_lut`, but uses fixed function math instead of
+  a LUT. This is faster, but only supports a fixed dither matrix size of 16x16
+  (equivalent to `dither_lut_size=4`).
+- `white`: Dither with white noise. This does not require a LUT and is fairly
+  cheap to compute. Unlike the other modes it doesn't show any repeating
+  patterns either spatially or temporally, but the downside is that this is
+  visually fairly jarring due to the presence of low frequencies in the noise
+  spectrum.
+
+### `dither_lut_size=<1..8>`
+
+For the dither methods which require the use of a LUT (`blue`, `ordered_lut`),
+this controls the size of the LUT (base 2). Defaults to `6`.
+
+### `dither_temporal=<yes|no>`
+
+Enables temporal dithering. This reduces the persistence of dithering artifacts
+by perturbing the dithering matrix per frame. Defaults to `no`.
+
+!!! warning
+    This can cause nasty aliasing artifacts on some LCD screens.
+
+## Cone distortion
+
+These options can be optionally used to modulate the signal in LMS space, in
+particular, to simulate color blindness.
+
+### `cone=<yes|no>`
+
+Enables cone distortion. Defaults to `no`.
+
+### `cone_preset=<preset>`
+
+Overrides the value of all options in this section by their default values from
+the given preset. The following presets are available:
+
+- `normal`: No distortion (92% of population)
+- `protanomaly`: Red cone deficiency (0.66% of population)
+- `protanopia`: Red cone absence (0.59% of population)
+- `deuteranomaly`: Green cone deficiency (2.7% of population)
+- `deuteranopia`: Green cone absence (0.56% of population)
+- `tritanomaly`: Blue cone deficiency (0.01% of population)
+- `tritanopia`: Blue cone absence (0.016% of population)
+- `monochromacy`: Blue cones only (<0.001% of population)
+- `achromatopsia`: Rods only (<0.0001% of population)
+
+### `cones=<none|l|m|s|lm|ms|ls|lms>`
+
+Choose the set of cones to modulate. Defaults to `none`.
+
+### `cone_strength=<gain>`
+
+Defect/gain coefficient to apply to these cones. `1.0` = unaffected, `0.0` =
+full blindness. Defaults to `1.0`. Values above `1.0` can be used to instead
+boost the signal going to this cone. For example, to partially counteract
+deuteranomaly, you could set `cones=m`, `cone_strength=2.0`.
+
+## Output blending
+
+These options affect the way the image is blended onto the output framebuffer.
+
+### `blend=<yes|no>`
+
+Enables output blending. Defaults to `no`.
+
+### `blend_preset=<alpha_overlay>`
+
+Overrides the value of all options in this section by their default values from
+the given preset. Currently, the only preset is `alpha_overlay`, which
+corresponds to normal alpha blending.
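+
+For reference, "normal alpha blending" here means the standard source-over
+operator. Spelled out against the underlying `pl_blend_params` struct from
+`<libplacebo/gpu.h>`, it would look roughly like the sketch below; treating
+this as the exact definition of the `alpha_overlay` preset is an assumption,
+not a statement of libplacebo internals:
+
+``` c
+#include <libplacebo/gpu.h>
+
+// Sketch: source-over alpha blending, spelled out per component. These map
+// onto the blend_src_rgb/blend_dst_rgb/blend_src_alpha/blend_dst_alpha
+// options documented below.
+static const struct pl_blend_params alpha_overlay_sketch = {
+    .src_rgb   = PL_BLEND_SRC_ALPHA,
+    .dst_rgb   = PL_BLEND_ONE_MINUS_SRC_ALPHA,
+    .src_alpha = PL_BLEND_ONE,
+    .dst_alpha = PL_BLEND_ONE_MINUS_SRC_ALPHA,
+};
+```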
+
+### `blend_src_rgb`, `blend_src_alpha`, `blend_dst_rgb`, `blend_dst_alpha`
+
+Choose the blending mode for each component. Defaults to `zero` for all. The
+following modes are available:
+
+- `zero`: Component will be unused.
+- `one`: Component will be added at full strength.
+- `alpha`: Component will be multiplied by the source alpha value.
+- `one_minus_alpha`: Component will be multiplied by 1 minus the source alpha.
+
+## Deinterlacing
+
+Configures the settings used to deinterlace frames, if required.
+
+!!! note
+    The use of these options requires the caller to pass extra metadata to
+    incoming frames to link them together / mark them as fields.
+
+### `deinterlace=<yes|no>`
+
+Enables deinterlacing. Defaults to `no`.
+
+### `deinterlace_preset=<default>`
+
+Overrides the value of all options in this section by their default values from
+the given preset.
+
+### `deinterlace_algo=<algorithm>`
+
+Chooses the algorithm to use for deinterlacing. Defaults to `yadif`. The
+following algorithms are available:
+
+- `weave`: No-op deinterlacing, just sample the weaved frame untouched.
+- `bob`: Naive bob deinterlacing. Doubles the field lines vertically.
+- `yadif`: "Yet another deinterlacing filter". Deinterlacer with temporal and
+  spatial information. Based on FFmpeg's Yadif filter algorithm, but adapted
+  slightly for the GPU.
+
+### `deinterlace_skip_spatial=<yes|no>`
+
+Skip the spatial interlacing check for `yadif`. Defaults to `no`.
+
+## Distortion
+
+The settings in this section can be used to distort/transform the output image.
+
+### `distort=<yes|no>`
+
+Enables distortion. Defaults to `no`.
+
+### `distort_preset=<default>`
+
+Overrides the value of all options in this section by their default values from
+the given preset.
+
+### `distort_scale_x`, `distort_scale_y`
+
+Scale the image in the X/Y dimension by an arbitrary factor. Corresponds to the
+main diagonal of the transformation matrix. Defaults to `1.0` for both.
+
+### `distort_shear_x`, `distort_shear_y`
+
+Adds the X/Y dimension onto the Y/X dimension (respectively), scaled by an
+arbitrary amount. Corresponds to the anti-diagonal of the 2x2 transformation
+matrix. Defaults to `0.0` for both.
+
+### `distort_offset_x`, `distort_offset_y`
+
+Offsets the X/Y dimensions by an arbitrary offset, relative to the image size.
+Corresponds to the bottom row of a 3x3 affine transformation matrix. Defaults
+to `0.0` for both.
+
+### `distort_unscaled=<yes|no>`
+
+If enabled, the texture is placed inside the center of the canvas without
+scaling. Otherwise, it is effectively stretched to the canvas size. Defaults
+to `no`.
+
+!!! note
+    This option has no effect when using `pl_renderer`.
+
+### `distort_constrain=<yes|no>`
+
+If enabled, the transformation is automatically scaled down and shifted to
+ensure that the resulting image fits inside the output canvas. Defaults to
+`no`.
+
+### `distort_bicubic=<yes|no>`
+
+If enabled, use bicubic interpolation rather than faster bilinear
+interpolation. Higher quality but slower. Defaults to `no`.
+
+### `distort_address_mode=<clamp|repeat|mirror>`
+
+Specifies the texture address mode to use when sampling out of bounds. Defaults
+to `clamp`.
+
+### `distort_alpha_mode=<none|independent|premultiplied>`
+
+If set to something other than `none`, all out-of-bounds accesses will instead
+be treated as transparent, according to the given alpha mode.
+
+## Miscellaneous renderer settings
+
+### `error_diffusion=<kernel>`
+
+Enables error diffusion dithering. Error diffusion is a very slow and memory
+intensive method of dithering without the use of a fixed dither pattern. If
+set, this will be used instead of `dither_method` whenever possible. It's
+highly recommended to use this only for still images, not moving video.
+Defaults to `none`. The following options are available:
+
+- `simple`: Simple error diffusion (fast)
+- `false-fs`: False Floyd-Steinberg kernel (fast)
+- `sierra-lite`: Sierra Lite kernel (slow)
+- `floyd-steinberg`: Floyd-Steinberg kernel (slow)
+- `atkinson`: Atkinson kernel (slow)
+- `jarvis-judice-ninke`: Jarvis, Judice & Ninke kernel (very slow)
+- `stucki`: Stucki kernel (very slow)
+- `burkes`: Burkes kernel (very slow)
+- `sierra-2`: Two-row Sierra (very slow)
+- `sierra-3`: Three-row Sierra (very slow)
+
+### `lut_type=<type>`
+
+Overrides the color mapping LUT type. Defaults to `unknown`. The following
+options are available:
+
+- `unknown`: Unknown LUT type, try and guess from metadata
+- `native`: LUT is applied to raw image contents
+- `normalized`: LUT is applied to normalized (HDR) RGB values
+- `conversion`: LUT fully replaces color conversion step
+
+!!! note
+    There is no way to load LUTs via the options mechanism, so this option only
+    has an effect if the LUT is loaded via external means.
+
+### `background_r=<0.0..1.0>`, `background_g=<0.0..1.0>`, `background_b=<0.0..1.0>`
+
+If the image being rendered does not span the entire size of the target, it
+will be cleared explicitly using this background color (RGB). Defaults to `0.0`
+for all.
+
+### `background_transparency=<0.0..1.0>`
+
+The (inverted) alpha value of the background clear color. Defaults to `0.0`.
+
+### `skip_target_clearing=<yes|no>`
+
+If set, skips clearing the background backbuffer entirely. Defaults to `no`.
+
+!!! note
+    This is automatically skipped if the image to be rendered would completely
+    cover the backbuffer.
+
+### `corner_rounding=<0.0..1.0>`
+
+If set to a value above `0.0`, the output will be rendered with rounded
+corners, as if an alpha transparency mask had been applied. The value indicates
+the relative fraction of the side length to round - a value of `1.0` rounds the
+corners as much as possible. Defaults to `0.0`.
+
+### `blend_against_tiles=<yes|no>`
+
+If true, then transparent images will be made opaque by painting them against a
+checkerboard pattern consisting of alternating colors. Defaults to `no`.
+
+### `tile_color_hi_r`, `tile_color_hi_g`, `tile_color_hi_b`, `tile_color_lo_r`, `tile_color_lo_g`, `tile_color_lo_b`
+
+The colors of the light/dark tiles used for `blend_against_tiles`. Defaults to
+`0.93` for light R/G/B and `0.87` for dark R/G/B, respectively.
+
+### `tile_size=<2..256>`
+
+The size, in output pixels, of the tiles used for `blend_against_tiles`.
+Defaults to `32`.
+
+## Performance / quality trade-offs
+
+These should generally be left off where quality is desired, as they can
+degrade the result quite noticeably; but may be useful for older or slower
+hardware. Note that libplacebo will automatically disable advanced features on
+hardware where they are unsupported, regardless of these settings. So only
+enable them if you need a performance bump.
+
+### `skip_anti_aliasing=<yes|no>`
+
+Disables anti-aliasing on downscaling. This will result in moiré artifacts and
+nasty, jagged pixels when downscaling, except for some very limited special
+cases (e.g. bilinear downsampling to exactly 0.5x). Significantly speeds up
+downscaling with high downscaling ratios. Defaults to `no`.
+
+### `preserve_mixing_cache=<yes|no>`
+
+Normally, when the size of the target framebuffer changes, or the render
+parameters are updated, the internal cache of mixed frames must be discarded in
+order to re-render all required frames. Setting this option to `yes` will skip
+the cache invalidation and instead re-use the existing frames (with bilinear
+scaling to the new size if necessary). This comes at a hefty quality loss
+shortly after a resize, but should make it much smoother. Defaults to `no`.
+
+## Debugging, tuning and testing
+
+These may affect performance or may make debugging problems easier, but
+shouldn't have any effect on the quality (except where otherwise noted).
+
+### `skip_caching_single_frame=<yes|no>`
+
+Normally, single frames will also get pushed through the mixer cache, in order
+to speed up re-draws. Enabling this option disables that logic, causing single
+frames to bypass being written to the cache. Defaults to `no`.
+
+!!! note
+    If a frame is *already* cached, it will be re-used, regardless.
+
+### `disable_linear_scaling=<yes|no>`
+
+Disables linearization / sigmoidization before scaling. This might be useful
+when tracking down unexpected image artifacts or excessive ringing, but it
+shouldn't normally be necessary. Defaults to `no`.
+
+### `disable_builtin_scalers=<yes|no>`
+
+Forces the use of the slower, "general" scaling algorithms even when faster
+built-in replacements exist. Defaults to `no`.
+
+### `correct_subpixel_offsets=<yes|no>`
+
+Forces correction of subpixel offsets (using the configured `upscaler`).
+Defaults to `no`.
+
+!!! warning
+    Enabling this may cause such images to get noticeably blurrier, especially
+    when using a polar scaler. It's not generally recommended to enable this.
+
+### `force_dither=<yes|no>`
+
+Forces the use of dithering, even when rendering to 16-bit FBOs. This is
+generally pretty pointless because most 16-bit FBOs have high enough depth that
+rounding errors are below the human perception threshold, but this can be used
+to test the dither code. Defaults to `no`.
+
+### `disable_dither_gamma_correction=<yes|no>`
+
+Disables the gamma-correct dithering logic which normally applies when
+dithering to low bit depths. No real use, outside of testing. Defaults to `no`.
+
+### `disable_fbos=<yes|no>`
+
+Completely overrides the use of FBOs, as if there were no renderable texture
+format available. This disables most features. Defaults to `no`.
+
+### `force_low_bit_depth_fbos=<yes|no>`
+
+Use only low-bit-depth FBOs (8 bits). Note that this also implies disabling
+linear scaling and sigmoidization. Defaults to `no`.
+
+### `dynamic_constants=<yes|no>`
+
+If this is enabled, all shaders will be generated as "dynamic" shaders, with
+any compile-time constants being replaced by runtime-adjustable values. This is
+generally a performance loss, but has the advantage of being able to freely
+change parameters without triggering shader recompilations. It's a good idea to
+enable this if you will change these options very frequently, but it should be
+disabled once those values are "dialed in". Defaults to `no`.
diff --git a/docs/renderer.md b/docs/renderer.md
new file mode 100644
index 0000000..3104b0d
--- /dev/null
+++ b/docs/renderer.md
@@ -0,0 +1,302 @@
+# Rendering content: pl_frame, pl_renderer, and pl_queue
+
+This example roughly builds off the [previous entry](./basic-rendering.md),
+and as such will not cover the basics of how to create a window, initialize a
+`pl_gpu` and get pixels onto the screen.
+
+## Renderer
+
+The `pl_renderer` set of APIs represents the highest-level interface into
+libplacebo, and is what most users who simply want to display e.g. a video
+feed on-screen will want to be using.
+
+The basic initialization is straightforward, requiring no extra parameters:
+
+``` c linenums="1"
+pl_renderer renderer;
+
+init()
+{
+    renderer = pl_renderer_create(pllog, gpu);
+    if (!renderer)
+        goto error;
+
+    // ...
+}
+
+uninit()
+{
+    pl_renderer_destroy(&renderer);
+}
+```
+
+What makes the renderer powerful is the large number of `pl_render_params` it
+exposes. By default, libplacebo provides several presets to use:
+
+* **pl_render_fast_params**: Disables everything except for defaults. This is
+  the fastest possible configuration.
+* **pl_render_default_params**: Contains the recommended default parameters,
+  including some slightly higher quality scaling, as well as dithering.
+* **pl_render_high_quality_params**: A preset of reasonable defaults for a
+  higher-end machine (i.e. anything with a discrete GPU). This enables most
+  of the basic functionality, including upscaling, downscaling, debanding
+  and better HDR tone mapping.
+
+Covering all of the possible options exposed by `pl_render_params` is
+out-of-scope of this example and would be better served by looking at [the API
+documentation](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/renderer.h#L94).
+
+### Frames
+
+[`pl_frame`](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/renderer.h#L503)
+is the struct libplacebo uses to group textures and their metadata together
+into a coherent unit that can be rendered using the renderer. This is not
+currently a dynamically allocated or refcounted heap object; it is merely a
+struct that can live on the stack (or anywhere else). The actual data lives in
+corresponding `pl_tex` objects referenced in each of the frame's planes.
+
+``` c linenums="1"
+bool render_frame(const struct pl_frame *image,
+                  const struct pl_swapchain_frame *swframe)
+{
+    struct pl_frame target;
+    pl_frame_from_swapchain(&target, swframe);
+
+    return pl_render_image(renderer, image, &target,
+                           &pl_render_default_params);
+}
+```
+
+!!! note "Renderer state"
+    The `pl_renderer` is conceptually (almost) stateless. The only thing that
+    is needed to get a different result is to change the render params, which
+    can be varied freely on every call, if the user desires.
+
+    The one case where this is not entirely true is when using frame mixing
+    (see below), or when using HDR peak detection. In this case, the renderer
+    can be explicitly reset using `pl_renderer_flush_cache`.
+
+To upload frames, the easiest methods are made available as dedicated helpers
+in
+[`<libplacebo/utils/upload.h>`](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/utils/upload.h),
+and
+[`<libplacebo/utils/libav.h>`](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/utils/libav.h)
+(for AVFrames). In general, I recommend checking out the [demo
+programs](https://code.videolan.org/videolan/libplacebo/-/tree/master/demos)
+for a clearer illustration of how to use them in practice.
+
+### Shader cache
+
+The renderer internally generates, compiles and caches a potentially large
+number of shader programs, some of which can be complex. On some platforms
+(notably D3D11), these can be quite costly to recompile on every program
+launch.
+
+As such, the renderer offers a way to save/restore its internal shader cache
+from some external location (managed by the API user). The use of this API is
+highly recommended:
+
+``` c linenums="1" hl_lines="1-2 10-14 21-27"
+static uint8_t *load_saved_cache();
+static void store_saved_cache(uint8_t *cache, size_t bytes);
+
+void init()
+{
+    renderer = pl_renderer_create(pllog, gpu);
+    if (!renderer)
+        goto error;
+
+    uint8_t *cache = load_saved_cache();
+    if (cache) {
+        pl_renderer_load(renderer, cache);
+        free(cache);
+    }
+
+    // ...
+}
+
+void uninit()
+{
+    size_t cache_bytes = pl_renderer_save(renderer, NULL);
+    uint8_t *cache = malloc(cache_bytes);
+    if (cache) {
+        pl_renderer_save(renderer, cache);
+        store_saved_cache(cache, cache_bytes);
+        free(cache);
+    }
+
+    pl_renderer_destroy(&renderer);
+}
+```
+
+!!! warning "Cache safety"
+    libplacebo performs only minimal validity checking on the shader cache,
+    and in general, cannot possibly guard against malicious alteration of such
+    files. Loading a cache from an untrusted source represents a remote code
+    execution vector.
+
+## Frame mixing
+
+One of the renderer's most powerful features is its ability to compensate
+for differences in framerates between the source and display by using [frame
+mixing](https://github.com/mpv-player/mpv/wiki/Interpolation) to blend
+adjacent frames together.
+
+Using this API requires presenting the renderer, at each vsync, with a
+`pl_frame_mix` struct, describing the current state of the vsync. In
+principle, such structs can be constructed by hand. To do this, all of the
+relevant frames (nearby the vsync timestamp) must be collected, and their
+relative distances to the vsync determined, by normalizing all PTS values such
+that the vsync represents time `0.0` (and a distance of `1.0` represents the
+nominal duration between adjacent frames). Note that timing vsyncs, and
+determining the correct vsync duration, are both left as problems for the user
+to solve.[^timing] Here is an example of a valid struct:
+
+[^timing]: However, this may change in the future, as the recent introduction of
+    the Vulkan display timing extension may result in display timing feedback
+    being added to the `pl_swapchain` API. That said, as of writing, this has
+    not yet happened.
+
+``` c
+(struct pl_frame_mix) {
+    .num_frames = 6,
+    .frames = (const struct pl_frame *[]) {
+        /* frame 0 */
+        /* frame 1 */
+        /* ... */
+        /* frame 5 */
+    },
+    .signatures = (uint64_t[]) {
+        0x0, 0x1, 0x2, 0x3, 0x4, 0x5 // (1)
+    },
+    .timestamps = (float[]) {
+        -2.4, -1.4, -0.4, 0.6, 1.6, 2.6, // (2)
+    },
+    .vsync_duration = 0.4, // 24 fps video on 60 fps display
+}
+```
+
+1. These must be unique per frame, but always refer to the same frame. For
+   example, this could be based on the frame's PTS, the frame's numerical ID
+   (in order of decoding), or some sort of hash. The details don't matter,
+   only that this uniquely identifies specific frames.
+
+2. Typically, for CFR sources, frame timestamps will always be separated in
+   this list by a distance of 1.0. In this example, the vsync falls roughly
+   halfway (but not quite) in between two adjacent frames (with IDs 0x2 and
+   0x3).
+
+!!! note "Frame mixing radius"
+    In this example, the frame mixing radius (as determined by
+    `pl_frame_mix_radius`) is `3.0`, so we include all frames that fall within
+    the timestamp interval of `[-3, 3)`. In general, you should consult this
+    function to determine what frames need to be included in the
+    `pl_frame_mix` - though including more frames than needed is not an error.
+
+### Frame queue
+
+Because this API is rather unwieldy and clumsy to use directly, libplacebo
+provides a helper abstraction known as `pl_queue` to assist in transforming
+some arbitrary source of frames (such as a video decoder) into nicely packed
+`pl_frame_mix` structs ready for consumption by the `pl_renderer`:
+
+``` c linenums="1"
+#include <libplacebo/utils/frame_queue.h>
+
+pl_queue queue;
+
+void init()
+{
+    queue = pl_queue_create(gpu);
+}
+
+void uninit()
+{
+    pl_queue_destroy(&queue);
+    // ...
+}
+```
+
+This queue can be interacted with through a number of mechanisms: either
+pushing frames (blocking or non-blocking), or by having the queue poll frames
+(via blocking or non-blocking callback) as needed. For a full overview of the
+various methods of pushing and polling frames, check the [API
+documentation](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/utils/frame_queue.h#L115).
+
+In this example, I will assume that we have a separate decoder thread pushing
+frames into the `pl_queue` in a blocking manner:
+
+``` c linenums="1"
+static void decoder_thread(void)
+{
+    void *frame;
+
+    while ((frame = /* decode new frame */)) {
+        pl_queue_push_block(queue, UINT64_MAX, &(struct pl_source_frame) {
+            .pts = /* frame pts */,
+            .duration = /* frame duration */,
+            .map = /* map callback */,
+            .unmap = /* unmap callback */,
+            .frame_data = frame,
+        });
+    }
+
+    pl_queue_push(queue, NULL); // signal EOF
+}
+```
+
+Now, in our render loop, we want to call `pl_queue_update` with appropriate
+values to retrieve the correct frame mix for each vsync:
+
+``` c linenums="1" hl_lines="3-10 12-21 27"
+bool render_frame(const struct pl_swapchain_frame *swframe)
+{
+    struct pl_frame_mix mix;
+    enum pl_queue_status res;
+    res = pl_queue_update(queue, &mix, pl_queue_params(
+        .pts = /* time of next vsync */,
+        .radius = pl_frame_mix_radius(&render_params),
+        .vsync_duration = /* if known */,
+        .timeout = UINT64_MAX, // (2)
+    ));
+
+    switch (res) {
+    case PL_QUEUE_OK:
+        break;
+    case PL_QUEUE_EOF:
+        /* no more frames */
+        return false;
+    case PL_QUEUE_ERR:
+        goto error;
+    // (1)
+    }
+
+
+    struct pl_frame target;
+    pl_frame_from_swapchain(&target, swframe);
+
+    return pl_render_image_mix(renderer, &mix, &target,
+                               &pl_render_default_params);
+}
+```
+
+1. There is a fourth status, `PL_QUEUE_MORE`, which is returned only if the
+   resulting frame mix is incomplete (and the timeout was reached) -
+   basically this can only happen if the queue runs dry due to frames not
+   being supplied fast enough.
+
+   In this example, since we are setting `timeout` to `UINT64_MAX`, we will
+   never get this return value.
+
+2. Setting this makes `pl_queue_update` block indefinitely until sufficiently
+   many frames have been pushed into the `pl_queue` from our separate
+   decoding thread.
+
+### Deinterlacing
+
+The frame queue also vastly simplifies the process of performing
+motion-adaptive temporal deinterlacing, by automatically linking together
+adjacent fields/frames. To take advantage of this, all you need to do is set
+the appropriate field (`pl_source_frame.first_field`), as well as enabling
+[deinterlacing
+parameters](https://code.videolan.org/videolan/libplacebo/-/blob/master/src/include/libplacebo/renderer.h#L186).
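+
+As a rough sketch (not verbatim API documentation), this amounts to tagging
+each pushed frame with its field parity and pointing the render params at a
+set of deinterlacing parameters. The names `PL_FIELD_TOP` and
+`pl_deinterlace_default_params` below are assumed to come from
+`<libplacebo/shaders/deinterlacing.h>`; double-check them against the headers
+of your libplacebo version:
+
+``` c
+#include <libplacebo/shaders/deinterlacing.h>
+
+// In the decoder thread: mark which field comes first for this frame
+pl_queue_push_block(queue, UINT64_MAX, &(struct pl_source_frame) {
+    .pts         = /* frame pts */,
+    .duration    = /* frame duration */,
+    .first_field = PL_FIELD_TOP, // or PL_FIELD_BOTTOM, per source metadata
+    .map         = /* map callback */,
+    .unmap       = /* unmap callback */,
+    .frame_data  = frame,
+});
+
+// In the render loop: enable the deinterlacing pass
+render_params.deinterlace_params = &pl_deinterlace_default_params;
+```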
diff --git a/docs/style.css b/docs/style.css new file mode 100644 index 0000000..81ed8a8 --- /dev/null +++ b/docs/style.css @@ -0,0 +1,3 @@ +.md-typeset p { + margin: 1em 1em; +} diff --git a/gcovr.cfg b/gcovr.cfg new file mode 100644 index 0000000..ac1fb7a --- /dev/null +++ b/gcovr.cfg @@ -0,0 +1,4 @@ +exclude = .*/tests/.* +exclude = .*/demos/.* +exclude = .*_gen\.c$ +sort-uncovered = yes diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..5101043 --- /dev/null +++ b/meson.build @@ -0,0 +1,475 @@ +project('libplacebo', ['c', 'cpp'], + license: 'LGPL2.1+', + default_options: [ + 'buildtype=debugoptimized', + 'warning_level=2', + 'c_std=c11', + 'cpp_std=c++20', + ], + meson_version: '>=0.63', + version: '@0@.@1@.@2@'.format( + # Major version + 6, + # API version + { + '338': 'split pl_filter_nearest into pl_filter_nearest and pl_filter_box', + '337': 'fix PL_FILTER_DOWNSCALING constant', + '336': 'deprecate pl_filter.radius_cutoff in favor of pl_filter.radius', + '335': 'remove {pl_render_params,pl_sample_filter_params}.{lut_entries,polar_cutoff}', + '334': 'add pl_tex_transfer_params.no_import', + '333': 'add pl_shader_sample_{hermite,gaussian}', + '332': 'add pl_filter_function_hermite and pl_filter_hermite', + '331': 'add pl_filter_function_cubic and remove bcspline family of filter functions', + '330': 'add pl_frames_infer(_mix)', + '329': 'add pl_frame_mix_current and pl_frame_mix_nearest', + '328': 'remove pl_render_params.ignore_icc_profiles', + '327': 'remove pl_render_params.icc_params', + '326': 'add pl_frame.icc', + '325': 'add pl_icc_update', + '324': 'add pl_render_params.correct_subpixel_offsets', + '323': 'deprecate pl_{dispatch,renderer}_{save,load}', + '322': 'remove pl_pass_params.cached_program(_len)', + '321': 'deprecate pl_icc_params.cache_{save,load,priv}', + '320': 'add pl_gpu_set_cache', + '319': 'add <libplacebo/cache.h>', + '318': 'add pl_filter_ewa_lanczossharp and pl_filter_ewa_lanczos4sharpest', + '317': 'add pl_filter_config.antiring', + '316': 'remove pl_filter_config.filter_scale', + '315': 'remove pl_tone_map_auto', + '314': 'add pl_renderer_get_hdr_metadata', + '313': 'remove pl_peak_detect_params.minimum_peak', + '312': 'add pl_gamut_map_constants.perceptual_strength', + '311': 'add pl_tone_map_constants, pl_tone_map_params.constants and pl_color_map_params.tone_constants', + '310': 'add pl_gamut_map_constants, pl_gamut_map_params.constants and pl_color_map_params.gamut_constants', + '309': 'add <libplacebo/options.h>', + '308': 'add pl_hook_par.names', + '307': 'add pl_filter.radius_zero', + '306': 'add pl_filter_functions and pl_filter_configs', + '305': 'add pl_filter_function.opaque and move pl_filter_oversample to filters.h', + '304': 'add pl_filter_config.allowed/recommended', + '303': 'refactor pl_filter_config and pl_filter_function', + '302': 'change type of pl_icc_params.size_r/g/b to int', + '301': 'add PL_COLOR_ADJUSTMENT_NEUTRAL and pl_color_adjustment()', + '300': 'add pl_color_map_params.gamut_expansion', + '299': 'add pl_primaries_compatible and pl_primaries_clip', + '298': 'add pl_gamut_map_softclip', + '297': 'add pl_tone_map_linear_light', + '296': 'add pl_queue_estimate_fps/vps, pl_queue_num_frames and pl_queue_peek', + '295': 'change pl_source_frame.pts and pl_queue_params.pts to double', + '294': 'add pl_vulkan_swapchain_params.disable_10bit_sdr', + '293': 'add pl_test_pixfmt_caps', + '292': 'add pl_peak_detect_high_quality_params and pl_color_map_high_quality_params', + '291': 'add PL_COLOR_HDR_BLACK, 
PL_COLOR_SDR_CONTRAST and PL_COLOR_HLG_PEAK', + '290': 'remove pl_color_map_params.hybrid_mix', + '289': 'remove pl_gamut_map_params.chroma_margin', + '288': 'add pl_color_map_params.lut3d_tricubic', + '287': 'add pl_transform2x2_bounds', + '286': 'add PL_RENDER_ERR_CONTRAST_RECOVERY', + '285': 'add pl_color_map_params.contrast_recovery/smoothness, ' + + 'pl_color_map_args.feature_map and pl_shader_extract_features', + '284': 'add pl_color_map_args and pl_shader_color_map_ex', + '283': 'add pl_render_params.distort_params', + '282': 'add PL_HOOK_PRE_OUTPUT', + '281': 'add pl_matrix2x2_rotation', + '280': 'add pl_distortion_params and pl_shader_distort', + '279': 'add pl_matrix2x2_scale/invert and pl_transform2x2_scale/invert', + '278': 'switch pl_vulkan.(un)lock_queue to uint32_t', + '277': 'add pl_render_params.corner_rounding', + '276': 'add pl_get_mapped_avframe', + '275': 'add pl_vulkan_params.extra_queues', + '274': 'drop minimum vulkan version to 1.2', + '273': 'add pl_vulkan_required_features and refactor pl_vulkan_recommended_features', + '272': 'require vulkan version 1.3 minimum', + '271': 'deprecate pl_vulkan.queues', + '270': 'add pl_color_map_params.visualize_hue/theta', + '269': 'refactor pl_color_map_params gamut mapping settings', + '268': 'add <libplacebo/gamut_mapping.h>', + '267': 'add pl_ipt_lms2rgb/rgb2lms and pl_ipt_lms2ipt/ipt2lms', + '266': 'add pl_shader_info and change type of pl_dispatch_info.shader', + '265': 'remove fields deprecated for libplacebo v4', + '264': 'add pl_color_map_params.show_clipping', + '263': 'add pl_peak_detect_params.percentile', + '262': 'add pl_color_map_params.visualize_rect', + '261': 'add pl_color_map_params.metadata', + '260': 'add pl_tone_map_params.input_avg', + '259': 'add pl_color_space_nominal_luma_ex', + '258': 'add pl_hdr_metadata_type and pl_hdr_metadata_contains', + '257': 'add pl_hdr_metadata.max_pq_y and avg_pq_y', + '256': 'deprecate pl_peak_detect_params.overshoot_margin', + '255': 'deprecate pl_get_detected_peak and add pl_get_detected_hdr_metadata', + '254': 'deprecate pl_renderer_params.allow_delayed_peak_detect and add pl_peak_detect_params.allow_delayed', + '253': 'remove pl_color_space.nominal_min/max and add pl_color_space_nominal_peak', + '252': 'remove pl_swapchain.impl', + '251': 'add `utils/dolbyvision.h` and `pl_hdr_metadata_from_dovi_rpu`', + '250': 'add pl_frame_map_avdovi_metadata', + '249': 'add `pl_render_error`, `pl_render_errors` and `pl_renderer_get_errors`', + '248': 'add pl_hook.signature', + '247': 'add pl_color_map_params.visualize_lut', + '246': 'add `pl_tone_map_st2094_10` and `pl_tone_map_st2094_40`', + '245': 'add `pl_tone_map_params.hdr`', + '244': 'add `pl_map_hdr_metadata`', + '243': 'add `pl_color_space.nominal_min/max`', + '242': 'add `pl_hdr_metadata.scene_max/avg` and `pl_hdr_metadata.ootf`', + '241': 'add `pl_plane_data.swapped`', + '240': 'add `PL_COLOR_TRC_ST428`', + '239': 'add `pl_fmt.planes` and `pl_tex.planes`', + '238': 'add `pl_vulkan_wrap_params.aspect`', + '237': 'add `pl_vulkan_hold_ex` and `pl_vulkan_release_ex`', + '236': 'add `pl_vulkan_sem_create` and `pl_vulkan_sem_destroy`', + '235': 'add `pl_vulkan.get_proc_addr`', + '234': 'add `pl_gpu_limits.host_cached`', + '233': 'add `pl_hook.parameters`, `struct pl_hook_par`', + '232': 'add `pl_plane_data_from_comps`', + '231': 'add `pl_tone_map_params_infer`', + '230': 'add PL_COLOR_PRIM_ACES_AP0 and PL_COLOR_PRIM_ACES_AP1', + '229': 'add pl_shader_sample_ortho2, deprecate pl_shader_sample_ortho', + '228': 'add 
pl_icc_params.force_bpc', + '227': 'refactor `pl_render_info.index` and add `pl_render_info.count`', + '226': 'add `pl_dither_params.transfer` and `pl_render_params.disable_dither_gamma_correction`', + '225': 'add `pl_render_params.error_diffusion`', + '224': 'add `pl_shader_error_diffusion` and related functions', + '223': 'add <libplacebo/shaders/dithering.h>', + '222': 'add `pl_icc_params.cache_save/load`', + '221': 'add `pl_source_frame.first_field`', + '220': 'add deinterlacing-related fields to `pl_frame` and `pl_render_params`', + '219': 'add pl_source_frame.duration, deprecating pl_queue_params.frame_duration', + '218': 'add <libplacbeo/shaders/deinterlacing.h> and pl_shader_deinterlace', + '217': 'add pl_color_space_infer_map', + '216': 'add pl_deband_params.grain_neutral', + '215': 'add pl_opengl_params.get_proc_addr_ex', + '214': 'drop deprecated legacy C struct names', + '213': 'add pl_opengl_params.get_proc_addr', + '212': 'add pl_opengl.major/minor version numbers', + '211': 'add pl_opengl.extensions and pl_opengl_has_ext', + '210': 'add PL_HANDLE_MTL_TEX, PL_HANDLE_IOSURFACE, and pl_shared_mem.plane', + '209': 'add pl_gpu_limits.array_size_constants', + '208': 'add pl_filter_function.name and pl_filter_config.name', + '207': 'add pl_render_params.plane_upscaler and plane_downscaler', + '206': 'add new ICC profile API (pl_icc_open, ...)', + '205': 'add pl_cie_from_XYZ and pl_raw_primaries_similar, fix pl_cie_xy_equal', + '204': 'add pl_d3d11_swapchain_params.disable_10bit_sdr', + '203': 'add pl_film_grain_from_av', + '202': 'add pl_frame.acquire/release', + '201': 'add pl_vulkan.(un)lock_queue', + '200': 'always set pl_vulkan.queue_*', + '199': 'add pl_plane.flipped', + '198': 'remove PL_HOOK_PRE_OVERLAY', + '197': 'add pl_overlay.coords, change type of pl_overlay_part.dst', + '196': 'add pl_render_params.force_low_bit_depth_fbos', + '195': 'change pl_log_create prototype to pl_log_create_${api_ver} to do linking time api check', + '194': 'add pl_primaries_valid', + '193': 'add pl_hook_params.orig_repr/color', + '192': 'add pl_map_avframe_ex', + '191': 'add pl_map_dovi_metadata', + '190': 'add pl_color_map_params.gamut_mode, replacing gamut_clipping/warning', + '189': 'refactor pl_color_space, merging it with pl_hdr_metadata', + '188': 'refactor pl_color_map_params tone mapping settings', + '187': 'add <libplacebo/tone_mapping.h>', + '186': 'add pl_d3d11_swapchain_params.flags', + '185': 'add PL_COLOR_SYSTEM_DOLBYVISION and reshaping', + '184': 'add pl_map_avframe/pl_unmap_avframe, deprecate pl_upload_avframe', + '183': 'relax pl_shared_mem.size > 0 requirement', + '182': 'add pl_vulkan_get, pl_opengl_get, pl_d3d11_get', + '181': 'add pl_shader_set_alpha, change alpha handling of pl_shader_decode_color', + '180': 'add pl_gpu_limits.max_variable_comps', + '179': 'add pl_render_params.skip_caching_single_frame', + '178': 'add pl_gpu_limits.align_vertex_stride', + '177': 'add debug_tag to pl_tex/buf_params', + '176': 'revert vulkan 1.2 requirement', + '175': 'require timeline semaphores for all vulkan devices', + '174': 'deprecate pl_vulkan_params.disable_events', + '173': 'remove VkAccessFlags from pl_vulkan_hold/release', + '172': 'replace VkSemaphore by pl_vulkan_sem in pl_vulkan_hold/release', + '171': 'make vulkan 1.2 the minimum version', + '170': 'allow pl_queue_update on NULL', + '169': 'refactor pl_pass_params.target_dummy into target_format', + '168': 'refactor pl_tex_transfer.stride_w/h into row/depth_pitch', + '167': 'expose pl_dispatch_reset_frame', + '166': 'add 
pl_index_format', + '165': 'add pl_fmt.signature', + '164': 'support blending against tiles', + '163': 'add pl_frame_copy_stream_props', + '162': 'support rotation in pl_renderer', + '161': 'make H.274 film grain values indirect', + '160': 'add preprocessor macros for default params', + '159': 'remove fields deprecated for libplacebo v3', + '158': 'add support for H.274 film grain', + '157': 'add pl_peak_detect_params.minimum_peak', + '156': 'add pl_swapchain_colors_from_avframe/dav1dpicture', + '155': 'refactor pl_swapchain_hdr_metadata into pl_swapchain_colorspace_hint', + '154': 'add <libplacebo/d3d11.h>', + '153': 'add pl_render_info callbacks', + '152': 'add pl_dispatch_info callbacks', + '151': 'pl_shader_res.description/steps', + '150': 'add PL_FMT_CAP_READWRITE', + '149': 'add pl_gpu_limits.buf_transfer', + '148': 'refactor pl_gpu_caps', + '147': 'add pl_color_space.sig_floor and black point adaptation', + '146': 'add PL_COLOR_TRC_GAMMA20, GAMMA24 and GAMMA26', + '145': 'add pl_render_params/pl_shader_params.dynamic_constants', + '144': 'add support for pl_constant specialization constants', + '143': 'add pl_color_space_infer_ref', + '142': 'add pl_render_params.background_transparency and pl_frame_clear_rgba', + '141': 'add pl_filter_oversample', + '140': 'add pl_shader_sample_oversample', + '139': 'make vulkan 1.1 the minimum vulkan version', + '138': 're-add and properly deprecate pl_filter_haasnsoft', + '137': 'change behavior of pl_image_mix.num_frames == 1', + '136': 'add pl_fmt.gatherable', + '135': 'add pl_queue_params.interpolation_threshold', + '134': 'add pl_render_params.ignore_icc_profiles', + '133': 'remove pl_shader_signature', + '132': 'add pl_tex_clear_ex', + '131': 'remove PL_PRIM_TRIANGLE_FAN', + '130': 'provide typedefs for object types, e.g. 
const struct pl_tex * -> pl_tex', + '129': 'rename pl_context to pl_log, move <libplacebo/context.h> to <libplacebo/log.h>', + '128': 'add pl_opengl_params.make/release_current, for thread safety', + '127': 'add pl_get_buffer2', + '126': 'add pl_render_params.background_color', + '125': 'allow calling pl_render_image on NULL', + '124': 'make pl_queue_update return valid data even on PL_QUEUE_MORE', + '123': 'refactor pl_overlay from pl_plane into pl_overlay_part', + '122': 'make pl_gpu thread safe', + '121': 'add pl_queue_push_block and refactor frame queue threading', + '120': 'refactor pl_named_filter_config into pl_filter_preset', + '119': 'add pl_color_adjustment.temperature', + '118': 'add <libplacebo/utils/frame_queue.h>', + '117': 'rename pl_filter_triangle/box to pl_filter_bilinear/nearest', + '116': 'add pl_frame_recreate_from_avframe and pl_download_avframe', + '115': 'add pl_dispatch_vertex', + '114': 'add pl_pass_run_params.index_data', + '113': 'add <libplacebo/shaders/lut.h>', + '112': 'add <libplacebo/shaders/icc.h>, replacing existing 3dlut API', + '111': 'add pl_fmt.modifiers for proper DRM format modifier support', + '110': 'refactor pl_upload_dav1dpicture', + '109': 'add support for host pointer imports on OpenGL', + '108': 'add <libplacebo/utils/dav1d.h>', + '107': 'add pl_render_image_mix', + '106': 'add pl_shared_mem.stride_w/h', + '105': 'add asynchronous texture transfers', + '104': 'add pl_render_params.blend_params', + '103': 'move pl_tex_sample_mode from pl_tex_params to pl_desc_binding', + '102': 'add pl_tex_poll', + '101': 'merge pl_image and pl_render_target into pl_frame', + '100': 'add pl_render_target.planes', + '99': 'add pl_sample_src.component_mask', + '98': 'add pl_vulkan_params.disable_overmapping', + '97': 'add pl_av1_grain_params.luma_comp', + '96': 'add <libplacebo/utils/libav.h>', + '95': 'add PL_COLOR_PRIM_EBU3213 and FILM_C', + '94': 'add support for //!BUFFER to user shaders', + '93': 'add pl_plane_data_align', + '92': 'add more pl_var helper functions', + '91': 'implement PL_HANDLE_DMA_BUF for EGL', + '90': 'add pl_opengl_params.allow_software', + '89': 'use uniform arrays instead of shader literals for LUTs', + '88': 'add pl_shared_mem.drm_format_mod', + '87': 'refactor pl_opengl_wrap', + '86': 'add pl_pass_run_params.vertex_buf', + '85': 'add PL_HANDLE_HOST_PTR', + '84': 'add pl_buf_params.import/export_handle', + '83': 'add pl_shader_custom', + '82': 'add pl_gpu_is_failed', + '81': 'add PL_GPU_CAP_SUBGROUPS', + '80': 'add pl_color_map_params.gamut_clipping', + '79': 'add pl_get_detected_peak', + '78': 'add pl_buf_copy', + '77': 'make all pl_buf_* commands implicitly synchronized', + '76': 'add pl_vulkan_swapchain_params.prefer_hdr', + '75': 'add pl_dispatch_save/load', + '74': 'remove pl_image.signature', + '73': 'add pl_memory_qualifiers', + '72': 'generalize PL_SHADER_SIG_SAMPLER2D into PL_SHADER_SIG_SAMPLER', + '71': 'add pl_opengl_wrap/unwrap', + '70': 'add pl_tex_sampler_type', + '69': 'add pl_peak_detect_params.overshoot_margin', + '68': 'add PL_TONE_MAPPING_BT_2390', + '67': 'add pl_image_set_chroma_location', + '66': 'change pl_render_target.dst_rect from pl_rect2d to pl_rect2df', + '65': 'add PL_SHADER_SIG_SAMPLER2D', + '64': 'add pl_rect2df_aspect_* family of functions', + '63': 'refactor pl_shader_av1_grain', + '62': 'refactor PL_COLOR_REF_WHITE into PL_COLOR_SDR_WHITE and PL_COLOR_SDR_WHITE_HLG', + '61': 'refactor pl_dispatch_finish etc. 
to support timers', + '60': 'add pl_timer', + '59': 'add pl_render_high_quality_params', + '58': 'add <libplacebo/shaders/custom.h> and pl_hook', + '57': 'add width/height fields to pl_dispatch_compute', + '56': 'make pl_vulkan.features etc. extensible', + '55': 'add pl_vulkan_params.features', + '54': 'add pl_vulkan_import', + '53': 'refactor pl_vulkan_wrap', + '52': 'revert addition of pl_filter_nearest', + '51': 'add pl_vulkan_hold_raw', + '50': 'add pl_vulkan_params.device_uuid', + '49': 'add pl_filter_nearest', + '48': 'deprecate pl_image.width/height', + '47': 'add more matrix math helpers to common.h', + '46': 'add pl_vk_inst_params.debug_extra', + '45': 'add pl_vulkan.api_version', + '44': 'add pl_swapchain_hdr_metadata', + '43': 'add pl_vulkan/opengl_params.max_glsl_version', + '42': 'add pl_vk_inst_params.layers/opt_layers', + '41': 'add PL_FMT_CAP_HOST_READABLE', + '40': 'add PL_GPU_CAP_BLITTABLE_1D_3D', + '39': 'add pl_render_params.disable_fbos', + '38': 'add pl_render_params.force_dither', + '37': 'add pl_color_levels_guess', + '36': 'remove pl_opengl.priv leftover', + '35': 'fix pl_vulkan_swapchain_suboptimal signature', + '34': 'add <libplacebo/opengl.h>', + '33': 'add pl_image.av1_grain', + '32': 'refactor pl_grain_params', + '31': 'add pl_vulkan_params.get_proc_addr', + '30': 'add pl_gpu.pci', + '29': 'add pl_vulkan_swapchain_params.allow_suboptimal', + '28': 'eliminate void *priv fields from all object types', + '27': 'add pl_vulkan_choose_device', + '26': 'add PL_GPU_CAP_MAPPED_BUFFERS', + '25': 'add pl_fmt.internal_size', + '24': 'add pl_vulkan_params.disable_events', + '23': 'add error checking to functions in <libplacebo/gpu.h>', + '22': 'add pl_vulkan_params.blacklist_caps', + '21': 'add pl_shader_params.glsl', + '20': 'refactor pl_shader_alloc', + '19': 'default to GLSL 130 instead of 110 if unspecified', + '18': 'add pl_swapchain_resize', + '17': 'add pl_context_update', + '16': 'add pl_tex/buf_params.user_data', + '15': 'add <libplacebo/dummy.h>', + '14': 'remove ident from pl_shader_reset', + '13': 'add pl_render_params.peak_detect_params', + '12': 'add pl_shader_detect_peak', + '11': 'add pl_var_int', + '10': 'refactor pl_color_map_params desaturation fields', + '9': 'add pl_tex_params.import/export_handle', + '8': 'add pl_color_space.sig_scale', + '7': 'initial major release', + '6': '', + '5': '', + '4': '', + '3': '', + '2': '', + '1': '', + }.keys().length(), + # Fix version + 2) +) + +### Version number and configuration +version = meson.project_version() +version_pretty = 'v' + version +version_split = version.split('.') + +majorver = version_split[0] +apiver = version_split[1] +fixver = version_split[2] + +# Configuration data +conf_public = configuration_data() +conf_internal = configuration_data() +conf_public.set('majorver', majorver) +conf_public.set('apiver', apiver) +conf_internal.set('BUILD_API_VER', apiver) +conf_internal.set('BUILD_FIX_VER', fixver) +conf_internal.set('PL_DEBUG_ABORT', get_option('debug-abort')) + + +### Global build options +build_opts = [ + # Warnings + '-Wundef', '-Wshadow', '-Wparentheses', '-Wpointer-arith', + '-fno-math-errno', +] + +link_args = [] + +cc = meson.get_compiler('c') +cxx = meson.get_compiler('cpp') + +c_opts = [ + '-D_ISOC99_SOURCE', '-D_ISOC11_SOURCE', '-D_GNU_SOURCE', '-U__STRICT_ANSI__', + '-Wmissing-prototypes', + + # Warnings to ignore + '-Wno-sign-compare', '-Wno-unused-parameter', + '-Wno-missing-field-initializers', '-Wno-type-limits', + + # Warnings to treat as errors + 
'-Werror=implicit-function-declaration', +] + +if cc.has_argument('-Wincompatible-pointer-types') + c_opts += ['-Werror=incompatible-pointer-types'] +endif + +# clang's version of -Wmissing-braces rejects the common {0} initializers +if cc.get_id() == 'clang' + c_opts += ['-Wno-missing-braces'] +endif + +# For sanitizers to work/link properly some public symbols have to be available. +if get_option('b_sanitize') == 'none' + # don't leak library symbols if possible + vflag = '-Wl,--exclude-libs=ALL' + # link and lld-link don't support this arg, but it only shows warning about + # unsupported argument. Meson doesn't detect it, so manually exclude them. + if cc.has_link_argument(vflag) and not ['lld-link', 'link'].contains(cc.get_linker_id()) + link_args += [vflag] + endif +endif + +# OS specific build options +if host_machine.system() == 'windows' + build_opts += ['-D_WIN32_WINNT=0x0601', + '-D_USE_MATH_DEFINES', + '-DWIN32_LEAN_AND_MEAN', + '-DNOMINMAX', + '-D_CRT_SECURE_NO_WARNINGS'] + subdir('win32') +endif + +add_project_arguments(build_opts + c_opts, language: ['c']) +add_project_arguments(build_opts, language: ['c', 'cpp']) +add_project_link_arguments(link_args, language: ['c', 'cpp']) + + +# Global dependencies +fs = import('fs') +libm = cc.find_library('m', required: false) +thirdparty = meson.project_source_root()/'3rdparty' +python = import('python').find_installation() +python_env = environment() +python_env.append('PYTHONPATH', thirdparty/'jinja/src') +python_env.append('PYTHONPATH', thirdparty/'markupsafe/src') +python_env.append('PYTHONPATH', thirdparty/'glad') + +if host_machine.system() == 'windows' + threads = declare_dependency() +else + pthreads = dependency('threads') + has_setclock = cc.has_header_symbol( + 'pthread.h', + 'pthread_condattr_setclock', + dependencies: pthreads, + args: c_opts, + ) + + threads = declare_dependency( + dependencies: pthreads, + compile_args: [pthreads.found() ? '-DPL_HAVE_PTHREAD' : '', + has_setclock ? 
'-DPTHREAD_HAS_SETCLOCK' : '',] + ) +endif + +build_deps = [ libm, threads ] + +subdir('tools') +subdir('src') + +if get_option('demos') + subdir('demos') +endif + +# Allows projects to build libplacebo by cloning into ./subprojects/libplacebo +meson.override_dependency('libplacebo', libplacebo) diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 0000000..5e582fe --- /dev/null +++ b/meson_options.txt @@ -0,0 +1,55 @@ +# Optional components +option('vulkan', type: 'feature', value: 'auto', + description: 'Vulkan-based renderer') + +option('vk-proc-addr', type: 'feature', value: 'auto', + description: 'Link directly against vkGetInstanceProcAddr from libvulkan.so') + +option('vulkan-registry', type: 'string', value: '', + description: 'Path to vulkan XML registry (for code generation)') + +option('opengl', type: 'feature', value: 'auto', + description: 'OpenGL-based renderer') + +option('gl-proc-addr', type: 'feature', value: 'auto', + description: 'Enable built-in OpenGL loader (uses dlopen, dlsym...)') + +option('d3d11', type: 'feature', value: 'auto', + description: 'Direct3D 11 based renderer') + +option('glslang', type: 'feature', value: 'auto', + description: 'glslang SPIR-V compiler') + +option('shaderc', type: 'feature', value: 'auto', + description: 'libshaderc SPIR-V compiler') + +option('lcms', type: 'feature', value: 'auto', + description: 'LittleCMS 2 support') + +option('dovi', type: 'feature', value: 'auto', + description: 'Dolby Vision reshaping support') + +option('libdovi', type: 'feature', value: 'auto', + description: 'libdovi support') + +# Miscellaneous +option('demos', type: 'boolean', value: true, + description: 'Enable building (and installing) the demo programs') + +option('tests', type: 'boolean', value: false, + description: 'Enable building the test cases') + +option('bench', type: 'boolean', value: false, + description: 'Enable building benchmarks (`meson test benchmark`)') + +option('fuzz', type: 'boolean', value: false, + description: 'Enable building fuzzer binaries (`CC=afl-cc`)') + +option('unwind', type: 'feature', value: 'auto', + description: 'Enable linking against libunwind for printing stack traces caused by runtime errors') + +option('xxhash', type: 'feature', value: 'auto', + description: 'Use libxxhash as a faster replacement for internal siphash') + +option('debug-abort', type: 'boolean', value: false, + description: 'abort() on most runtime errors (only for debugging purposes)') diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..dfe29ed --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,49 @@ +site_name: libplacebo +site_url: https://libplacebo.org/ +repo_url: https://code.videolan.org/videolan/libplacebo +repo_name: videolan/libplacebo +copyright: Copyright © 2017-2022 Niklas Haas + +theme: + name: material + palette: + - scheme: slate + primary: deep purple + accent: deep purple + toggle: + icon: material/brightness-4 + name: Switch to light mode + - scheme: default + primary: purple + accent: purple + toggle: + icon: material/brightness-7 + name: Switch to dark mode + icon: + repo: fontawesome/brands/gitlab + features: + - content.code.annotate + +extra_css: + - style.css + +markdown_extensions: + - admonition + - footnotes + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.details + - pymdownx.snippets + - pymdownx.superfences + - toc: + toc_depth: 3 + +nav: + - 'Using': + - index.md + - basic-rendering.md + - renderer.md + - custom-shaders.md + - options.md + - 'Developing': + - glsl.md diff 
--git a/src/cache.c b/src/cache.c new file mode 100644 index 0000000..4f8ed4e --- /dev/null +++ b/src/cache.c @@ -0,0 +1,447 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <locale.h> +#include <limits.h> + +#include "common.h" +#include "cache.h" +#include "log.h" +#include "pl_thread.h" + +const struct pl_cache_params pl_cache_default_params = {0}; + +struct priv { + pl_log log; + pl_mutex lock; + PL_ARRAY(pl_cache_obj) objects; + size_t total_size; +}; + +int pl_cache_objects(pl_cache cache) +{ + if (!cache) + return 0; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + int num = p->objects.num; + pl_mutex_unlock(&p->lock); + return num; +} + +size_t pl_cache_size(pl_cache cache) +{ + if (!cache) + return 0; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + size_t size = p->total_size; + pl_mutex_unlock(&p->lock); + return size; +} + +pl_cache pl_cache_create(const struct pl_cache_params *params) +{ + struct pl_cache_t *cache = pl_zalloc_obj(NULL, cache, struct priv); + struct priv *p = PL_PRIV(cache); + pl_mutex_init(&p->lock); + if (params) { + cache->params = *params; + p->log = params->log; + } + + // Sanitize size limits + size_t total_size = PL_DEF(cache->params.max_total_size, SIZE_MAX); + size_t object_size = PL_DEF(cache->params.max_object_size, SIZE_MAX); + object_size = PL_MIN(total_size, object_size); + cache->params.max_total_size = total_size; + cache->params.max_object_size = object_size; + + return cache; +} + +static void remove_obj(pl_cache cache, pl_cache_obj obj) +{ + struct priv *p = PL_PRIV(cache); + + p->total_size -= obj.size; + if (obj.free) + obj.free(obj.data); +} + +void pl_cache_destroy(pl_cache *pcache) +{ + pl_cache cache = *pcache; + if (!cache) + return; + + struct priv *p = PL_PRIV(cache); + for (int i = 0; i < p->objects.num; i++) + remove_obj(cache, p->objects.elem[i]); + + pl_assert(p->total_size == 0); + pl_mutex_destroy(&p->lock); + pl_free((void *) cache); + *pcache = NULL; +} + +void pl_cache_reset(pl_cache cache) +{ + if (!cache) + return; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + for (int i = 0; i < p->objects.num; i++) + remove_obj(cache, p->objects.elem[i]); + p->objects.num = 0; + pl_assert(p->total_size == 0); + pl_mutex_unlock(&p->lock); +} + +static bool try_set(pl_cache cache, pl_cache_obj obj) +{ + struct priv *p = PL_PRIV(cache); + + // Remove any existing entry with this key + for (int i = p->objects.num - 1; i >= 0; i--) { + pl_cache_obj prev = p->objects.elem[i]; + if (prev.key == obj.key) { + PL_TRACE(p, "Removing out-of-date object 0x%"PRIx64, prev.key); + remove_obj(cache, prev); + PL_ARRAY_REMOVE_AT(p->objects, i); + break; + } + } + + if (!obj.size) { + PL_TRACE(p, "Deleted object 0x%"PRIx64, obj.key); + return true; + } + + if (obj.size > cache->params.max_object_size) { + 
PL_DEBUG(p, "Object 0x%"PRIx64" (size %zu) exceeds max size %zu, discarding", + obj.key, obj.size, cache->params.max_object_size); + return false; + } + + // Make space by deleting old objects + while (p->total_size + obj.size > cache->params.max_total_size || + p->objects.num == INT_MAX) + { + pl_assert(p->objects.num); + pl_cache_obj old = p->objects.elem[0]; + PL_TRACE(p, "Removing object 0x%"PRIx64" (size %zu) to make room", + old.key, old.size); + remove_obj(cache, old); + PL_ARRAY_REMOVE_AT(p->objects, 0); + } + + if (!obj.free) { + obj.data = pl_memdup(NULL, obj.data, obj.size); + obj.free = pl_free; + } + + PL_TRACE(p, "Inserting new object 0x%"PRIx64" (size %zu)", obj.key, obj.size); + PL_ARRAY_APPEND((void *) cache, p->objects, obj); + p->total_size += obj.size; + return true; +} + +static pl_cache_obj strip_obj(pl_cache_obj obj) +{ + return (pl_cache_obj) { .key = obj.key }; +} + +bool pl_cache_try_set(pl_cache cache, pl_cache_obj *pobj) +{ + if (!cache) + return false; + + pl_cache_obj obj = *pobj; + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + bool ok = try_set(cache, obj); + pl_mutex_unlock(&p->lock); + if (ok) { + *pobj = strip_obj(obj); // ownership transfers, clear ptr + } else { + obj = strip_obj(obj); // ownership remains with caller, clear copy + } + if (cache->params.set) + cache->params.set(cache->params.priv, obj); + return ok; +} + +void pl_cache_set(pl_cache cache, pl_cache_obj *obj) +{ + if (!pl_cache_try_set(cache, obj)) { + if (obj->free) + obj->free(obj->data); + *obj = (pl_cache_obj) { .key = obj->key }; + } +} + +static void noop(void *ignored) +{ + (void) ignored; +} + +bool pl_cache_get(pl_cache cache, pl_cache_obj *out_obj) +{ + const uint64_t key = out_obj->key; + if (!cache) + goto fail; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + + // Search backwards to prioritize recently added entries + for (int i = p->objects.num - 1; i >= 0; i--) { + pl_cache_obj obj = p->objects.elem[i]; + if (obj.key == key) { + PL_ARRAY_REMOVE_AT(p->objects, i); + p->total_size -= obj.size; + pl_mutex_unlock(&p->lock); + pl_assert(obj.free); + *out_obj = obj; + return true; + } + } + + pl_mutex_unlock(&p->lock); + if (!cache->params.get) + goto fail; + + pl_cache_obj obj = cache->params.get(cache->params.priv, key); + if (!obj.size) + goto fail; + + // Sanitize object + obj.key = key; + obj.free = PL_DEF(obj.free, noop); + *out_obj = obj; + return true; + +fail: + *out_obj = (pl_cache_obj) { .key = key }; + return false; +} + +void pl_cache_iterate(pl_cache cache, + void (*cb)(void *priv, pl_cache_obj obj), + void *priv) +{ + if (!cache) + return; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + for (int i = 0; i < p->objects.num; i++) + cb(priv, p->objects.elem[i]); + pl_mutex_unlock(&p->lock); +} + +// --- Saving/loading + +#define CACHE_MAGIC "pl_cache" +#define CACHE_VERSION 1 +#define PAD_ALIGN(x) PL_ALIGN2(x, sizeof(uint32_t)) + +struct __attribute__((__packed__)) cache_header { + char magic[8]; + uint32_t version; + uint32_t num_entries; +}; + +struct __attribute__((__packed__)) cache_entry { + uint64_t key; + uint64_t size; + uint64_t hash; +}; + +pl_static_assert(sizeof(struct cache_header) % alignof(struct cache_entry) == 0); + +int pl_cache_save_ex(pl_cache cache, + void (*write)(void *priv, size_t size, const void *ptr), + void *priv) +{ + if (!cache) + return 0; + + struct priv *p = PL_PRIV(cache); + pl_mutex_lock(&p->lock); + pl_clock_t start = pl_clock_now(); + + const int num_objects = p->objects.num; + 
const size_t saved_bytes = p->total_size; + write(priv, sizeof(struct cache_header), &(struct cache_header) { + .magic = CACHE_MAGIC, + .version = CACHE_VERSION, + .num_entries = num_objects, + }); + + for (int i = 0; i < num_objects; i++) { + pl_cache_obj obj = p->objects.elem[i]; + PL_TRACE(p, "Saving object 0x%"PRIx64" (size %zu)", obj.key, obj.size); + write(priv, sizeof(struct cache_entry), &(struct cache_entry) { + .key = obj.key, + .size = obj.size, + .hash = pl_mem_hash(obj.data, obj.size), + }); + static const uint8_t padding[PAD_ALIGN(1)] = {0}; + write(priv, obj.size, obj.data); + write(priv, PAD_ALIGN(obj.size) - obj.size, padding); + } + + pl_mutex_unlock(&p->lock); + pl_log_cpu_time(p->log, start, pl_clock_now(), "saving cache"); + if (num_objects) + PL_DEBUG(p, "Saved %d objects, totalling %zu bytes", num_objects, saved_bytes); + + return num_objects; +} + +int pl_cache_load_ex(pl_cache cache, + bool (*read)(void *priv, size_t size, void *ptr), + void *priv) +{ + if (!cache) + return 0; + + struct priv *p = PL_PRIV(cache); + struct cache_header header; + if (!read(priv, sizeof(header), &header)) { + PL_ERR(p, "Failed loading cache: file seems empty or truncated"); + return -1; + } + if (memcmp(header.magic, CACHE_MAGIC, sizeof(header.magic)) != 0) { + PL_ERR(p, "Failed loading cache: invalid magic bytes"); + return -1; + } + if (header.version != CACHE_VERSION) { + PL_INFO(p, "Failed loading cache: wrong version... skipping"); + return 0; + } + if (header.num_entries > INT_MAX) { + PL_ERR(p, "Failed loading cache: %"PRIu32" entries overflows int", + header.num_entries); + return 0; + } + + int num_loaded = 0; + size_t loaded_bytes = 0; + pl_mutex_lock(&p->lock); + pl_clock_t start = pl_clock_now(); + + for (int i = 0; i < header.num_entries; i++) { + struct cache_entry entry; + if (!read(priv, sizeof(entry), &entry)) { + PL_WARN(p, "Cache seems truncated, missing objects.. ignoring rest"); + goto error; + } + + if (entry.size > SIZE_MAX) { + PL_WARN(p, "Cache object size %"PRIu64" overflows SIZE_MAX.. " + "suspect broken file, ignoring rest", entry.size); + goto error; + } + + void *buf = pl_alloc(NULL, PAD_ALIGN(entry.size)); + if (!read(priv, PAD_ALIGN(entry.size), buf)) { + PL_WARN(p, "Cache seems truncated, missing objects.. ignoring rest"); + pl_free(buf); + goto error; + } + + uint64_t checksum = pl_mem_hash(buf, entry.size); + if (checksum != entry.hash) { + PL_WARN(p, "Cache entry seems corrupt, checksum mismatch.. 
ignoring rest"); + pl_free(buf); + goto error; + } + + pl_cache_obj obj = { + .key = entry.key, + .size = entry.size, + .data = buf, + .free = pl_free, + }; + + PL_TRACE(p, "Loading object 0x%"PRIx64" (size %zu)", obj.key, obj.size); + if (try_set(cache, obj)) { + num_loaded++; + loaded_bytes += entry.size; + } else { + pl_free(buf); + } + } + + pl_log_cpu_time(p->log, start, pl_clock_now(), "loading cache"); + if (num_loaded) + PL_DEBUG(p, "Loaded %d objects, totalling %zu bytes", num_loaded, loaded_bytes); + + // fall through +error: + pl_mutex_unlock(&p->lock); + return num_loaded; +} + +// Save/load wrappers + +struct ptr_ctx { + uint8_t *data; // base pointer + size_t size; // total size + size_t pos; // read/write index +}; + +static void write_ptr(void *priv, size_t size, const void *ptr) +{ + struct ptr_ctx *ctx = priv; + size_t end = PL_MIN(ctx->pos + size, ctx->size); + if (end > ctx->pos) + memcpy(ctx->data + ctx->pos, ptr, end - ctx->pos); + ctx->pos += size; +} + +static bool read_ptr(void *priv, size_t size, void *ptr) +{ + struct ptr_ctx *ctx = priv; + if (ctx->pos + size > ctx->size) + return false; + memcpy(ptr, ctx->data + ctx->pos, size); + ctx->pos += size; + return true; +} + +size_t pl_cache_save(pl_cache cache, uint8_t *data, size_t size) +{ + struct ptr_ctx ctx = { data, size }; + pl_cache_save_ex(cache, write_ptr, &ctx); + return ctx.pos; +} + +int pl_cache_load(pl_cache cache, const uint8_t *data, size_t size) +{ + return pl_cache_load_ex(cache, read_ptr, &(struct ptr_ctx) { + .data = (uint8_t *) data, + .size = size, + }); +} diff --git a/src/cache.h b/src/cache.h new file mode 100644 index 0000000..7e0ff2f --- /dev/null +++ b/src/cache.h @@ -0,0 +1,72 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "common.h" +#include "hash.h" + +#include <libplacebo/cache.h> + +// Convenience wrapper around pl_cache_set +static inline void pl_cache_str(pl_cache cache, uint64_t key, pl_str *str) +{ + pl_cache_set(cache, &(pl_cache_obj) { + .key = key, + .data = pl_steal(NULL, str->buf), + .size = str->len, + .free = pl_free, + }); + *str = (pl_str) {0}; +} + +// Steal and insert a cache object +static inline void pl_cache_steal(pl_cache cache, pl_cache_obj *obj) +{ + if (obj->free == pl_free) + obj->data = pl_steal(NULL, obj->data); + pl_cache_set(cache, obj); +} + +// Resize `obj->data` to a given size, re-using allocated buffers where possible +static inline void pl_cache_obj_resize(void *alloc, pl_cache_obj *obj, size_t size) +{ + if (obj->free != pl_free) { + if (obj->free) + obj->free(obj->data); + obj->data = pl_alloc(alloc, size); + obj->free = pl_free; + } else if (pl_get_size(obj->data) < size) { + obj->data = pl_steal(alloc, obj->data); + obj->data = pl_realloc(alloc, obj->data, size); + } + obj->size = size; +} + +// Internal list of base seeds for different object types, randomly generated + +enum { + CACHE_KEY_SH_LUT = UINT64_C(0x2206183d320352c6), // sh_lut cache + CACHE_KEY_ICC_3DLUT = UINT64_C(0xff703a6dd8a996f6), // ICC 3dlut + CACHE_KEY_DITHER = UINT64_C(0x6fed75eb6dce86cb), // dither matrix + CACHE_KEY_H274 = UINT64_C(0x2fb9adca04b42c4d), // H.274 film grain DB + CACHE_KEY_GAMUT_LUT = UINT64_C(0x6109e47f15d478b1), // gamut mapping 3DLUT + CACHE_KEY_SPIRV = UINT64_C(0x32352f6605ff60a7), // bare SPIR-V module + CACHE_KEY_VK_PIPE = UINT64_C(0x4bdab2817ad02ad4), // VkPipelineCache + CACHE_KEY_GL_PROG = UINT64_C(0x4274c309f4f0477b), // GL_ARB_get_program_binary + CACHE_KEY_D3D_DXBC = UINT64_C(0x807668516811d3bc), // DXBC bytecode +}; diff --git a/src/colorspace.c b/src/colorspace.c new file mode 100644 index 0000000..5cef2b5 --- /dev/null +++ b/src/colorspace.c @@ -0,0 +1,1609 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
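/*
 * [Illustrative sketch, not part of the upstream diff; assumes a translation
 * unit that includes the internal "cache.h" above.] pl_cache_obj_resize() is
 * an internal convenience for building up cache payloads: the first call
 * allocates a buffer owned by the object (free == pl_free), and later calls
 * grow the same allocation where possible.
 */
#include <string.h>

static void resize_example(void)
{
    pl_cache_obj obj = {0};
    pl_cache_obj_resize(NULL, &obj, 1024);  // allocates 1024 bytes, sets obj.free = pl_free
    memset(obj.data, 0, obj.size);
    pl_cache_obj_resize(NULL, &obj, 4096);  // re-uses / reallocates the same buffer
    obj.free(obj.data);                     // caller owns it until handed to a cache
}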
+ */ + +#include <math.h> + +#include "common.h" +#include "hash.h" + +#include <libplacebo/colorspace.h> +#include <libplacebo/tone_mapping.h> + +bool pl_color_system_is_ycbcr_like(enum pl_color_system sys) +{ + switch (sys) { + case PL_COLOR_SYSTEM_UNKNOWN: + case PL_COLOR_SYSTEM_RGB: + case PL_COLOR_SYSTEM_XYZ: + return false; + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_BT_2020_C: + case PL_COLOR_SYSTEM_BT_2100_PQ: + case PL_COLOR_SYSTEM_BT_2100_HLG: + case PL_COLOR_SYSTEM_DOLBYVISION: + case PL_COLOR_SYSTEM_YCGCO: + return true; + case PL_COLOR_SYSTEM_COUNT: break; + }; + + pl_unreachable(); +} + +bool pl_color_system_is_linear(enum pl_color_system sys) +{ + switch (sys) { + case PL_COLOR_SYSTEM_UNKNOWN: + case PL_COLOR_SYSTEM_RGB: + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_YCGCO: + return true; + case PL_COLOR_SYSTEM_BT_2020_C: + case PL_COLOR_SYSTEM_BT_2100_PQ: + case PL_COLOR_SYSTEM_BT_2100_HLG: + case PL_COLOR_SYSTEM_DOLBYVISION: + case PL_COLOR_SYSTEM_XYZ: + return false; + case PL_COLOR_SYSTEM_COUNT: break; + }; + + pl_unreachable(); +} + +enum pl_color_system pl_color_system_guess_ycbcr(int width, int height) +{ + if (width >= 1280 || height > 576) { + // Typical HD content + return PL_COLOR_SYSTEM_BT_709; + } else { + // Typical SD content + return PL_COLOR_SYSTEM_BT_601; + } +} + +bool pl_bit_encoding_equal(const struct pl_bit_encoding *b1, + const struct pl_bit_encoding *b2) +{ + return b1->sample_depth == b2->sample_depth && + b1->color_depth == b2->color_depth && + b1->bit_shift == b2->bit_shift; +} + +const struct pl_color_repr pl_color_repr_unknown = {0}; + +const struct pl_color_repr pl_color_repr_rgb = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, +}; + +const struct pl_color_repr pl_color_repr_sdtv = { + .sys = PL_COLOR_SYSTEM_BT_601, + .levels = PL_COLOR_LEVELS_LIMITED, +}; + +const struct pl_color_repr pl_color_repr_hdtv = { + .sys = PL_COLOR_SYSTEM_BT_709, + .levels = PL_COLOR_LEVELS_LIMITED, +}; + +const struct pl_color_repr pl_color_repr_uhdtv = { + .sys = PL_COLOR_SYSTEM_BT_2020_NC, + .levels = PL_COLOR_LEVELS_LIMITED, +}; + +const struct pl_color_repr pl_color_repr_jpeg = { + .sys = PL_COLOR_SYSTEM_BT_601, + .levels = PL_COLOR_LEVELS_FULL, +}; + +bool pl_color_repr_equal(const struct pl_color_repr *c1, + const struct pl_color_repr *c2) +{ + return c1->sys == c2->sys && + c1->levels == c2->levels && + c1->alpha == c2->alpha && + c1->dovi == c2->dovi && + pl_bit_encoding_equal(&c1->bits, &c2->bits); +} + +static struct pl_bit_encoding pl_bit_encoding_merge(const struct pl_bit_encoding *orig, + const struct pl_bit_encoding *new) +{ + return (struct pl_bit_encoding) { + .sample_depth = PL_DEF(orig->sample_depth, new->sample_depth), + .color_depth = PL_DEF(orig->color_depth, new->color_depth), + .bit_shift = PL_DEF(orig->bit_shift, new->bit_shift), + }; +} + +void pl_color_repr_merge(struct pl_color_repr *orig, const struct pl_color_repr *new) +{ + *orig = (struct pl_color_repr) { + .sys = PL_DEF(orig->sys, new->sys), + .levels = PL_DEF(orig->levels, new->levels), + .alpha = PL_DEF(orig->alpha, new->alpha), + .dovi = PL_DEF(orig->dovi, new->dovi), + .bits = pl_bit_encoding_merge(&orig->bits, &new->bits), + }; +} + +enum pl_color_levels pl_color_levels_guess(const struct pl_color_repr *repr) +{ + if (repr->sys == 
PL_COLOR_SYSTEM_DOLBYVISION) + return PL_COLOR_LEVELS_FULL; + + if (repr->levels) + return repr->levels; + + return pl_color_system_is_ycbcr_like(repr->sys) + ? PL_COLOR_LEVELS_LIMITED + : PL_COLOR_LEVELS_FULL; +} + +float pl_color_repr_normalize(struct pl_color_repr *repr) +{ + float scale = 1.0; + struct pl_bit_encoding *bits = &repr->bits; + + if (bits->bit_shift) { + scale /= (1LL << bits->bit_shift); + bits->bit_shift = 0; + } + + // If one of these is set but not the other, use the set one + int tex_bits = PL_DEF(bits->sample_depth, 8); + int col_bits = PL_DEF(bits->color_depth, tex_bits); + tex_bits = PL_DEF(tex_bits, col_bits); + + if (pl_color_levels_guess(repr) == PL_COLOR_LEVELS_LIMITED) { + // Limit range is always shifted directly + scale *= (float) (1LL << tex_bits) / (1LL << col_bits); + } else { + // Full range always uses the full range available + scale *= ((1LL << tex_bits) - 1.) / ((1LL << col_bits) - 1.); + } + + bits->color_depth = bits->sample_depth; + return scale; +} + +bool pl_color_primaries_is_wide_gamut(enum pl_color_primaries prim) +{ + switch (prim) { + case PL_COLOR_PRIM_UNKNOWN: + case PL_COLOR_PRIM_BT_601_525: + case PL_COLOR_PRIM_BT_601_625: + case PL_COLOR_PRIM_BT_709: + case PL_COLOR_PRIM_BT_470M: + case PL_COLOR_PRIM_EBU_3213: + return false; + case PL_COLOR_PRIM_BT_2020: + case PL_COLOR_PRIM_APPLE: + case PL_COLOR_PRIM_ADOBE: + case PL_COLOR_PRIM_PRO_PHOTO: + case PL_COLOR_PRIM_CIE_1931: + case PL_COLOR_PRIM_DCI_P3: + case PL_COLOR_PRIM_DISPLAY_P3: + case PL_COLOR_PRIM_V_GAMUT: + case PL_COLOR_PRIM_S_GAMUT: + case PL_COLOR_PRIM_FILM_C: + case PL_COLOR_PRIM_ACES_AP0: + case PL_COLOR_PRIM_ACES_AP1: + return true; + case PL_COLOR_PRIM_COUNT: break; + } + + pl_unreachable(); +} + +enum pl_color_primaries pl_color_primaries_guess(int width, int height) +{ + // HD content + if (width >= 1280 || height > 576) + return PL_COLOR_PRIM_BT_709; + + switch (height) { + case 576: // Typical PAL content, including anamorphic/squared + return PL_COLOR_PRIM_BT_601_625; + + case 480: // Typical NTSC content, including squared + case 486: // NTSC Pro or anamorphic NTSC + return PL_COLOR_PRIM_BT_601_525; + + default: // No good metric, just pick BT.709 to minimize damage + return PL_COLOR_PRIM_BT_709; + } +} + +// HLG 75% value (scene-referred) +#define HLG_75 3.17955 + +float pl_color_transfer_nominal_peak(enum pl_color_transfer trc) +{ + switch (trc) { + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_GAMMA18: + case PL_COLOR_TRC_GAMMA20: + case PL_COLOR_TRC_GAMMA22: + case PL_COLOR_TRC_GAMMA24: + case PL_COLOR_TRC_GAMMA26: + case PL_COLOR_TRC_GAMMA28: + case PL_COLOR_TRC_PRO_PHOTO: + case PL_COLOR_TRC_ST428: + return 1.0; + case PL_COLOR_TRC_PQ: return 10000.0 / PL_COLOR_SDR_WHITE; + case PL_COLOR_TRC_HLG: return 12.0 / HLG_75; + case PL_COLOR_TRC_V_LOG: return 46.0855; + case PL_COLOR_TRC_S_LOG1: return 6.52; + case PL_COLOR_TRC_S_LOG2: return 9.212; + case PL_COLOR_TRC_COUNT: break; + } + + pl_unreachable(); +} + +const struct pl_hdr_metadata pl_hdr_metadata_empty = {0}; +const struct pl_hdr_metadata pl_hdr_metadata_hdr10 ={ + .prim = { + .red = {0.708, 0.292}, + .green = {0.170, 0.797}, + .blue = {0.131, 0.046}, + .white = {0.31271, 0.32902}, + }, + .min_luma = 0, + .max_luma = 10000, + .max_cll = 10000, + .max_fall = 0, // unknown +}; + +static const float PQ_M1 = 2610./4096 * 1./4, + PQ_M2 = 2523./4096 * 128, + PQ_C1 = 3424./4096, + PQ_C2 = 2413./4096 * 32, + PQ_C3 = 2392./4096 * 
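/*
 * [Illustrative worked example, not part of the upstream diff.] For the
 * typical case of 10-bit limited-range video uploaded into a 16-bit texture,
 * pl_color_repr_normalize() reports the scale factor needed to renormalize
 * the sample values:
 */
#include <libplacebo/colorspace.h>

static void repr_normalize_example(void)
{
    struct pl_color_repr repr = {
        .sys    = PL_COLOR_SYSTEM_BT_709,
        .levels = PL_COLOR_LEVELS_LIMITED,
        .bits   = { .sample_depth = 16, .color_depth = 10 },
    };
    float scale = pl_color_repr_normalize(&repr);
    // Limited range is shifted directly: scale = (1 << 16) / (1 << 10) = 64.0
    // Afterwards, repr.bits.color_depth has been updated to 16.
    (void) scale;
}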
32; + +float pl_hdr_rescale(enum pl_hdr_scaling from, enum pl_hdr_scaling to, float x) +{ + if (from == to) + return x; + if (!x) // micro-optimization for common value + return x; + + x = fmaxf(x, 0.0f); + + // Convert input to PL_SCALE_RELATIVE + switch (from) { + case PL_HDR_PQ: + x = powf(x, 1.0f / PQ_M2); + x = fmaxf(x - PQ_C1, 0.0f) / (PQ_C2 - PQ_C3 * x); + x = powf(x, 1.0f / PQ_M1); + x *= 10000.0f; + // fall through + case PL_HDR_NITS: + x /= PL_COLOR_SDR_WHITE; + // fall through + case PL_HDR_NORM: + goto output; + case PL_HDR_SQRT: + x *= x; + goto output; + case PL_HDR_SCALING_COUNT: + break; + } + + pl_unreachable(); + +output: + // Convert PL_SCALE_RELATIVE to output + switch (to) { + case PL_HDR_NORM: + return x; + case PL_HDR_SQRT: + return sqrtf(x); + case PL_HDR_NITS: + return x * PL_COLOR_SDR_WHITE; + case PL_HDR_PQ: + x *= PL_COLOR_SDR_WHITE / 10000.0f; + x = powf(x, PQ_M1); + x = (PQ_C1 + PQ_C2 * x) / (1.0f + PQ_C3 * x); + x = powf(x, PQ_M2); + return x; + case PL_HDR_SCALING_COUNT: + break; + } + + pl_unreachable(); +} + +static inline bool pl_hdr_bezier_equal(const struct pl_hdr_bezier *a, + const struct pl_hdr_bezier *b) +{ + return a->target_luma == b->target_luma && + a->knee_x == b->knee_x && + a->knee_y == b->knee_y && + a->num_anchors == b->num_anchors && + !memcmp(a->anchors, b->anchors, sizeof(a->anchors[0]) * a->num_anchors); +} + +bool pl_hdr_metadata_equal(const struct pl_hdr_metadata *a, + const struct pl_hdr_metadata *b) +{ + return pl_raw_primaries_equal(&a->prim, &b->prim) && + a->min_luma == b->min_luma && + a->max_luma == b->max_luma && + a->max_cll == b->max_cll && + a->max_fall == b->max_fall && + a->scene_max[0] == b->scene_max[0] && + a->scene_max[1] == b->scene_max[1] && + a->scene_max[2] == b->scene_max[2] && + a->scene_avg == b->scene_avg && + pl_hdr_bezier_equal(&a->ootf, &b->ootf) && + a->max_pq_y == b->max_pq_y && + a->avg_pq_y == b->avg_pq_y; +} + +void pl_hdr_metadata_merge(struct pl_hdr_metadata *orig, + const struct pl_hdr_metadata *update) +{ + pl_raw_primaries_merge(&orig->prim, &update->prim); + if (!orig->min_luma) + orig->min_luma = update->min_luma; + if (!orig->max_luma) + orig->max_luma = update->max_luma; + if (!orig->max_cll) + orig->max_cll = update->max_cll; + if (!orig->max_fall) + orig->max_fall = update->max_fall; + if (!orig->scene_max[1]) + memcpy(orig->scene_max, update->scene_max, sizeof(orig->scene_max)); + if (!orig->scene_avg) + orig->scene_avg = update->scene_avg; + if (!orig->ootf.target_luma) + orig->ootf = update->ootf; + if (!orig->max_pq_y) + orig->max_pq_y = update->max_pq_y; + if (!orig->avg_pq_y) + orig->avg_pq_y = update->avg_pq_y; +} + +bool pl_hdr_metadata_contains(const struct pl_hdr_metadata *data, + enum pl_hdr_metadata_type type) +{ + bool has_hdr10 = data->max_luma; + bool has_hdr10plus = data->scene_avg && (data->scene_max[0] || + data->scene_max[1] || + data->scene_max[2]); + bool has_cie_y = data->max_pq_y && data->avg_pq_y; + + switch (type) { + case PL_HDR_METADATA_NONE: return true; + case PL_HDR_METADATA_ANY: return has_hdr10 || has_hdr10plus || has_cie_y; + case PL_HDR_METADATA_HDR10: return has_hdr10; + case PL_HDR_METADATA_HDR10PLUS: return has_hdr10plus; + case PL_HDR_METADATA_CIE_Y: return has_cie_y; + case PL_HDR_METADATA_TYPE_COUNT: break; + } + + pl_unreachable(); +} + +const struct pl_color_space pl_color_space_unknown = {0}; + +const struct pl_color_space pl_color_space_srgb = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_SRGB, +}; + +const struct pl_color_space 
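/*
 * [Illustrative usage sketch, not part of the upstream diff.] pl_hdr_rescale()
 * converts between the brightness representations above. Assuming the
 * library's SDR reference white (PL_COLOR_SDR_WHITE, 203 nits in current
 * versions), for example:
 */
#include <libplacebo/colorspace.h>

static void hdr_rescale_example(void)
{
    // PQ code value 1.0 corresponds to the full 10000 nits PQ peak
    float peak_nits = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, 1.0f);  // == 10000.0f

    // SDR reference white (1.0 in normalized units) expressed as a PQ value
    float white_pq = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, 1.0f);   // ~0.58 for 203 nits

    (void) peak_nits; (void) white_pq;
}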
pl_color_space_bt709 = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_BT_1886, +}; + +const struct pl_color_space pl_color_space_hdr10 = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_PQ, +}; + +const struct pl_color_space pl_color_space_bt2020_hlg = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, +}; + +const struct pl_color_space pl_color_space_monitor = { + .primaries = PL_COLOR_PRIM_BT_709, // sRGB primaries + .transfer = PL_COLOR_TRC_UNKNOWN, // unknown SDR response +}; + +bool pl_color_space_is_hdr(const struct pl_color_space *csp) +{ + return csp->hdr.max_luma > PL_COLOR_SDR_WHITE || + pl_color_transfer_is_hdr(csp->transfer); +} + +bool pl_color_space_is_black_scaled(const struct pl_color_space *csp) +{ + switch (csp->transfer) { + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_GAMMA18: + case PL_COLOR_TRC_GAMMA20: + case PL_COLOR_TRC_GAMMA22: + case PL_COLOR_TRC_GAMMA24: + case PL_COLOR_TRC_GAMMA26: + case PL_COLOR_TRC_GAMMA28: + case PL_COLOR_TRC_PRO_PHOTO: + case PL_COLOR_TRC_ST428: + case PL_COLOR_TRC_HLG: + return true; + + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_PQ: + case PL_COLOR_TRC_V_LOG: + case PL_COLOR_TRC_S_LOG1: + case PL_COLOR_TRC_S_LOG2: + return false; + + case PL_COLOR_TRC_COUNT: break; + } + + pl_unreachable(); +} + +void pl_color_space_merge(struct pl_color_space *orig, + const struct pl_color_space *new) +{ + if (!orig->primaries) + orig->primaries = new->primaries; + if (!orig->transfer) + orig->transfer = new->transfer; + pl_hdr_metadata_merge(&orig->hdr, &new->hdr); +} + +bool pl_color_space_equal(const struct pl_color_space *c1, + const struct pl_color_space *c2) +{ + return c1->primaries == c2->primaries && + c1->transfer == c2->transfer && + pl_hdr_metadata_equal(&c1->hdr, &c2->hdr); +} + +// Estimates luminance from maxRGB by looking at how monochromatic MaxSCL is +static void luma_from_maxrgb(const struct pl_color_space *csp, + enum pl_hdr_scaling scaling, + float *out_max, float *out_avg) +{ + const float maxscl = PL_MAX3(csp->hdr.scene_max[0], + csp->hdr.scene_max[1], + csp->hdr.scene_max[2]); + if (!maxscl) + return; + + struct pl_raw_primaries prim = csp->hdr.prim; + pl_raw_primaries_merge(&prim, pl_raw_primaries_get(csp->primaries)); + const pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(&prim); + + const float max_luma = rgb2xyz.m[1][0] * csp->hdr.scene_max[0] + + rgb2xyz.m[1][1] * csp->hdr.scene_max[1] + + rgb2xyz.m[1][2] * csp->hdr.scene_max[2]; + + const float coef = max_luma / maxscl; + *out_max = pl_hdr_rescale(PL_HDR_NITS, scaling, max_luma); + *out_avg = pl_hdr_rescale(PL_HDR_NITS, scaling, coef * csp->hdr.scene_avg); +} + +static inline bool metadata_compat(enum pl_hdr_metadata_type metadata, + enum pl_hdr_metadata_type compat) +{ + return metadata == PL_HDR_METADATA_ANY || metadata == compat; +} + +void pl_color_space_nominal_luma_ex(const struct pl_nominal_luma_params *params) +{ + if (!params || (!params->out_min && !params->out_max && !params->out_avg)) + return; + + const struct pl_color_space *csp = params->color; + const enum pl_hdr_scaling scaling = params->scaling; + + float min_luma = 0, max_luma = 0, avg_luma = 0; + if (params->metadata != PL_HDR_METADATA_NONE) { + // Initialize from static HDR10 metadata, in all cases + min_luma = pl_hdr_rescale(PL_HDR_NITS, scaling, csp->hdr.min_luma); + max_luma = pl_hdr_rescale(PL_HDR_NITS, scaling, csp->hdr.max_luma); + } + + if (metadata_compat(params->metadata, 
PL_HDR_METADATA_HDR10PLUS) && + pl_hdr_metadata_contains(&csp->hdr, PL_HDR_METADATA_HDR10PLUS)) + { + luma_from_maxrgb(csp, scaling, &max_luma, &avg_luma); + } + + if (metadata_compat(params->metadata, PL_HDR_METADATA_CIE_Y) && + pl_hdr_metadata_contains(&csp->hdr, PL_HDR_METADATA_CIE_Y)) + { + max_luma = pl_hdr_rescale(PL_HDR_PQ, scaling, csp->hdr.max_pq_y); + avg_luma = pl_hdr_rescale(PL_HDR_PQ, scaling, csp->hdr.avg_pq_y); + } + + // Clamp to sane value range + const float hdr_min = pl_hdr_rescale(PL_HDR_NITS, scaling, PL_COLOR_HDR_BLACK); + const float hdr_max = pl_hdr_rescale(PL_HDR_PQ, scaling, 1.0f); + max_luma = max_luma ? PL_CLAMP(max_luma, hdr_min, hdr_max) : 0; + min_luma = min_luma ? PL_CLAMP(min_luma, hdr_min, hdr_max) : 0; + if ((max_luma && min_luma >= max_luma) || min_luma >= hdr_max) + min_luma = max_luma = 0; // sanity + + // PQ is always scaled down to absolute black, ignoring HDR metadata + if (csp->transfer == PL_COLOR_TRC_PQ) + min_luma = hdr_min; + + // Baseline/fallback metadata, inferred entirely from the colorspace + // description and built-in default assumptions + if (!max_luma) { + if (csp->transfer == PL_COLOR_TRC_HLG) { + max_luma = pl_hdr_rescale(PL_HDR_NITS, scaling, PL_COLOR_HLG_PEAK); + } else { + const float peak = pl_color_transfer_nominal_peak(csp->transfer); + max_luma = pl_hdr_rescale(PL_HDR_NORM, scaling, peak); + } + } + + if (!min_luma) { + if (pl_color_transfer_is_hdr(csp->transfer)) { + min_luma = hdr_min; + } else { + const float peak = pl_hdr_rescale(scaling, PL_HDR_NITS, max_luma); + min_luma = pl_hdr_rescale(PL_HDR_NITS, scaling, + peak / PL_COLOR_SDR_CONTRAST); + } + } + + if (avg_luma) + avg_luma = PL_CLAMP(avg_luma, min_luma, max_luma); // sanity + + if (params->out_min) + *params->out_min = min_luma; + if (params->out_max) + *params->out_max = max_luma; + if (params->out_avg) + *params->out_avg = avg_luma; +} + +void pl_color_space_nominal_luma(const struct pl_color_space *csp, + float *out_min, float *out_max) +{ + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = csp, + .metadata = PL_HDR_METADATA_ANY, + .scaling = PL_HDR_NORM, + .out_min = out_min, + .out_max = out_max, + )); +} + +void pl_color_space_infer(struct pl_color_space *space) +{ + if (!space->primaries) + space->primaries = PL_COLOR_PRIM_BT_709; + if (!space->transfer) + space->transfer = PL_COLOR_TRC_BT_1886; + + // Sanitize the static HDR metadata + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = space, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_NITS, + .out_max = &space->hdr.max_luma, + // Preserve tagged minimum + .out_min = space->hdr.min_luma ? 
NULL : &space->hdr.min_luma, + )); + + // Default the signal color space based on the nominal raw primaries + if (!pl_primaries_valid(&space->hdr.prim)) + space->hdr.prim = *pl_raw_primaries_get(space->primaries); +} + +static void infer_both_ref(struct pl_color_space *space, + struct pl_color_space *ref) +{ + pl_color_space_infer(ref); + + if (!space->primaries) { + if (pl_color_primaries_is_wide_gamut(ref->primaries)) { + space->primaries = PL_COLOR_PRIM_BT_709; + } else { + space->primaries = ref->primaries; + } + } + + if (!space->transfer) { + switch (ref->transfer) { + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_COUNT: + pl_unreachable(); + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_GAMMA22: + // Re-use input transfer curve to avoid small adaptations + space->transfer = ref->transfer; + break; + case PL_COLOR_TRC_PQ: + case PL_COLOR_TRC_HLG: + case PL_COLOR_TRC_V_LOG: + case PL_COLOR_TRC_S_LOG1: + case PL_COLOR_TRC_S_LOG2: + // Pick BT.1886 model because it models SDR contrast accurately, + // and we need contrast information for tone mapping + space->transfer = PL_COLOR_TRC_BT_1886; + break; + case PL_COLOR_TRC_PRO_PHOTO: + // ProPhotoRGB and sRGB are both piecewise with linear slope + space->transfer = PL_COLOR_TRC_SRGB; + break; + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_GAMMA18: + case PL_COLOR_TRC_GAMMA20: + case PL_COLOR_TRC_GAMMA24: + case PL_COLOR_TRC_GAMMA26: + case PL_COLOR_TRC_GAMMA28: + case PL_COLOR_TRC_ST428: + // Pick pure power output curve to avoid introducing black crush + space->transfer = PL_COLOR_TRC_GAMMA22; + break; + } + } + + // Infer the remaining fields after making the above choices + pl_color_space_infer(space); +} + +void pl_color_space_infer_ref(struct pl_color_space *space, + const struct pl_color_space *refp) +{ + // Make a copy of `refp` to infer missing values first + struct pl_color_space ref = *refp; + infer_both_ref(space, &ref); +} + +void pl_color_space_infer_map(struct pl_color_space *src, + struct pl_color_space *dst) +{ + bool unknown_src_contrast = !src->hdr.min_luma; + bool unknown_dst_contrast = !dst->hdr.min_luma; + + infer_both_ref(dst, src); + + // If the src has an unspecified gamma curve with dynamic black scaling, + // default it to match the dst colorspace contrast. This does not matter in + // most cases, but ensures that BT.1886 is tuned to the appropriate black + // point by default. 
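/*
 * [Illustrative usage sketch, not part of the upstream diff.] Tying the above
 * together: pl_color_space_infer() fills in unknown fields with sane defaults,
 * after which pl_color_space_nominal_luma() reports the effective luminance
 * range (here in PL_HDR_NORM units, i.e. relative to SDR reference white).
 */
#include <libplacebo/colorspace.h>

static void nominal_luma_example(void)
{
    struct pl_color_space csp = pl_color_space_hdr10;  // BT.2020 + PQ, no HDR metadata
    pl_color_space_infer(&csp);

    float min, max;
    pl_color_space_nominal_luma(&csp, &min, &max);
    // Without mastering metadata, `max` falls back to the nominal PQ peak
    // (10000 nits, i.e. 10000 / PL_COLOR_SDR_WHITE in normalized units), and
    // `min` to the PL_COLOR_HDR_BLACK floor, since PQ is scaled to absolute black.
    (void) min; (void) max;
}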
+ bool dynamic_src_contrast = pl_color_space_is_black_scaled(src) || + src->transfer == PL_COLOR_TRC_BT_1886; + if (unknown_src_contrast && dynamic_src_contrast) + src->hdr.min_luma = dst->hdr.min_luma; + + // Do the same in reverse if both src and dst are SDR curves + bool src_is_sdr = !pl_color_space_is_hdr(src); + bool dst_is_sdr = !pl_color_space_is_hdr(dst); + if (unknown_dst_contrast && src_is_sdr && dst_is_sdr) + dst->hdr.min_luma = src->hdr.min_luma; + + // If the src is HLG and the output is HDR, tune the HLG peak to the output + if (src->transfer == PL_COLOR_TRC_HLG && pl_color_space_is_hdr(dst)) + src->hdr.max_luma = dst->hdr.max_luma; +} + +const struct pl_color_adjustment pl_color_adjustment_neutral = { + PL_COLOR_ADJUSTMENT_NEUTRAL +}; + +void pl_chroma_location_offset(enum pl_chroma_location loc, float *x, float *y) +{ + *x = *y = 0; + + // This is the majority of subsampled chroma content out there + loc = PL_DEF(loc, PL_CHROMA_LEFT); + + switch (loc) { + case PL_CHROMA_LEFT: + case PL_CHROMA_TOP_LEFT: + case PL_CHROMA_BOTTOM_LEFT: + *x = -0.5; + break; + default: break; + } + + switch (loc) { + case PL_CHROMA_TOP_LEFT: + case PL_CHROMA_TOP_CENTER: + *y = -0.5; + break; + default: break; + } + + switch (loc) { + case PL_CHROMA_BOTTOM_LEFT: + case PL_CHROMA_BOTTOM_CENTER: + *y = 0.5; + break; + default: break; + } +} + +struct pl_cie_xy pl_white_from_temp(float temp) +{ + temp = PL_CLAMP(temp, 2500, 25000); + + double ti = 1000.0 / temp, ti2 = ti * ti, ti3 = ti2 * ti, x; + if (temp <= 7000) { + x = -4.6070 * ti3 + 2.9678 * ti2 + 0.09911 * ti + 0.244063; + } else { + x = -2.0064 * ti3 + 1.9018 * ti2 + 0.24748 * ti + 0.237040; + } + + return (struct pl_cie_xy) { + .x = x, + .y = -3 * (x*x) + 2.87 * x - 0.275, + }; +} + +bool pl_raw_primaries_equal(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b) +{ + return pl_cie_xy_equal(&a->red, &b->red) && + pl_cie_xy_equal(&a->green, &b->green) && + pl_cie_xy_equal(&a->blue, &b->blue) && + pl_cie_xy_equal(&a->white, &b->white); +} + +bool pl_raw_primaries_similar(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b) +{ + float delta = fabsf(a->red.x - b->red.x) + + fabsf(a->red.y - b->red.y) + + fabsf(a->green.x - b->green.x) + + fabsf(a->green.y - b->green.y) + + fabsf(a->blue.x - b->blue.x) + + fabsf(a->blue.y - b->blue.y) + + fabsf(a->white.x - b->white.x) + + fabsf(a->white.y - b->white.y); + + return delta < 0.001; +} + +void pl_raw_primaries_merge(struct pl_raw_primaries *orig, + const struct pl_raw_primaries *update) +{ + union { + struct pl_raw_primaries prim; + float raw[8]; + } *pa = (void *) orig, + *pb = (void *) update; + + pl_static_assert(sizeof(*pa) == sizeof(*orig)); + for (int i = 0; i < PL_ARRAY_SIZE(pa->raw); i++) + pa->raw[i] = PL_DEF(pa->raw[i], pb->raw[i]); +} + +const struct pl_raw_primaries *pl_raw_primaries_get(enum pl_color_primaries prim) +{ + /* + Values from: ITU-R Recommendations BT.470-6, BT.601-7, BT.709-5, BT.2020-0 + + https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.470-6-199811-S!!PDF-E.pdf + https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.601-7-201103-I!!PDF-E.pdf + https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.709-5-200204-I!!PDF-E.pdf + https://www.itu.int/dms_pubrec/itu-r/rec/bt/R-REC-BT.2020-0-201208-I!!PDF-E.pdf + + Other colorspaces from https://en.wikipedia.org/wiki/RGB_color_space#Specifications + */ + + // CIE standard illuminant series +#define CIE_D50 {0.3457, 0.3585} +#define CIE_D65 {0.3127, 0.3290} +#define CIE_C {0.3100, 0.3160} 
+#define CIE_E {1.0/3.0, 1.0/3.0} +#define DCI {0.3140, 0.3510} + + static const struct pl_raw_primaries primaries[] = { + [PL_COLOR_PRIM_BT_470M] = { + .red = {0.670, 0.330}, + .green = {0.210, 0.710}, + .blue = {0.140, 0.080}, + .white = CIE_C, + }, + + [PL_COLOR_PRIM_BT_601_525] = { + .red = {0.630, 0.340}, + .green = {0.310, 0.595}, + .blue = {0.155, 0.070}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_BT_601_625] = { + .red = {0.640, 0.330}, + .green = {0.290, 0.600}, + .blue = {0.150, 0.060}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_BT_709] = { + .red = {0.640, 0.330}, + .green = {0.300, 0.600}, + .blue = {0.150, 0.060}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_BT_2020] = { + .red = {0.708, 0.292}, + .green = {0.170, 0.797}, + .blue = {0.131, 0.046}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_APPLE] = { + .red = {0.625, 0.340}, + .green = {0.280, 0.595}, + .blue = {0.115, 0.070}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_ADOBE] = { + .red = {0.640, 0.330}, + .green = {0.210, 0.710}, + .blue = {0.150, 0.060}, + .white = CIE_D65, + }, + [PL_COLOR_PRIM_PRO_PHOTO] = { + .red = {0.7347, 0.2653}, + .green = {0.1596, 0.8404}, + .blue = {0.0366, 0.0001}, + .white = CIE_D50, + }, + [PL_COLOR_PRIM_CIE_1931] = { + .red = {0.7347, 0.2653}, + .green = {0.2738, 0.7174}, + .blue = {0.1666, 0.0089}, + .white = CIE_E, + }, + // From SMPTE RP 431-2 + [PL_COLOR_PRIM_DCI_P3] = { + .red = {0.680, 0.320}, + .green = {0.265, 0.690}, + .blue = {0.150, 0.060}, + .white = DCI, + }, + [PL_COLOR_PRIM_DISPLAY_P3] = { + .red = {0.680, 0.320}, + .green = {0.265, 0.690}, + .blue = {0.150, 0.060}, + .white = CIE_D65, + }, + // From Panasonic VARICAM reference manual + [PL_COLOR_PRIM_V_GAMUT] = { + .red = {0.730, 0.280}, + .green = {0.165, 0.840}, + .blue = {0.100, -0.03}, + .white = CIE_D65, + }, + // From Sony S-Log reference manual + [PL_COLOR_PRIM_S_GAMUT] = { + .red = {0.730, 0.280}, + .green = {0.140, 0.855}, + .blue = {0.100, -0.05}, + .white = CIE_D65, + }, + // From FFmpeg source code + [PL_COLOR_PRIM_FILM_C] = { + .red = {0.681, 0.319}, + .green = {0.243, 0.692}, + .blue = {0.145, 0.049}, + .white = CIE_C, + }, + [PL_COLOR_PRIM_EBU_3213] = { + .red = {0.630, 0.340}, + .green = {0.295, 0.605}, + .blue = {0.155, 0.077}, + .white = CIE_D65, + }, + // From Wikipedia + [PL_COLOR_PRIM_ACES_AP0] = { + .red = {0.7347, 0.2653}, + .green = {0.0000, 1.0000}, + .blue = {0.0001, -0.0770}, + .white = {0.32168, 0.33767}, + }, + [PL_COLOR_PRIM_ACES_AP1] = { + .red = {0.713, 0.293}, + .green = {0.165, 0.830}, + .blue = {0.128, 0.044}, + .white = {0.32168, 0.33767}, + }, + }; + + // This is the default assumption if no colorspace information could + // be determined, eg. for files which have no video channel. 
+ if (!prim) + prim = PL_COLOR_PRIM_BT_709; + + pl_assert(prim < PL_ARRAY_SIZE(primaries)); + return &primaries[prim]; +} + +// Compute the RGB/XYZ matrix as described here: +// http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html +pl_matrix3x3 pl_get_rgb2xyz_matrix(const struct pl_raw_primaries *prim) +{ + pl_matrix3x3 out = {{{0}}}; + float S[3], X[4], Z[4]; + + X[0] = pl_cie_X(prim->red); + X[1] = pl_cie_X(prim->green); + X[2] = pl_cie_X(prim->blue); + X[3] = pl_cie_X(prim->white); + + Z[0] = pl_cie_Z(prim->red); + Z[1] = pl_cie_Z(prim->green); + Z[2] = pl_cie_Z(prim->blue); + Z[3] = pl_cie_Z(prim->white); + + // S = XYZ^-1 * W + for (int i = 0; i < 3; i++) { + out.m[0][i] = X[i]; + out.m[1][i] = 1; + out.m[2][i] = Z[i]; + } + + pl_matrix3x3_invert(&out); + + for (int i = 0; i < 3; i++) + S[i] = out.m[i][0] * X[3] + out.m[i][1] * 1 + out.m[i][2] * Z[3]; + + // M = [Sc * XYZc] + for (int i = 0; i < 3; i++) { + out.m[0][i] = S[i] * X[i]; + out.m[1][i] = S[i] * 1; + out.m[2][i] = S[i] * Z[i]; + } + + return out; +} + +pl_matrix3x3 pl_get_xyz2rgb_matrix(const struct pl_raw_primaries *prim) +{ + // For simplicity, just invert the rgb2xyz matrix + pl_matrix3x3 out = pl_get_rgb2xyz_matrix(prim); + pl_matrix3x3_invert(&out); + return out; +} + +// LMS<-XYZ revised matrix from CIECAM97, based on a linear transform and +// normalized for equal energy on monochrome inputs +static const pl_matrix3x3 m_cat97 = {{ + { 0.8562, 0.3372, -0.1934 }, + { -0.8360, 1.8327, 0.0033 }, + { 0.0357, -0.0469, 1.0112 }, +}}; + +// M := M * XYZd<-XYZs +static void apply_chromatic_adaptation(struct pl_cie_xy src, + struct pl_cie_xy dest, + pl_matrix3x3 *mat) +{ + // If the white points are nearly identical, this is a wasteful identity + // operation. + if (fabs(src.x - dest.x) < 1e-6 && fabs(src.y - dest.y) < 1e-6) + return; + + // XYZd<-XYZs = Ma^-1 * (I*[Cd/Cs]) * Ma + // http://www.brucelindbloom.com/index.html?Eqn_ChromAdapt.html + // For Ma, we use the CIECAM97 revised (linear) matrix + float C[3][2]; + + for (int i = 0; i < 3; i++) { + // source cone + C[i][0] = m_cat97.m[i][0] * pl_cie_X(src) + + m_cat97.m[i][1] * 1 + + m_cat97.m[i][2] * pl_cie_Z(src); + + // dest cone + C[i][1] = m_cat97.m[i][0] * pl_cie_X(dest) + + m_cat97.m[i][1] * 1 + + m_cat97.m[i][2] * pl_cie_Z(dest); + } + + // tmp := I * [Cd/Cs] * Ma + pl_matrix3x3 tmp = {0}; + for (int i = 0; i < 3; i++) + tmp.m[i][i] = C[i][1] / C[i][0]; + + pl_matrix3x3_mul(&tmp, &m_cat97); + + // M := M * Ma^-1 * tmp + pl_matrix3x3 ma_inv = m_cat97; + pl_matrix3x3_invert(&ma_inv); + pl_matrix3x3_mul(mat, &ma_inv); + pl_matrix3x3_mul(mat, &tmp); +} + +pl_matrix3x3 pl_get_adaptation_matrix(struct pl_cie_xy src, struct pl_cie_xy dst) +{ + // Use BT.709 primaries (with chosen white point) as an XYZ reference + struct pl_raw_primaries csp = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709); + csp.white = src; + + pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(&csp); + pl_matrix3x3 xyz2rgb = rgb2xyz; + pl_matrix3x3_invert(&xyz2rgb); + + apply_chromatic_adaptation(src, dst, &xyz2rgb); + pl_matrix3x3_mul(&xyz2rgb, &rgb2xyz); + return xyz2rgb; +} + +pl_matrix3x3 pl_ipt_rgb2lms(const struct pl_raw_primaries *prim) +{ + static const pl_matrix3x3 hpe = {{ // HPE XYZ->LMS (D65) method + { 0.40024f, 0.70760f, -0.08081f }, + { -0.22630f, 1.16532f, 0.04570f }, + { 0.00000f, 0.00000f, 0.91822f }, + }}; + + const float c = 0.04; // 4% crosstalk + pl_matrix3x3 m = {{ + { 1 - 2*c, c, c }, + { c, 1 - 2*c, c }, + { c, c, 1 - 2*c }, + }}; + + pl_matrix3x3_mul(&m, &hpe); + + // Apply 
chromatic adaptation to D65 if the input white point differs + static const struct pl_cie_xy d65 = CIE_D65; + apply_chromatic_adaptation(prim->white, d65, &m); + + const pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(prim); + pl_matrix3x3_mul(&m, &rgb2xyz); + return m; +} + +pl_matrix3x3 pl_ipt_lms2rgb(const struct pl_raw_primaries *prim) +{ + pl_matrix3x3 m = pl_ipt_rgb2lms(prim); + pl_matrix3x3_invert(&m); + return m; +} + +// As standardized in Ebner & Fairchild IPT (1998) +const pl_matrix3x3 pl_ipt_lms2ipt = {{ + { 0.4000, 0.4000, 0.2000 }, + { 4.4550, -4.8510, 0.3960 }, + { 0.8056, 0.3572, -1.1628 }, +}}; + +// Numerically inverted from the matrix above +const pl_matrix3x3 pl_ipt_ipt2lms = {{ + { 1.0, 0.0975689, 0.205226 }, + { 1.0, -0.1138760, 0.133217 }, + { 1.0, 0.0326151, -0.676887 }, +}}; + +const struct pl_cone_params pl_vision_normal = {PL_CONE_NONE, 1.0}; +const struct pl_cone_params pl_vision_protanomaly = {PL_CONE_L, 0.5}; +const struct pl_cone_params pl_vision_protanopia = {PL_CONE_L, 0.0}; +const struct pl_cone_params pl_vision_deuteranomaly = {PL_CONE_M, 0.5}; +const struct pl_cone_params pl_vision_deuteranopia = {PL_CONE_M, 0.0}; +const struct pl_cone_params pl_vision_tritanomaly = {PL_CONE_S, 0.5}; +const struct pl_cone_params pl_vision_tritanopia = {PL_CONE_S, 0.0}; +const struct pl_cone_params pl_vision_monochromacy = {PL_CONE_LM, 0.0}; +const struct pl_cone_params pl_vision_achromatopsia = {PL_CONE_LMS, 0.0}; + +pl_matrix3x3 pl_get_cone_matrix(const struct pl_cone_params *params, + const struct pl_raw_primaries *prim) +{ + // LMS<-RGB := LMS<-XYZ * XYZ<-RGB + pl_matrix3x3 rgb2lms = m_cat97; + pl_matrix3x3 rgb2xyz = pl_get_rgb2xyz_matrix(prim); + pl_matrix3x3_mul(&rgb2lms, &rgb2xyz); + + // LMS versions of the two opposing primaries, plus neutral + float lms_r[3] = {1.0, 0.0, 0.0}, + lms_b[3] = {0.0, 0.0, 1.0}, + lms_w[3] = {1.0, 1.0, 1.0}; + + pl_matrix3x3_apply(&rgb2lms, lms_r); + pl_matrix3x3_apply(&rgb2lms, lms_b); + pl_matrix3x3_apply(&rgb2lms, lms_w); + + float a, b, c = params->strength; + pl_matrix3x3 distort; + + switch (params->cones) { + case PL_CONE_NONE: + return pl_matrix3x3_identity; + + case PL_CONE_L: + // Solve to preserve neutral and blue + a = (lms_b[0] - lms_b[2] * lms_w[0] / lms_w[2]) / + (lms_b[1] - lms_b[2] * lms_w[1] / lms_w[2]); + b = (lms_b[0] - lms_b[1] * lms_w[0] / lms_w[1]) / + (lms_b[2] - lms_b[1] * lms_w[2] / lms_w[1]); + assert(fabs(a * lms_w[1] + b * lms_w[2] - lms_w[0]) < 1e-6); + + distort = (pl_matrix3x3) {{ + { c, (1.0 - c) * a, (1.0 - c) * b}, + { 0.0, 1.0, 0.0}, + { 0.0, 0.0, 1.0}, + }}; + break; + + case PL_CONE_M: + // Solve to preserve neutral and blue + a = (lms_b[1] - lms_b[2] * lms_w[1] / lms_w[2]) / + (lms_b[0] - lms_b[2] * lms_w[0] / lms_w[2]); + b = (lms_b[1] - lms_b[0] * lms_w[1] / lms_w[0]) / + (lms_b[2] - lms_b[0] * lms_w[2] / lms_w[0]); + assert(fabs(a * lms_w[0] + b * lms_w[2] - lms_w[1]) < 1e-6); + + distort = (pl_matrix3x3) {{ + { 1.0, 0.0, 0.0}, + {(1.0 - c) * a, c, (1.0 - c) * b}, + { 0.0, 0.0, 1.0}, + }}; + break; + + case PL_CONE_S: + // Solve to preserve neutral and red + a = (lms_r[2] - lms_r[1] * lms_w[2] / lms_w[1]) / + (lms_r[0] - lms_r[1] * lms_w[0] / lms_w[1]); + b = (lms_r[2] - lms_r[0] * lms_w[2] / lms_w[0]) / + (lms_r[1] - lms_r[0] * lms_w[1] / lms_w[0]); + assert(fabs(a * lms_w[0] + b * lms_w[1] - lms_w[2]) < 1e-6); + + distort = (pl_matrix3x3) {{ + { 1.0, 0.0, 0.0}, + { 0.0, 1.0, 0.0}, + {(1.0 - c) * a, (1.0 - c) * b, c}, + }}; + break; + + case PL_CONE_LM: + // Solve to preserve neutral + a = 
lms_w[0] / lms_w[2]; + b = lms_w[1] / lms_w[2]; + + distort = (pl_matrix3x3) {{ + { c, 0.0, (1.0 - c) * a}, + { 0.0, c, (1.0 - c) * b}, + { 0.0, 0.0, 1.0}, + }}; + break; + + case PL_CONE_MS: + // Solve to preserve neutral + a = lms_w[1] / lms_w[0]; + b = lms_w[2] / lms_w[0]; + + distort = (pl_matrix3x3) {{ + { 1.0, 0.0, 0.0}, + {(1.0 - c) * a, c, 0.0}, + {(1.0 - c) * b, 0.0, c}, + }}; + break; + + case PL_CONE_LS: + // Solve to preserve neutral + a = lms_w[0] / lms_w[1]; + b = lms_w[2] / lms_w[1]; + + distort = (pl_matrix3x3) {{ + { c, (1.0 - c) * a, 0.0}, + { 0.0, 1.0, 0.0}, + { 0.0, (1.0 - c) * b, c}, + }}; + break; + + case PL_CONE_LMS: { + // Rod cells only, which can be modelled somewhat as a combination of + // L and M cones. Either way, this is pushing the limits of the our + // color model, so this is only a rough approximation. + const float w[3] = {0.3605, 0.6415, -0.002}; + assert(fabs(w[0] + w[1] + w[2] - 1.0) < 1e-6); + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + distort.m[i][j] = (1.0 - c) * w[j] * lms_w[i] / lms_w[j]; + if (i == j) + distort.m[i][j] += c; + } + } + break; + } + + default: + pl_unreachable(); + } + + // out := RGB<-LMS * distort * LMS<-RGB + pl_matrix3x3 out = rgb2lms; + pl_matrix3x3_invert(&out); + pl_matrix3x3_mul(&out, &distort); + pl_matrix3x3_mul(&out, &rgb2lms); + + return out; +} + +pl_matrix3x3 pl_get_color_mapping_matrix(const struct pl_raw_primaries *src, + const struct pl_raw_primaries *dst, + enum pl_rendering_intent intent) +{ + // In saturation mapping, we don't care about accuracy and just want + // primaries to map to primaries, making this an identity transformation. + if (intent == PL_INTENT_SATURATION) + return pl_matrix3x3_identity; + + // RGBd<-RGBs = RGBd<-XYZd * XYZd<-XYZs * XYZs<-RGBs + // Equations from: http://www.brucelindbloom.com/index.html?Math.html + // Note: Perceptual is treated like relative colorimetric. There's no + // definition for perceptual other than "make it look good". 
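/*
 * [Illustrative usage sketch, not part of the upstream diff.] A typical use of
 * pl_get_color_mapping_matrix() is converting decoded RGB between two sets of
 * primaries, e.g. BT.709 content expressed in BT.2020. The relative
 * colorimetric intent name below is assumed from the public enum.
 */
#include <libplacebo/colorspace.h>

static void gamut_matrix_example(void)
{
    const struct pl_raw_primaries *bt709  = pl_raw_primaries_get(PL_COLOR_PRIM_BT_709);
    const struct pl_raw_primaries *bt2020 = pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020);

    pl_matrix3x3 m = pl_get_color_mapping_matrix(bt709, bt2020,
                                                 PL_INTENT_RELATIVE_COLORIMETRIC);

    float rgb[3] = { 1.0f, 0.0f, 0.0f };  // pure BT.709 red
    pl_matrix3x3_apply(&m, rgb);          // the same stimulus, expressed in BT.2020
}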
+ + // RGBd<-XYZd matrix + pl_matrix3x3 xyz2rgb_d = pl_get_xyz2rgb_matrix(dst); + + // Chromatic adaptation, except in absolute colorimetric intent + if (intent != PL_INTENT_ABSOLUTE_COLORIMETRIC) + apply_chromatic_adaptation(src->white, dst->white, &xyz2rgb_d); + + // XYZs<-RGBs + pl_matrix3x3 rgb2xyz_s = pl_get_rgb2xyz_matrix(src); + pl_matrix3x3_mul(&xyz2rgb_d, &rgb2xyz_s); + return xyz2rgb_d; +} + +// Test the sign of 'p' relative to the line 'ab' (barycentric coordinates) +static float test_point_line(const struct pl_cie_xy p, + const struct pl_cie_xy a, + const struct pl_cie_xy b) +{ + return (p.x - b.x) * (a.y - b.y) - (a.x - b.x) * (p.y - b.y); +} + +// Test if a point is entirely inside a gamut +static float test_point_gamut(struct pl_cie_xy point, + const struct pl_raw_primaries *prim) +{ + float d1 = test_point_line(point, prim->red, prim->green), + d2 = test_point_line(point, prim->green, prim->blue), + d3 = test_point_line(point, prim->blue, prim->red); + + bool has_neg = d1 < -1e-6f || d2 < -1e-6f || d3 < -1e-6f, + has_pos = d1 > 1e-6f || d2 > 1e-6f || d3 > 1e-6f; + + return !(has_neg && has_pos); +} + +bool pl_primaries_superset(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b) +{ + return test_point_gamut(b->red, a) && + test_point_gamut(b->green, a) && + test_point_gamut(b->blue, a); +} + +bool pl_primaries_valid(const struct pl_raw_primaries *prim) +{ + // Test to see if the primaries form a valid triangle (nonzero area) + float area = (prim->blue.x - prim->green.x) * (prim->red.y - prim->green.y) + - (prim->red.x - prim->green.x) * (prim->blue.y - prim->green.y); + + return fabs(area) > 1e-6 && test_point_gamut(prim->white, prim); +} + +static inline float xy_dist2(struct pl_cie_xy a, struct pl_cie_xy b) +{ + const float dx = a.x - b.x, dy = a.y - b.y; + return dx * dx + dy * dy; +} + +bool pl_primaries_compatible(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b) +{ + float RR = xy_dist2(a->red, b->red), RG = xy_dist2(a->red, b->green), + RB = xy_dist2(a->red, b->blue), GG = xy_dist2(a->green, b->green), + GB = xy_dist2(a->green, b->blue), BB = xy_dist2(a->blue, b->blue); + return RR < RG && RR < RB && GG < RG && GG < GB && BB < RB && BB < GB; +} + +// returns the intersection of the two lines defined by ab and cd +static struct pl_cie_xy intersection(struct pl_cie_xy a, struct pl_cie_xy b, + struct pl_cie_xy c, struct pl_cie_xy d) +{ + float det = (a.x - b.x) * (c.y - d.y) - (a.y - b.y) * (c.x - d.x); + float t = ((a.x - c.x) * (c.y - d.y) - (a.y - c.y) * (c.x - d.x)) / det; + return (struct pl_cie_xy) { + .x = t ? a.x + t * (b.x - a.x) : 0.0f, + .y = t ? 
a.y + t * (b.y - a.y) : 0.0f, + }; +} + +// x, y, z specified in clockwise order, with a, b, c being the enclosing gamut +static struct pl_cie_xy +clip_point(struct pl_cie_xy x, struct pl_cie_xy y, struct pl_cie_xy z, + struct pl_cie_xy a, struct pl_cie_xy b, struct pl_cie_xy c) +{ + const float d1 = test_point_line(y, a, b); + const float d2 = test_point_line(y, b, c); + if (d1 <= 0.0f && d2 <= 0.0f) { + return y; // already inside triangle + } else if (d1 > 0.0f && d2 > 0.0f) { + return b; // target vertex fully enclosed + } else if (d1 > 0.0f) { + return intersection(a, b, y, z); + } else { + return intersection(x, y, b, c); + } +} + +struct pl_raw_primaries pl_primaries_clip(const struct pl_raw_primaries *src, + const struct pl_raw_primaries *dst) +{ + return (struct pl_raw_primaries) { + .red = clip_point(src->green, src->red, src->blue, + dst->green, dst->red, dst->blue), + .green = clip_point(src->blue, src->green, src->red, + dst->blue, dst->green, dst->red), + .blue = clip_point(src->red, src->blue, src->green, + dst->red, dst->blue, dst->green), + .white = src->white, + }; +} + +/* Fill in the Y, U, V vectors of a yuv-to-rgb conversion matrix + * based on the given luma weights of the R, G and B components (lr, lg, lb). + * lr+lg+lb is assumed to equal 1. + * This function is meant for colorspaces satisfying the following + * conditions (which are true for common YUV colorspaces): + * - The mapping from input [Y, U, V] to output [R, G, B] is linear. + * - Y is the vector [1, 1, 1]. (meaning input Y component maps to 1R+1G+1B) + * - U maps to a value with zero R and positive B ([0, x, y], y > 0; + * i.e. blue and green only). + * - V maps to a value with zero B and positive R ([x, y, 0], x > 0; + * i.e. red and green only). + * - U and V are orthogonal to the luma vector [lr, lg, lb]. + * - The magnitudes of the vectors U and V are the minimal ones for which + * the image of the set Y=[0...1],U=[-0.5...0.5],V=[-0.5...0.5] under the + * conversion function will cover the set R=[0...1],G=[0...1],B=[0...1] + * (the resulting matrix can be converted for other input/output ranges + * outside this function). + * Under these conditions the given parameters lr, lg, lb uniquely + * determine the mapping of Y, U, V to R, G, B. + */ +static pl_matrix3x3 luma_coeffs(float lr, float lg, float lb) +{ + pl_assert(fabs(lr+lg+lb - 1) < 1e-6); + return (pl_matrix3x3) {{ + {1, 0, 2 * (1-lr) }, + {1, -2 * (1-lb) * lb/lg, -2 * (1-lr) * lr/lg }, + {1, 2 * (1-lb), 0 }, + }}; +} + +// Applies hue and saturation controls to a YCbCr->RGB matrix +static inline void apply_hue_sat(pl_matrix3x3 *m, + const struct pl_color_adjustment *params) +{ + // Hue is equivalent to rotating input [U, V] subvector around the origin. + // Saturation scales [U, V]. 
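/*
 * [Illustrative worked example, not part of the upstream diff.] Plugging the
 * BT.709 luma weights (lr = 0.2126, lg = 0.7152, lb = 0.0722) into
 * luma_coeffs() above reproduces the familiar YCbCr -> R'G'B' constants:
 *
 *   V -> R:   2 * (1 - lr)            =  1.5748
 *   U -> B:   2 * (1 - lb)            =  1.8556
 *   U -> G:  -2 * (1 - lb) * lb / lg  = -0.18733
 *   V -> G:  -2 * (1 - lr) * lr / lg  = -0.46812
 *
 * i.e. m.m[0][2] == 1.5748 and m.m[2][1] == 1.8556, matching the coefficients
 * quoted in the usual BT.709 conversion formulas.
 */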
+ float huecos = params->saturation * cos(params->hue); + float huesin = params->saturation * sin(params->hue); + for (int i = 0; i < 3; i++) { + float u = m->m[i][1], v = m->m[i][2]; + m->m[i][1] = huecos * u - huesin * v; + m->m[i][2] = huesin * u + huecos * v; + } +} + +pl_transform3x3 pl_color_repr_decode(struct pl_color_repr *repr, + const struct pl_color_adjustment *params) +{ + params = PL_DEF(params, &pl_color_adjustment_neutral); + + pl_matrix3x3 m; + switch (repr->sys) { + case PL_COLOR_SYSTEM_BT_709: m = luma_coeffs(0.2126, 0.7152, 0.0722); break; + case PL_COLOR_SYSTEM_BT_601: m = luma_coeffs(0.2990, 0.5870, 0.1140); break; + case PL_COLOR_SYSTEM_SMPTE_240M: m = luma_coeffs(0.2122, 0.7013, 0.0865); break; + case PL_COLOR_SYSTEM_BT_2020_NC: m = luma_coeffs(0.2627, 0.6780, 0.0593); break; + case PL_COLOR_SYSTEM_BT_2020_C: + // Note: This outputs into the [-0.5,0.5] range for chroma information. + m = (pl_matrix3x3) {{ + {0, 0, 1}, + {1, 0, 0}, + {0, 1, 0}, + }}; + break; + case PL_COLOR_SYSTEM_BT_2100_PQ: { + // Reversed from the matrix in the spec, hard-coded for efficiency + // and precision reasons. Exact values truncated from ITU-T H-series + // Supplement 18. + static const float lm_t = 0.008609, lm_p = 0.111029625; + m = (pl_matrix3x3) {{ + {1.0, lm_t, lm_p}, + {1.0, -lm_t, -lm_p}, + {1.0, 0.560031, -0.320627}, + }}; + break; + } + case PL_COLOR_SYSTEM_BT_2100_HLG: { + // Similar to BT.2100 PQ, exact values truncated from WolframAlpha + static const float lm_t = 0.01571858011, lm_p = 0.2095810681; + m = (pl_matrix3x3) {{ + {1.0, lm_t, lm_p}, + {1.0, -lm_t, -lm_p}, + {1.0, 1.02127108, -0.605274491}, + }}; + break; + } + case PL_COLOR_SYSTEM_DOLBYVISION: + m = repr->dovi->nonlinear; + break; + case PL_COLOR_SYSTEM_YCGCO: + m = (pl_matrix3x3) {{ + {1, -1, 1}, + {1, 1, 0}, + {1, -1, -1}, + }}; + break; + case PL_COLOR_SYSTEM_UNKNOWN: // fall through + case PL_COLOR_SYSTEM_RGB: + m = pl_matrix3x3_identity; + break; + case PL_COLOR_SYSTEM_XYZ: { + // For lack of anything saner to do, just assume the caller wants + // DCI-P3 primaries, which is a reasonable assumption. + const struct pl_raw_primaries *dst = pl_raw_primaries_get(PL_COLOR_PRIM_DCI_P3); + m = pl_get_xyz2rgb_matrix(dst); + // DCDM X'Y'Z' is expected to have equal energy white point (EG 432-1 Annex H) + apply_chromatic_adaptation((struct pl_cie_xy)CIE_E, dst->white, &m); + break; + } + case PL_COLOR_SYSTEM_COUNT: + pl_unreachable(); + } + + // Apply hue and saturation in the correct way depending on the colorspace. 
+ if (pl_color_system_is_ycbcr_like(repr->sys)) { + apply_hue_sat(&m, params); + } else if (params->saturation != 1.0 || params->hue != 0.0) { + // Arbitrarily simulate hue shifts using the BT.709 YCbCr model + pl_matrix3x3 yuv2rgb = luma_coeffs(0.2126, 0.7152, 0.0722); + pl_matrix3x3 rgb2yuv = yuv2rgb; + pl_matrix3x3_invert(&rgb2yuv); + apply_hue_sat(&yuv2rgb, params); + // M := RGB<-YUV * YUV<-RGB * M + pl_matrix3x3_rmul(&rgb2yuv, &m); + pl_matrix3x3_rmul(&yuv2rgb, &m); + } + + // Apply color temperature adaptation, relative to BT.709 primaries + if (params->temperature) { + struct pl_cie_xy src = pl_white_from_temp(6500); + struct pl_cie_xy dst = pl_white_from_temp(6500 + 3500 * params->temperature); + pl_matrix3x3 adapt = pl_get_adaptation_matrix(src, dst); + pl_matrix3x3_rmul(&adapt, &m); + } + + pl_transform3x3 out = { .mat = m }; + int bit_depth = PL_DEF(repr->bits.sample_depth, + PL_DEF(repr->bits.color_depth, 8)); + + double ymax, ymin, cmax, cmid; + double scale = (1LL << bit_depth) / ((1LL << bit_depth) - 1.0); + + switch (pl_color_levels_guess(repr)) { + case PL_COLOR_LEVELS_LIMITED: { + ymax = 235 / 256. * scale; + ymin = 16 / 256. * scale; + cmax = 240 / 256. * scale; + cmid = 128 / 256. * scale; + break; + } + case PL_COLOR_LEVELS_FULL: + // Note: For full-range YUV, there are multiple, subtly inconsistent + // standards. So just pick the sanest implementation, which is to + // assume MAX_INT == 1.0. + ymax = 1.0; + ymin = 0.0; + cmax = 1.0; + cmid = 128 / 256. * scale; // *not* exactly 0.5 + break; + default: + pl_unreachable(); + } + + double ymul = 1.0 / (ymax - ymin); + double cmul = 0.5 / (cmax - cmid); + + double mul[3] = { ymul, ymul, ymul }; + double black[3] = { ymin, ymin, ymin }; + +#ifdef PL_HAVE_DOVI + if (repr->sys == PL_COLOR_SYSTEM_DOLBYVISION) { + // The RPU matrix already includes levels normalization, but in this + // case we also have to respect the signalled color offsets + for (int i = 0; i < 3; i++) { + mul[i] = 1.0; + black[i] = repr->dovi->nonlinear_offset[i] * scale; + } + } else +#endif + if (pl_color_system_is_ycbcr_like(repr->sys)) { + mul[1] = mul[2] = cmul; + black[1] = black[2] = cmid; + } + + // Contrast scales the output value range (gain) + // Brightness scales the constant output bias (black lift/boost) + for (int i = 0; i < 3; i++) { + mul[i] *= params->contrast; + out.c[i] += params->brightness; + } + + // Multiply in the texture multiplier and adjust `c` so that black[j] keeps + // on mapping to RGB=0 (black to black) + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + out.mat.m[i][j] *= mul[j]; + out.c[i] -= out.mat.m[i][j] * black[j]; + } + } + + // Finally, multiply in the scaling factor required to get the color up to + // the correct representation. + pl_matrix3x3_scale(&out.mat, pl_color_repr_normalize(repr)); + + // Update the metadata to reflect the change. + repr->sys = PL_COLOR_SYSTEM_RGB; + repr->levels = PL_COLOR_LEVELS_FULL; + + return out; +} + +bool pl_icc_profile_equal(const struct pl_icc_profile *p1, + const struct pl_icc_profile *p2) +{ + if (p1->len != p2->len) + return false; + + // Ignore signatures on length-0 profiles, as a special case + return !p1->len || p1->signature == p2->signature; +} + +void pl_icc_profile_compute_signature(struct pl_icc_profile *profile) +{ + if (!profile->len) + profile->signature = 0; + + // In theory, we could get this value from the profile header itself if + // lcms is available, but I'm not sure if it's even worth the trouble. 
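/*
 * [Illustrative usage sketch, not part of the upstream diff.] Putting
 * pl_color_repr_decode() to use: decoding 8-bit limited-range BT.709 YCbCr
 * into full-range (still gamma-encoded) RGB with neutral color adjustments.
 */
#include <libplacebo/colorspace.h>

static void repr_decode_example(void)
{
    struct pl_color_repr repr = pl_color_repr_hdtv;  // BT.709, limited range
    repr.bits = (struct pl_bit_encoding) {
        .sample_depth = 8,
        .color_depth  = 8,
    };

    pl_transform3x3 tr = pl_color_repr_decode(&repr, NULL); // NULL: neutral adjustments

    // tr.mat * [Y', Cb, Cr] + tr.c now maps normalized texture samples to
    // full-range R'G'B'; `repr` has been updated to RGB / full-range to match.
    (void) tr;
}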
Just + // hard-code this to a pl_mem_hash(), which is decently fast anyway. + profile->signature = pl_mem_hash(profile->data, profile->len); +} diff --git a/src/common.c b/src/common.c new file mode 100644 index 0000000..8c8a4f0 --- /dev/null +++ b/src/common.c @@ -0,0 +1,500 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" +#include "version.h" + +#include <libplacebo/common.h> + +int pl_fix_ver(void) +{ + return BUILD_FIX_VER; +} + +const char *pl_version(void) +{ + return BUILD_VERSION; +} + +void pl_rect2d_normalize(pl_rect2d *rc) +{ + *rc = (pl_rect2d) { + .x0 = PL_MIN(rc->x0, rc->x1), + .x1 = PL_MAX(rc->x0, rc->x1), + .y0 = PL_MIN(rc->y0, rc->y1), + .y1 = PL_MAX(rc->y0, rc->y1), + }; +} + +void pl_rect3d_normalize(pl_rect3d *rc) +{ + *rc = (pl_rect3d) { + .x0 = PL_MIN(rc->x0, rc->x1), + .x1 = PL_MAX(rc->x0, rc->x1), + .y0 = PL_MIN(rc->y0, rc->y1), + .y1 = PL_MAX(rc->y0, rc->y1), + .z0 = PL_MIN(rc->z0, rc->z1), + .z1 = PL_MAX(rc->z0, rc->z1), + }; +} + +void pl_rect2df_normalize(pl_rect2df *rc) +{ + *rc = (pl_rect2df) { + .x0 = PL_MIN(rc->x0, rc->x1), + .x1 = PL_MAX(rc->x0, rc->x1), + .y0 = PL_MIN(rc->y0, rc->y1), + .y1 = PL_MAX(rc->y0, rc->y1), + }; +} + +void pl_rect3df_normalize(pl_rect3df *rc) +{ + *rc = (pl_rect3df) { + .x0 = PL_MIN(rc->x0, rc->x1), + .x1 = PL_MAX(rc->x0, rc->x1), + .y0 = PL_MIN(rc->y0, rc->y1), + .y1 = PL_MAX(rc->y0, rc->y1), + .z0 = PL_MIN(rc->z0, rc->z1), + .z1 = PL_MAX(rc->z0, rc->z1), + }; +} + +pl_rect2d pl_rect2df_round(const pl_rect2df *rc) +{ + return (pl_rect2d) { + .x0 = roundf(rc->x0), + .x1 = roundf(rc->x1), + .y0 = roundf(rc->y0), + .y1 = roundf(rc->y1), + }; +} + +pl_rect3d pl_rect3df_round(const pl_rect3df *rc) +{ + return (pl_rect3d) { + .x0 = roundf(rc->x0), + .x1 = roundf(rc->x1), + .y0 = roundf(rc->y0), + .y1 = roundf(rc->y1), + .z0 = roundf(rc->z0), + .z1 = roundf(rc->z1), + }; +} + +const pl_matrix3x3 pl_matrix3x3_identity = {{ + { 1, 0, 0 }, + { 0, 1, 0 }, + { 0, 0, 1 }, +}}; + +void pl_matrix3x3_apply(const pl_matrix3x3 *mat, float vec[3]) +{ + float x = vec[0], y = vec[1], z = vec[2]; + + for (int i = 0; i < 3; i++) + vec[i] = mat->m[i][0] * x + mat->m[i][1] * y + mat->m[i][2] * z; +} + +void pl_matrix3x3_apply_rc(const pl_matrix3x3 *mat, pl_rect3df *rc) +{ + float x0 = rc->x0, x1 = rc->x1, + y0 = rc->y0, y1 = rc->y1, + z0 = rc->z0, z1 = rc->z1; + + rc->x0 = mat->m[0][0] * x0 + mat->m[0][1] * y0 + mat->m[0][2] * z0; + rc->y0 = mat->m[1][0] * x0 + mat->m[1][1] * y0 + mat->m[1][2] * z0; + rc->z0 = mat->m[2][0] * x0 + mat->m[2][1] * y0 + mat->m[2][2] * z0; + + rc->x1 = mat->m[0][0] * x1 + mat->m[0][1] * y1 + mat->m[0][2] * z1; + rc->y1 = mat->m[1][0] * x1 + mat->m[1][1] * y1 + mat->m[1][2] * z1; + rc->z1 = mat->m[2][0] * x1 + mat->m[2][1] * y1 + mat->m[2][2] * z1; +} + +void pl_matrix3x3_scale(pl_matrix3x3 *mat, float scale) +{ + for 
(int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) + mat->m[i][j] *= scale; + } +} + +void pl_matrix3x3_invert(pl_matrix3x3 *mat) +{ + double m00 = mat->m[0][0], m01 = mat->m[0][1], m02 = mat->m[0][2], + m10 = mat->m[1][0], m11 = mat->m[1][1], m12 = mat->m[1][2], + m20 = mat->m[2][0], m21 = mat->m[2][1], m22 = mat->m[2][2]; + + // calculate the adjoint + double a00 = (m11 * m22 - m21 * m12); + double a01 = -(m01 * m22 - m21 * m02); + double a02 = (m01 * m12 - m11 * m02); + double a10 = -(m10 * m22 - m20 * m12); + double a11 = (m00 * m22 - m20 * m02); + double a12 = -(m00 * m12 - m10 * m02); + double a20 = (m10 * m21 - m20 * m11); + double a21 = -(m00 * m21 - m20 * m01); + double a22 = (m00 * m11 - m10 * m01); + + // calculate the determinant (as inverse == 1/det * adjoint, + // adjoint * m == identity * det, so this calculates the det) + double det = m00 * a00 + m10 * a01 + m20 * a02; + det = 1.0 / det; + + mat->m[0][0] = det * a00; + mat->m[0][1] = det * a01; + mat->m[0][2] = det * a02; + mat->m[1][0] = det * a10; + mat->m[1][1] = det * a11; + mat->m[1][2] = det * a12; + mat->m[2][0] = det * a20; + mat->m[2][1] = det * a21; + mat->m[2][2] = det * a22; +} + +void pl_matrix3x3_mul(pl_matrix3x3 *a, const pl_matrix3x3 *b) +{ + float a00 = a->m[0][0], a01 = a->m[0][1], a02 = a->m[0][2], + a10 = a->m[1][0], a11 = a->m[1][1], a12 = a->m[1][2], + a20 = a->m[2][0], a21 = a->m[2][1], a22 = a->m[2][2]; + + for (int i = 0; i < 3; i++) { + a->m[0][i] = a00 * b->m[0][i] + a01 * b->m[1][i] + a02 * b->m[2][i]; + a->m[1][i] = a10 * b->m[0][i] + a11 * b->m[1][i] + a12 * b->m[2][i]; + a->m[2][i] = a20 * b->m[0][i] + a21 * b->m[1][i] + a22 * b->m[2][i]; + } +} + +void pl_matrix3x3_rmul(const pl_matrix3x3 *a, pl_matrix3x3 *b) +{ + pl_matrix3x3 m = *a; + pl_matrix3x3_mul(&m, b); + *b = m; +} + +const pl_transform3x3 pl_transform3x3_identity = { + .mat = {{ + { 1, 0, 0 }, + { 0, 1, 0 }, + { 0, 0, 1 }, + }}, +}; + +void pl_transform3x3_apply(const pl_transform3x3 *t, float vec[3]) +{ + pl_matrix3x3_apply(&t->mat, vec); + + for (int i = 0; i < 3; i++) + vec[i] += t->c[i]; +} + +void pl_transform3x3_apply_rc(const pl_transform3x3 *t, pl_rect3df *rc) +{ + pl_matrix3x3_apply_rc(&t->mat, rc); + + rc->x0 += t->c[0]; + rc->x1 += t->c[0]; + rc->y0 += t->c[1]; + rc->y1 += t->c[1]; + rc->z0 += t->c[2]; + rc->z1 += t->c[2]; +} + +void pl_transform3x3_scale(pl_transform3x3 *t, float scale) +{ + pl_matrix3x3_scale(&t->mat, scale); + + for (int i = 0; i < 3; i++) + t->c[i] *= scale; +} + +// based on DarkPlaces engine (relicensed from GPL to LGPL) +void pl_transform3x3_invert(pl_transform3x3 *t) +{ + pl_matrix3x3_invert(&t->mat); + + float m00 = t->mat.m[0][0], m01 = t->mat.m[0][1], m02 = t->mat.m[0][2], + m10 = t->mat.m[1][0], m11 = t->mat.m[1][1], m12 = t->mat.m[1][2], + m20 = t->mat.m[2][0], m21 = t->mat.m[2][1], m22 = t->mat.m[2][2]; + + // fix the constant coefficient + // rgb = M * yuv + C + // M^-1 * rgb = yuv + M^-1 * C + // yuv = M^-1 * rgb - M^-1 * C + // ^^^^^^^^^^ + float c0 = t->c[0], c1 = t->c[1], c2 = t->c[2]; + t->c[0] = -(m00 * c0 + m01 * c1 + m02 * c2); + t->c[1] = -(m10 * c0 + m11 * c1 + m12 * c2); + t->c[2] = -(m20 * c0 + m21 * c1 + m22 * c2); +} + +const pl_matrix2x2 pl_matrix2x2_identity = {{ + { 1, 0 }, + { 0, 1 }, +}}; + +pl_matrix2x2 pl_matrix2x2_rotation(float a) +{ + return (pl_matrix2x2) {{ + { cosf(a), -sinf(a) }, + { sinf(a), cosf(a) }, + }}; +} + +void pl_matrix2x2_apply(const pl_matrix2x2 *mat, float vec[2]) +{ + float x = vec[0], y = vec[1]; + + for (int i = 0; i < 2; i++) + vec[i] = 
mat->m[i][0] * x + mat->m[i][1] * y; +} + +void pl_matrix2x2_apply_rc(const pl_matrix2x2 *mat, pl_rect2df *rc) +{ + float x0 = rc->x0, x1 = rc->x1, + y0 = rc->y0, y1 = rc->y1; + + rc->x0 = mat->m[0][0] * x0 + mat->m[0][1] * y0; + rc->y0 = mat->m[1][0] * x0 + mat->m[1][1] * y0; + + rc->x1 = mat->m[0][0] * x1 + mat->m[0][1] * y1; + rc->y1 = mat->m[1][0] * x1 + mat->m[1][1] * y1; +} + +void pl_matrix2x2_mul(pl_matrix2x2 *a, const pl_matrix2x2 *b) +{ + float a00 = a->m[0][0], a01 = a->m[0][1], + a10 = a->m[1][0], a11 = a->m[1][1]; + + for (int i = 0; i < 2; i++) { + a->m[0][i] = a00 * b->m[0][i] + a01 * b->m[1][i]; + a->m[1][i] = a10 * b->m[0][i] + a11 * b->m[1][i]; + } +} + +void pl_matrix2x2_rmul(const pl_matrix2x2 *a, pl_matrix2x2 *b) +{ + pl_matrix2x2 m = *a; + pl_matrix2x2_mul(&m, b); + *b = m; +} + +void pl_matrix2x2_scale(pl_matrix2x2 *mat, float scale) +{ + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) + mat->m[i][j] *= scale; + } +} + +void pl_matrix2x2_invert(pl_matrix2x2 *mat) +{ + float m00 = mat->m[0][0], m01 = mat->m[0][1], + m10 = mat->m[1][0], m11 = mat->m[1][1]; + float invdet = 1.0f / (m11 * m00 - m10 * m01); + + mat->m[0][0] = m11 * invdet; + mat->m[0][1] = -m01 * invdet; + mat->m[1][0] = -m10 * invdet; + mat->m[1][1] = m00 * invdet; +} + +const pl_transform2x2 pl_transform2x2_identity = { + .mat = {{ + { 1, 0 }, + { 0, 1 }, + }}, +}; + +void pl_transform2x2_apply(const pl_transform2x2 *t, float vec[2]) +{ + pl_matrix2x2_apply(&t->mat, vec); + + for (int i = 0; i < 2; i++) + vec[i] += t->c[i]; +} + +void pl_transform2x2_apply_rc(const pl_transform2x2 *t, pl_rect2df *rc) +{ + pl_matrix2x2_apply_rc(&t->mat, rc); + + rc->x0 += t->c[0]; + rc->x1 += t->c[0]; + rc->y0 += t->c[1]; + rc->y1 += t->c[1]; +} + +void pl_transform2x2_mul(pl_transform2x2 *a, const pl_transform2x2 *b) +{ + float c[2] = { b->c[0], b->c[1] }; + pl_transform2x2_apply(a, c); + memcpy(a->c, c, sizeof(c)); + pl_matrix2x2_mul(&a->mat, &b->mat); +} + +void pl_transform2x2_rmul(const pl_transform2x2 *a, pl_transform2x2 *b) +{ + pl_transform2x2_apply(a, b->c); + pl_matrix2x2_rmul(&a->mat, &b->mat); +} + +void pl_transform2x2_scale(pl_transform2x2 *t, float scale) +{ + pl_matrix2x2_scale(&t->mat, scale); + + for (int i = 0; i < 2; i++) + t->c[i] *= scale; +} + +void pl_transform2x2_invert(pl_transform2x2 *t) +{ + pl_matrix2x2_invert(&t->mat); + + float m00 = t->mat.m[0][0], m01 = t->mat.m[0][1], + m10 = t->mat.m[1][0], m11 = t->mat.m[1][1]; + float c0 = t->c[0], c1 = t->c[1]; + t->c[0] = -(m00 * c0 + m01 * c1); + t->c[1] = -(m10 * c0 + m11 * c1); +} + +pl_rect2df pl_transform2x2_bounds(const pl_transform2x2 *t, const pl_rect2df *rc) +{ + float p[4][2] = { + { rc->x0, rc->y0 }, + { rc->x0, rc->y1 }, + { rc->x1, rc->y0 }, + { rc->x1, rc->y1 }, + }; + for (int i = 0; i < PL_ARRAY_SIZE(p); i++) + pl_transform2x2_apply(t, p[i]); + + return (pl_rect2df) { + .x0 = fminf(fminf(p[0][0], p[1][0]), fminf(p[2][0], p[3][0])), + .x1 = fmaxf(fmaxf(p[0][0], p[1][0]), fmaxf(p[2][0], p[3][0])), + .y0 = fminf(fminf(p[0][1], p[1][1]), fminf(p[2][1], p[3][1])), + .y1 = fmaxf(fmaxf(p[0][1], p[1][1]), fmaxf(p[2][1], p[3][1])), + }; +} + +float pl_rect2df_aspect(const pl_rect2df *rc) +{ + float w = fabsf(pl_rect_w(*rc)), h = fabsf(pl_rect_h(*rc)); + return h ? 
(w / h) : 0.0; +} + +void pl_rect2df_aspect_set(pl_rect2df *rc, float aspect, float panscan) +{ + pl_assert(aspect >= 0); + float orig_aspect = pl_rect2df_aspect(rc); + if (!aspect || !orig_aspect) + return; + + float scale_x, scale_y; + if (aspect > orig_aspect) { + // New aspect is wider than the original, so we need to either grow in + // scale_x (panscan=1) or shrink in scale_y (panscan=0) + scale_x = powf(aspect / orig_aspect, panscan); + scale_y = powf(aspect / orig_aspect, panscan - 1.0); + } else if (aspect < orig_aspect) { + // New aspect is taller, so either grow in scale_y (panscan=1) or + // shrink in scale_x (panscan=0) + scale_x = powf(orig_aspect / aspect, panscan - 1.0); + scale_y = powf(orig_aspect / aspect, panscan); + } else { + return; // No change in aspect + } + + pl_rect2df_stretch(rc, scale_x, scale_y); +} + +void pl_rect2df_aspect_fit(pl_rect2df *rc, const pl_rect2df *src, float panscan) +{ + float orig_w = fabs(pl_rect_w(*rc)), + orig_h = fabs(pl_rect_h(*rc)); + if (!orig_w || !orig_h) + return; + + // If either one of these is larger than 1, then we need to shrink to fit, + // otherwise we can just directly stretch the rect. + float scale_x = fabs(pl_rect_w(*src)) / orig_w, + scale_y = fabs(pl_rect_h(*src)) / orig_h; + + if (scale_x > 1.0 || scale_y > 1.0) { + pl_rect2df_aspect_copy(rc, src, panscan); + } else { + pl_rect2df_stretch(rc, scale_x, scale_y); + } +} + +void pl_rect2df_stretch(pl_rect2df *rc, float stretch_x, float stretch_y) +{ + float midx = (rc->x0 + rc->x1) / 2.0, + midy = (rc->y0 + rc->y1) / 2.0; + + rc->x0 = rc->x0 * stretch_x + midx * (1.0 - stretch_x); + rc->x1 = rc->x1 * stretch_x + midx * (1.0 - stretch_x); + rc->y0 = rc->y0 * stretch_y + midy * (1.0 - stretch_y); + rc->y1 = rc->y1 * stretch_y + midy * (1.0 - stretch_y); +} + +void pl_rect2df_offset(pl_rect2df *rc, float offset_x, float offset_y) +{ + if (rc->x1 < rc->x0) + offset_x = -offset_x; + if (rc->y1 < rc->y0) + offset_y = -offset_y; + + rc->x0 += offset_x; + rc->x1 += offset_x; + rc->y0 += offset_y; + rc->y1 += offset_y; +} + +void pl_rect2df_rotate(pl_rect2df *rc, pl_rotation rot) +{ + if (!(rot = pl_rotation_normalize(rot))) + return; + + float x0 = rc->x0, y0 = rc->y0, x1 = rc->x1, y1 = rc->y1; + if (rot >= PL_ROTATION_180) { + rot -= PL_ROTATION_180; + PL_SWAP(x0, x1); + PL_SWAP(y0, y1); + } + + switch (rot) { + case PL_ROTATION_0: + *rc = (pl_rect2df) { + .x0 = x0, + .y0 = y0, + .x1 = x1, + .y1 = y1, + }; + return; + case PL_ROTATION_90: + *rc = (pl_rect2df) { + .x0 = y1, + .y0 = x0, + .x1 = y0, + .y1 = x1, + }; + return; + default: pl_unreachable(); + } +} diff --git a/src/common.h b/src/common.h new file mode 100644 index 0000000..0cac24d --- /dev/null +++ b/src/common.h @@ -0,0 +1,191 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#define __STDC_FORMAT_MACROS + +#ifdef __cplusplus +#include <version> +#endif + +#if !defined(__cplusplus) || defined(__cpp_lib_stdatomic_h) +#define PL_HAVE_STDATOMIC +#endif + +#ifdef PL_HAVE_STDATOMIC +#include <stdatomic.h> +#endif +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <inttypes.h> + +#if defined(__MINGW32__) && !defined(__clang__) +#define PL_PRINTF(fmt, va) __attribute__ ((format(gnu_printf, fmt, va))) \ + __attribute__ ((nonnull(fmt))) +#elif defined(__GNUC__) +#define PL_PRINTF(fmt, va) __attribute__ ((format(printf, fmt, va))) \ + __attribute__ ((nonnull(fmt))) +#else +#define PL_PRINTF(fmt, va) +#endif + +#define PL_NOINLINE __attribute__((noinline)) + +#include "os.h" + +#include "config_internal.h" + +#define PL_DEPRECATED + +#include <libplacebo/config.h> + +#include "pl_assert.h" +#include "pl_alloc.h" +#include "pl_clock.h" +#include "pl_string.h" + +#if PL_API_VER != BUILD_API_VER +#error Header mismatch? <libplacebo/config.h> pulled from elsewhere! +#endif + +// Divide a number while rounding up (careful: double-eval) +#define PL_DIV_UP(x, y) (((x) + (y) - 1) / (y)) + +// Align up to the nearest multiple of an arbitrary alignment, which may also +// be 0 to signal no alignment requirements. +#define PL_ALIGN(x, align) ((align) ? PL_DIV_UP(x, align) * (align) : (x)) + +// This is faster but must only be called on positive powers of two. +#define PL_ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1)) + +// Returns the log base 2 of an unsigned long long +#define PL_LOG2(x) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((x)) - 1)) + +// Rounds a number up to the nearest power of two +#define PL_ALIGN_POT(x) (0x1LLU << (PL_LOG2((x) - 1) + 1)) + +// Right shift a number while rounding up +#define PL_RSHIFT_UP(x, s) -((-(x)) >> (s)) + +// Returns whether or not a number is a power of two (or zero) +#define PL_ISPOT(x) (((x) & ((x) - 1)) == 0) + +// Returns the size of a static array with known size. +#define PL_ARRAY_SIZE(s) (sizeof(s) / sizeof((s)[0])) + +// Swaps two variables +#define PL_SWAP(a, b) \ + do { \ + __typeof__ (a) _tmp = (a); \ + (a) = (b); \ + (b) = _tmp; \ + } while (0) + +// Helper functions for transposing a matrix in-place. +#define PL_TRANSPOSE_DIM(d, m) \ + pl_transpose((d), (float[(d)*(d)]){0}, (const float *)(m)) + +#define PL_TRANSPOSE_2X2(m) PL_TRANSPOSE_DIM(2, m) +#define PL_TRANSPOSE_3X3(m) PL_TRANSPOSE_DIM(3, m) +#define PL_TRANSPOSE_4X4(m) PL_TRANSPOSE_DIM(4, m) + +static inline float *pl_transpose(int dim, float *out, const float *in) +{ + for (int i = 0; i < dim; i++) { + for (int j = 0; j < dim; j++) + out[i * dim + j] = in[j * dim + i]; + } + + return out; +} + +// Helper functions for some common numeric operations (careful: double-eval) +#define PL_MAX(x, y) ((x) > (y) ? (x) : (y)) +#define PL_MAX3(x, y, z) PL_MAX(PL_MAX(x, y), z) +#define PL_MIN(x, y) ((x) < (y) ? (x) : (y)) +#define PL_CLAMP(x, l, h) ((x) < (l) ? (l) : (x) > (h) ? (h) : (x)) +#define PL_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define PL_DEF(x, d) ((x) ? 
(x) : (d)) +#define PL_SQUARE(x) ((x) * (x)) +#define PL_CUBE(x) ((x) * (x) * (x)) +#define PL_MIX(a, b, x) ((x) * (b) + (1 - (x)) * (a)) + +static inline float pl_smoothstep(float edge0, float edge1, float x) +{ + if (edge0 == edge1) + return x >= edge0; + x = (x - edge0) / (edge1 - edge0); + x = PL_CLAMP(x, 0.0f, 1.0f); + return x * x * (3.0f - 2.0f * x); +} + +// Helpers for doing alignment calculations +static inline size_t pl_gcd(size_t x, size_t y) +{ + assert(x && y); + while (y) { + size_t tmp = y; + y = x % y; + x = tmp; + } + + return x; +} + +static inline size_t pl_lcm(size_t x, size_t y) +{ + assert(x && y); + return x * (y / pl_gcd(x, y)); +} + +// Conditional abort() macro that depends on the configuration option +#ifdef PL_DEBUG_ABORT +# define pl_debug_abort() do { \ + fprintf(stderr, "pl_debug_abort() triggered!\n"); \ + abort(); \ +} while (0) +#else +# define pl_debug_abort() do {} while (0) +#endif + +#ifdef PL_HAVE_STDATOMIC + +// Refcounting helpers +typedef atomic_uint_fast32_t pl_rc_t; +#define pl_rc_init(rc) atomic_init(rc, 1) +#define pl_rc_ref(rc) ((void) atomic_fetch_add_explicit(rc, 1, memory_order_acquire)) +#define pl_rc_deref(rc) (atomic_fetch_sub_explicit(rc, 1, memory_order_release) == 1) +#define pl_rc_count(rc) atomic_load(rc) + +#endif + +#define pl_unreachable() (assert(!"unreachable"), __builtin_unreachable()) + +// Helper for parameter validation +#define pl_require(ctx, expr) \ + do { \ + if (!(expr)) { \ + PL_ERR(ctx, "Validation failed: %s (%s:%d)", \ + #expr, __FILE__, __LINE__); \ + pl_log_stack_trace(ctx->log, PL_LOG_ERR); \ + pl_debug_abort(); \ + goto error; \ + } \ + } while (0) diff --git a/src/convert.cc b/src/convert.cc new file mode 100644 index 0000000..05c9dd0 --- /dev/null +++ b/src/convert.cc @@ -0,0 +1,233 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <charconv> +#include <limits> +#include <system_error> + +#if __has_include(<fast_float/fast_float.h>) +# include <fast_float/fast_float.h> +#endif + +#include "pl_string.h" + +[[maybe_unused]] +static int ccStrPrintDouble( char *str, int bufsize, int decimals, double value ); + +namespace { + +template <typename T> +struct has_std_to_chars_impl { + template <typename CT> + static auto _(CT s) -> decltype(std::to_chars(s, s, std::declval<T>()), std::true_type{}); + static auto _(...) -> std::false_type; + static constexpr bool value = decltype(_((char *){}))::value; +}; + +template <typename T> +constexpr bool has_std_to_chars = has_std_to_chars_impl<T>::value; + +template <typename T, typename... Args> +static inline int to_chars(char *buf, size_t len, T n, Args ...args) +{ + if constexpr (has_std_to_chars<T>) { + auto [ptr, ec] = std::to_chars(buf, buf + len, n, args...); + return ec == std::errc() ? 
ptr - buf : 0; + } else { + static_assert(std::is_same_v<float, T> || std::is_same_v<double, T>, + "Not implemented!"); + // FIXME: Fallback for GCC <= 10 currently required for MinGW-w64 on + // Ubuntu 22.04. Remove this when Ubuntu 24.04 is released, as it will + // provide newer MinGW-w64 GCC and it will be safe to require it. + return ccStrPrintDouble(buf, len, std::numeric_limits<T>::max_digits10, n); + } +} + +template <typename T> +struct has_std_from_chars_impl { + template <typename CT> + static auto _(CT s) -> decltype(std::from_chars(s, s, std::declval<T&>()), std::true_type{}); + static auto _(...) -> std::false_type; + static constexpr bool value = decltype(_((const char *){}))::value; +}; + +template <typename T> +constexpr bool has_std_from_chars = has_std_from_chars_impl<T>::value; + +template <typename T, typename... Args> +static inline bool from_chars(pl_str str, T &n, Args ...args) +{ + if constexpr (has_std_from_chars<T>) { + auto [ptr, ec] = std::from_chars((const char *) str.buf, + (const char *) str.buf + str.len, + n, args...); + return ec == std::errc(); + } else { + constexpr bool is_fp = std::is_same_v<float, T> || std::is_same_v<double, T>; + static_assert(is_fp, "Not implemented!"); +#if !__has_include(<fast_float/fast_float.h>) + static_assert(!is_fp, "<fast_float/fast_float.h> is required, but not " \ + "found. Please run `git submodule update --init`" \ + " or provide <fast_float/fast_float.h>"); +#else + // FIXME: Fallback for libc++, as it does not implement floating-point + // variant of std::from_chars. Remove this when appropriate. + auto [ptr, ec] = fast_float::from_chars((const char *) str.buf, + (const char *) str.buf + str.len, + n, args...); + return ec == std::errc(); +#endif + } +} + +} + +#define CHAR_CONVERT(name, type, ...) \ + int pl_str_print_##name(char *buf, size_t len, type n) \ + { \ + return to_chars(buf, len, n __VA_OPT__(,) __VA_ARGS__); \ + } \ + bool pl_str_parse_##name(pl_str str, type *n) \ + { \ + return from_chars(str, *n __VA_OPT__(,) __VA_ARGS__); \ + } + +CHAR_CONVERT(hex, unsigned short, 16) +CHAR_CONVERT(int, int) +CHAR_CONVERT(uint, unsigned int) +CHAR_CONVERT(int64, int64_t) +CHAR_CONVERT(uint64, uint64_t) +CHAR_CONVERT(float, float) +CHAR_CONVERT(double, double) + +/* ***************************************************************************** + * + * Copyright (c) 2007-2016 Alexis Naveros. + * Modified for use with libplacebo by Niklas Haas + * Changes include: + * - Removed a CC_MIN macro dependency by equivalent logic + * - Removed CC_ALWAYSINLINE + * - Fixed (!seq) check to (!seqlength) + * - Added support for scientific notation (e.g. 1.0e10) in ccSeqParseDouble + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ * + * ----------------------------------------------------------------------------- + */ + +static int ccStrPrintDouble( char *str, int bufsize, int decimals, double value ) +{ + int size, offset, index; + int32_t frac, accumsub; + double muldec; + uint32_t u32; + uint64_t u64; + + size = 0; + if( value < 0.0 ) + { + size = 1; + *str++ = '-'; + bufsize--; + value = -value; + } + + if( value < 4294967296.0 ) + { + u32 = (uint32_t)value; + offset = pl_str_print_uint( str, bufsize, u32 ); + if (!offset) + goto error; + size += offset; + bufsize -= size; + value -= (double)u32; + } + else if( value < 18446744073709551616.0 ) + { + u64 = (uint64_t)value; + offset = pl_str_print_uint64( str, bufsize, u64 ); + if (!offset) + goto error; + size += offset; + bufsize -= size; + value -= (double)u64; + } + else + goto error; + + if (decimals > bufsize - 2) + decimals = bufsize - 2; + if( decimals <= 0 ) + return size; + + muldec = 10.0; + accumsub = 0; + str += offset; + + for( index = 0 ; index < decimals ; index++ ) + { + // Skip printing insignificant decimal digits + if (value * muldec - accumsub <= std::numeric_limits<double>::epsilon()) + break; + if (index == 0) { + size += 1; + *str++ = '.'; + } + frac = (int32_t)( value * muldec ) - accumsub; + frac = PL_CLAMP(frac, 0, 9); // FIXME: why is this needed? + str[index] = '0' + (char)frac; + accumsub += frac; + accumsub = ( accumsub << 3 ) + ( accumsub << 1 ); + if( muldec < 10000000 ) + muldec *= 10.0; + else + { + value *= 10000000.0; + value -= (int32_t)value; + muldec = 10.0; + accumsub = 0; + } + } + // Round up the last decimal digit + if ( str[ index - 1 ] < '9' && (int32_t)( value * muldec ) - accumsub >= 5 ) + str[ index - 1 ]++; + str[ index ] = 0; + size += index; + return size; + +error: + if( bufsize < 4 ) + *str = 0; + else + { + str[0] = 'E'; + str[1] = 'R'; + str[2] = 'R'; + str[3] = 0; + } + return 0; +} diff --git a/src/d3d11/common.h b/src/d3d11/common.h new file mode 100644 index 0000000..e14b709 --- /dev/null +++ b/src/d3d11/common.h @@ -0,0 +1,66 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "../common.h" +#include "../log.h" + +#ifdef PL_HAVE_DXGI_DEBUG +#include <dxgidebug.h> +#endif + +#include <libplacebo/d3d11.h> + +// Shared struct used to hold the D3D11 device and associated interfaces +struct d3d11_ctx { + pl_log log; + pl_d3d11 d3d11; + + // Copy of the device from pl_d3d11 for convenience. Does not hold an + // additional reference. + ID3D11Device *dev; + + // DXGI device. This does hold a reference. + IDXGIDevice1 *dxgi_dev; + +#ifdef PL_HAVE_DXGI_DEBUG + // Debug interfaces + IDXGIDebug *debug; + IDXGIInfoQueue *iqueue; + uint64_t last_discarded; // Last count of discarded messages + DXGI_INFO_QUEUE_MESSAGE *dxgi_msg; +#endif + + // pl_gpu_is_failed (We saw a device removed error!) 
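+    // Set once a device removed (or similar) error has been observed;
+    // checked by d3d11_gpu_is_failed()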
+ bool is_failed; +}; + +// DDK value. Apparently some D3D functions can return this instead of the +// proper user-mode error code. See: +// https://docs.microsoft.com/en-us/windows/win32/api/dxgi/nf-dxgi-idxgiswapchain-present +#define D3DDDIERR_DEVICEREMOVED (0x88760870) + +#ifndef D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE +#define D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE (0x80) +#endif +#ifndef D3D11_FORMAT_SUPPORT2_UAV_TYPED_LOAD +#define D3D11_FORMAT_SUPPORT2_UAV_TYPED_LOAD (0x40) +#endif +#ifndef PL_HAVE_DXGI_DEBUG_D3D11 +DEFINE_GUID(DXGI_DEBUG_D3D11, 0x4b99317b, 0xac39, 0x4aa6, 0xbb, 0xb, 0xba, 0xa0, 0x47, 0x84, 0x79, 0x8f); +#endif diff --git a/src/d3d11/context.c b/src/d3d11/context.c new file mode 100644 index 0000000..e0ba90f --- /dev/null +++ b/src/d3d11/context.c @@ -0,0 +1,488 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" + +// Windows 8 enum value, not present in mingw-w64 v7 +#define DXGI_ADAPTER_FLAG_SOFTWARE (2) + +const struct pl_d3d11_params pl_d3d11_default_params = { PL_D3D11_DEFAULTS }; + +static INIT_ONCE d3d11_once = INIT_ONCE_STATIC_INIT; +static PFN_D3D11_CREATE_DEVICE pD3D11CreateDevice = NULL; +static __typeof__(&CreateDXGIFactory1) pCreateDXGIFactory1 = NULL; +#ifdef PL_HAVE_DXGI_DEBUG +static __typeof__(&DXGIGetDebugInterface) pDXGIGetDebugInterface = NULL; +#endif + +static void d3d11_load(void) +{ + BOOL bPending = FALSE; + InitOnceBeginInitialize(&d3d11_once, 0, &bPending, NULL); + + if (bPending) + { + HMODULE d3d11 = LoadLibraryW(L"d3d11.dll"); + if (d3d11) { + pD3D11CreateDevice = (void *) + GetProcAddress(d3d11, "D3D11CreateDevice"); + } + + HMODULE dxgi = LoadLibraryW(L"dxgi.dll"); + if (dxgi) { + pCreateDXGIFactory1 = (void *) + GetProcAddress(dxgi, "CreateDXGIFactory1"); + } + +#ifdef PL_HAVE_DXGI_DEBUG + HMODULE dxgi_debug = LoadLibraryW(L"dxgidebug.dll"); + if (dxgi_debug) { + pDXGIGetDebugInterface = (void *) + GetProcAddress(dxgi_debug, "DXGIGetDebugInterface"); + } +#endif + } + + InitOnceComplete(&d3d11_once, 0, NULL); +} + +// Get a const array of D3D_FEATURE_LEVELs from max_fl to min_fl (inclusive) +static int get_feature_levels(int max_fl, int min_fl, + const D3D_FEATURE_LEVEL **out) +{ + static const D3D_FEATURE_LEVEL levels[] = { + D3D_FEATURE_LEVEL_12_1, + D3D_FEATURE_LEVEL_12_0, + D3D_FEATURE_LEVEL_11_1, + D3D_FEATURE_LEVEL_11_0, + D3D_FEATURE_LEVEL_10_1, + D3D_FEATURE_LEVEL_10_0, + D3D_FEATURE_LEVEL_9_3, + D3D_FEATURE_LEVEL_9_2, + D3D_FEATURE_LEVEL_9_1, + }; + static const int levels_len = PL_ARRAY_SIZE(levels); + + int start = 0; + for (; start < levels_len; start++) { + if (levels[start] <= max_fl) + break; + } + int len = 0; + for (; start + len < levels_len; len++) { + if (levels[start + len] < min_fl) + break; + } + *out = &levels[start]; + return len; +} + +static bool is_null_luid(LUID luid) +{ + return luid.LowPart == 0 && luid.HighPart == 0; +} + 
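+// Find the IDXGIAdapter whose LUID matches `adapter_luid` by enumerating the
+// adapters of an IDXGIFactory1. Returns NULL (and logs an error) if no such
+// adapter exists; the caller is responsible for releasing the reference.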
+static IDXGIAdapter *get_adapter(pl_d3d11 d3d11, LUID adapter_luid) +{ + struct d3d11_ctx *ctx = PL_PRIV(d3d11); + IDXGIFactory1 *factory = NULL; + IDXGIAdapter1 *adapter1 = NULL; + IDXGIAdapter *adapter = NULL; + HRESULT hr; + + if (!pCreateDXGIFactory1) { + PL_FATAL(ctx, "Failed to load dxgi.dll"); + goto error; + } + pCreateDXGIFactory1(&IID_IDXGIFactory1, (void **) &factory); + + for (int i = 0;; i++) { + hr = IDXGIFactory1_EnumAdapters1(factory, i, &adapter1); + if (hr == DXGI_ERROR_NOT_FOUND) + break; + if (FAILED(hr)) { + PL_FATAL(ctx, "Failed to enumerate adapters"); + goto error; + } + + DXGI_ADAPTER_DESC1 desc; + D3D(IDXGIAdapter1_GetDesc1(adapter1, &desc)); + if (desc.AdapterLuid.LowPart == adapter_luid.LowPart && + desc.AdapterLuid.HighPart == adapter_luid.HighPart) + { + break; + } + + SAFE_RELEASE(adapter1); + } + if (!adapter1) { + PL_FATAL(ctx, "Adapter with LUID %08lx%08lx not found", + adapter_luid.HighPart, adapter_luid.LowPart); + goto error; + } + + D3D(IDXGIAdapter1_QueryInterface(adapter1, &IID_IDXGIAdapter, + (void **) &adapter)); + +error: + SAFE_RELEASE(factory); + SAFE_RELEASE(adapter1); + return adapter; +} + +static bool has_sdk_layers(void) +{ + // This will fail if the SDK layers aren't installed + return SUCCEEDED(pD3D11CreateDevice(NULL, D3D_DRIVER_TYPE_NULL, NULL, + D3D11_CREATE_DEVICE_DEBUG, NULL, 0, D3D11_SDK_VERSION, NULL, NULL, + NULL)); +} + +static ID3D11Device *create_device(struct pl_d3d11_t *d3d11, + const struct pl_d3d11_params *params) +{ + struct d3d11_ctx *ctx = PL_PRIV(d3d11); + bool debug = params->debug; + bool warp = params->force_software; + int max_fl = params->max_feature_level; + int min_fl = params->min_feature_level; + ID3D11Device *dev = NULL; + IDXGIDevice1 *dxgi_dev = NULL; + IDXGIAdapter *adapter = NULL; + bool release_adapter = false; + HRESULT hr; + + d3d11_load(); + + if (!pD3D11CreateDevice) { + PL_FATAL(ctx, "Failed to load d3d11.dll"); + goto error; + } + + if (params->adapter) { + adapter = params->adapter; + } else if (!is_null_luid(params->adapter_luid)) { + adapter = get_adapter(d3d11, params->adapter_luid); + release_adapter = true; + } + + if (debug && !has_sdk_layers()) { + PL_INFO(ctx, "Debug layer not available, removing debug flag"); + debug = false; + } + + // Return here to retry creating the device + do { + // Use these default feature levels if they are not set + max_fl = PL_DEF(max_fl, D3D_FEATURE_LEVEL_12_1); + min_fl = PL_DEF(min_fl, D3D_FEATURE_LEVEL_9_1); + + // Get a list of feature levels from min_fl to max_fl + const D3D_FEATURE_LEVEL *levels; + int levels_len = get_feature_levels(max_fl, min_fl, &levels); + if (!levels_len) { + PL_FATAL(ctx, "No suitable Direct3D feature level found"); + goto error; + } + + D3D_DRIVER_TYPE type = D3D_DRIVER_TYPE_UNKNOWN; + if (!adapter) { + if (warp) { + type = D3D_DRIVER_TYPE_WARP; + } else { + type = D3D_DRIVER_TYPE_HARDWARE; + } + } + + UINT flags = params->flags; + if (debug) + flags |= D3D11_CREATE_DEVICE_DEBUG; + + hr = pD3D11CreateDevice(adapter, type, NULL, flags, levels, levels_len, + D3D11_SDK_VERSION, &dev, NULL, NULL); + if (SUCCEEDED(hr)) + break; + + pl_d3d11_after_error(ctx, hr); + + // Trying to create a D3D_FEATURE_LEVEL_12_0 device on Windows 8.1 or + // below will not succeed. Try an 11_1 device. 
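+        // (Runtimes that predate a feature level reject arrays containing it
+        // with E_INVALIDARG, which is what the retry logic below checks for.)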
+ if (hr == E_INVALIDARG && max_fl >= D3D_FEATURE_LEVEL_12_0 && + min_fl <= D3D_FEATURE_LEVEL_11_1) { + PL_DEBUG(ctx, "Failed to create 12_0+ device, trying 11_1"); + max_fl = D3D_FEATURE_LEVEL_11_1; + continue; + } + + // Trying to create a D3D_FEATURE_LEVEL_11_1 device on Windows 7 + // without the platform update will not succeed. Try an 11_0 device. + if (hr == E_INVALIDARG && max_fl >= D3D_FEATURE_LEVEL_11_1 && + min_fl <= D3D_FEATURE_LEVEL_11_0) { + PL_DEBUG(ctx, "Failed to create 11_1+ device, trying 11_0"); + max_fl = D3D_FEATURE_LEVEL_11_0; + continue; + } + + // Retry with WARP if allowed + if (!adapter && !warp && params->allow_software) { + PL_DEBUG(ctx, "Failed to create hardware device, trying WARP: %s", + pl_hresult_to_str(hr)); + warp = true; + max_fl = params->max_feature_level; + min_fl = params->min_feature_level; + continue; + } + + PL_FATAL(ctx, "Failed to create Direct3D 11 device: %s", + pl_hresult_to_str(hr)); + goto error; + } while (true); + + if (params->max_frame_latency) { + D3D(ID3D11Device_QueryInterface(dev, &IID_IDXGIDevice1, + (void **) &dxgi_dev)); + IDXGIDevice1_SetMaximumFrameLatency(dxgi_dev, params->max_frame_latency); + } + + d3d11->software = warp; + +error: + if (release_adapter) + SAFE_RELEASE(adapter); + SAFE_RELEASE(dxgi_dev); + return dev; +} + +static void init_debug_layer(struct d3d11_ctx *ctx, bool leak_check) +{ +#ifdef PL_HAVE_DXGI_DEBUG + if (!pDXGIGetDebugInterface) + d3d11_load(); + + if (!pDXGIGetDebugInterface) + goto error; + + D3D(pDXGIGetDebugInterface(&IID_IDXGIInfoQueue, (void **) &ctx->iqueue)); + + // Push empty filter to get everything + IDXGIInfoQueue_PushStorageFilter(ctx->iqueue, DXGI_DEBUG_ALL, + &(DXGI_INFO_QUEUE_FILTER){0}); + + // Filter some annoying D3D11 messages + DXGI_INFO_QUEUE_MESSAGE_ID deny_ids[] = { + // This false-positive error occurs every time we Draw() with a shader + // that samples from a texture format that only supports point sampling. + // Since we already use CheckFormatSupport to know which formats can be + // linearly sampled from, we shouldn't ever bind a non-point sampler to + // a format that doesn't support it. 
+ D3D11_MESSAGE_ID_DEVICE_DRAW_RESOURCE_FORMAT_SAMPLE_UNSUPPORTED, + }; + DXGI_INFO_QUEUE_FILTER filter = { + .DenyList = { + .NumIDs = PL_ARRAY_SIZE(deny_ids), + .pIDList = deny_ids, + }, + }; + IDXGIInfoQueue_PushStorageFilter(ctx->iqueue, DXGI_DEBUG_D3D11, &filter); + + IDXGIInfoQueue_SetMessageCountLimit(ctx->iqueue, DXGI_DEBUG_D3D11, -1); + IDXGIInfoQueue_SetMessageCountLimit(ctx->iqueue, DXGI_DEBUG_DXGI, -1); + + if (leak_check) + D3D(pDXGIGetDebugInterface(&IID_IDXGIDebug, (void **) &ctx->debug)); + +error: + return; +#endif +} + +void pl_d3d11_destroy(pl_d3d11 *ptr) +{ + pl_d3d11 d3d11 = *ptr; + if (!d3d11) + return; + struct d3d11_ctx *ctx = PL_PRIV(d3d11); + + pl_gpu_destroy(d3d11->gpu); + + SAFE_RELEASE(ctx->dev); + SAFE_RELEASE(ctx->dxgi_dev); + +#ifdef PL_HAVE_DXGI_DEBUG + if (ctx->debug) { + // Report any leaked objects + pl_d3d11_flush_message_queue(ctx, "After destroy"); + IDXGIDebug_ReportLiveObjects(ctx->debug, DXGI_DEBUG_ALL, DXGI_DEBUG_RLO_DETAIL); + pl_d3d11_flush_message_queue(ctx, "After leak check"); + IDXGIDebug_ReportLiveObjects(ctx->debug, DXGI_DEBUG_ALL, DXGI_DEBUG_RLO_SUMMARY); + pl_d3d11_flush_message_queue(ctx, "After leak summary"); + } + + SAFE_RELEASE(ctx->debug); + SAFE_RELEASE(ctx->iqueue); +#endif + + pl_free_ptr((void **) ptr); +} + +pl_d3d11 pl_d3d11_create(pl_log log, const struct pl_d3d11_params *params) +{ + params = PL_DEF(params, &pl_d3d11_default_params); + IDXGIAdapter1 *adapter = NULL; + IDXGIAdapter2 *adapter2 = NULL; + bool success = false; + HRESULT hr; + + struct pl_d3d11_t *d3d11 = pl_zalloc_obj(NULL, d3d11, struct d3d11_ctx); + struct d3d11_ctx *ctx = PL_PRIV(d3d11); + ctx->log = log; + ctx->d3d11 = d3d11; + + if (params->device) { + d3d11->device = params->device; + ID3D11Device_AddRef(d3d11->device); + } else { + d3d11->device = create_device(d3d11, params); + if (!d3d11->device) + goto error; + } + ctx->dev = d3d11->device; + + if (params->debug || + ID3D11Device_GetCreationFlags(d3d11->device) & D3D11_CREATE_DEVICE_DEBUG) + { + // Do not report live object on pl_d3d11_destroy if device was created + // externally, it makes no sense as there will be a lot of things alive. 
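+        // (In other words, leak checking is only enabled for devices we
+        // created ourselves, i.e. when params->device is unset.)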
+ init_debug_layer(ctx, !params->device); + } + + D3D(ID3D11Device_QueryInterface(d3d11->device, &IID_IDXGIDevice1, + (void **) &ctx->dxgi_dev)); + D3D(IDXGIDevice1_GetParent(ctx->dxgi_dev, &IID_IDXGIAdapter1, + (void **) &adapter)); + + hr = IDXGIAdapter1_QueryInterface(adapter, &IID_IDXGIAdapter2, + (void **) &adapter2); + if (FAILED(hr)) + adapter2 = NULL; + + if (adapter2) { + PL_INFO(ctx, "Using DXGI 1.2+"); + } else { + PL_INFO(ctx, "Using DXGI 1.1"); + } + + D3D_FEATURE_LEVEL fl = ID3D11Device_GetFeatureLevel(d3d11->device); + PL_INFO(ctx, "Using Direct3D 11 feature level %u_%u", + ((unsigned) fl) >> 12, (((unsigned) fl) >> 8) & 0xf); + + char *dev_name = NULL; + UINT vendor_id, device_id, revision, subsys_id; + LUID adapter_luid; + UINT flags; + + if (adapter2) { + // DXGI 1.2 IDXGIAdapter2::GetDesc2 is preferred over the DXGI 1.1 + // version because it reports the real adapter information when using + // feature level 9 hardware + DXGI_ADAPTER_DESC2 desc; + D3D(IDXGIAdapter2_GetDesc2(adapter2, &desc)); + + dev_name = pl_to_utf8(NULL, desc.Description); + vendor_id = desc.VendorId; + device_id = desc.DeviceId; + revision = desc.Revision; + subsys_id = desc.SubSysId; + adapter_luid = desc.AdapterLuid; + flags = desc.Flags; + } else { + DXGI_ADAPTER_DESC1 desc; + D3D(IDXGIAdapter1_GetDesc1(adapter, &desc)); + + dev_name = pl_to_utf8(NULL, desc.Description); + vendor_id = desc.VendorId; + device_id = desc.DeviceId; + revision = desc.Revision; + subsys_id = desc.SubSysId; + adapter_luid = desc.AdapterLuid; + flags = desc.Flags; + } + + PL_INFO(ctx, "Direct3D 11 device properties:"); + PL_INFO(ctx, " Device Name: %s", dev_name); + PL_INFO(ctx, " Device ID: %04x:%04x (rev %02x)", + vendor_id, device_id, revision); + PL_INFO(ctx, " Subsystem ID: %04x:%04x", + LOWORD(subsys_id), HIWORD(subsys_id)); + PL_INFO(ctx, " LUID: %08lx%08lx", + adapter_luid.HighPart, adapter_luid.LowPart); + pl_free(dev_name); + + LARGE_INTEGER version; + hr = IDXGIAdapter1_CheckInterfaceSupport(adapter, &IID_IDXGIDevice, &version); + if (SUCCEEDED(hr)) { + PL_INFO(ctx, " Driver version: %u.%u.%u.%u", + HIWORD(version.HighPart), LOWORD(version.HighPart), + HIWORD(version.LowPart), LOWORD(version.LowPart)); + } + + // Note: DXGI_ADAPTER_FLAG_SOFTWARE doesn't exist before Windows 8, but we + // also set d3d11->software in create_device if we pick WARP ourselves + if (flags & DXGI_ADAPTER_FLAG_SOFTWARE) + d3d11->software = true; + + // If the primary display adapter is a software adapter, the + // DXGI_ADAPTER_FLAG_SOFTWARE flag won't be set, but the device IDs should + // still match the Microsoft Basic Render Driver + if (vendor_id == 0x1414 && device_id == 0x8c) + d3d11->software = true; + + if (d3d11->software) { + bool external_adapter = params->device || params->adapter || + !is_null_luid(params->adapter_luid); + + // The allow_software flag only applies if the API user didn't manually + // specify an adapter or a device + if (!params->allow_software && !external_adapter) { + // If we got this far with allow_software set, the primary adapter + // must be a software adapter + PL_ERR(ctx, "Primary adapter is a software adapter"); + goto error; + } + + // If a software adapter was manually specified, don't show a warning + enum pl_log_level level = PL_LOG_WARN; + if (external_adapter || params->force_software) + level = PL_LOG_INFO; + + PL_MSG(ctx, level, "Using a software adapter"); + } + + d3d11->gpu = pl_gpu_create_d3d11(ctx); + if (!d3d11->gpu) + goto error; + + success = true; +error: + if (!success) { + 
PL_FATAL(ctx, "Failed initializing Direct3D 11 device"); + pl_d3d11_destroy((pl_d3d11 *) &d3d11); + } + SAFE_RELEASE(adapter); + SAFE_RELEASE(adapter2); + return d3d11; +} diff --git a/src/d3d11/formats.c b/src/d3d11/formats.c new file mode 100644 index 0000000..7aaec26 --- /dev/null +++ b/src/d3d11/formats.c @@ -0,0 +1,293 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "formats.h" +#include "gpu.h" + +#define FMT(_minor, _name, _dxfmt, _type, num, size, bits, order) \ + (struct d3d_format) { \ + .dxfmt = DXGI_FORMAT_##_dxfmt##_##_type, \ + .minor = _minor, \ + .fmt = { \ + .name = _name, \ + .type = PL_FMT_##_type, \ + .num_components = num, \ + .component_depth = bits, \ + .texel_size = size, \ + .texel_align = 1, \ + .internal_size = size, \ + .host_bits = bits, \ + .sample_order = order, \ + }, \ + } + +#define IDX(...) {__VA_ARGS__} +#define BITS(...) {__VA_ARGS__} + +#define REGFMT(name, dxfmt, type, num, bits) \ + FMT(0, name, dxfmt, type, num, (num) * (bits) / 8, \ + BITS(bits, bits, bits, bits), \ + IDX(0, 1, 2, 3)) + +#define EMUFMT(_name, _dxfmt, _type, in, en, ib, eb) \ + (struct d3d_format) { \ + .dxfmt = DXGI_FORMAT_##_dxfmt##_##_type, \ + .minor = 0, \ + .fmt = { \ + .name = _name, \ + .type = PL_FMT_##_type, \ + .num_components = en, \ + .component_depth = BITS(ib, ib, ib, ib), \ + .internal_size = (in) * (ib) / 8, \ + .opaque = false, \ + .emulated = true, \ + .texel_size = (en) * (eb) / 8, \ + .texel_align = (eb) / 8, \ + .host_bits = BITS(eb, eb, eb, eb), \ + .sample_order = IDX(0, 1, 2, 3), \ + }, \ + } + +const struct d3d_format pl_d3d11_formats[] = { + REGFMT("r8", R8, UNORM, 1, 8), + REGFMT("rg8", R8G8, UNORM, 2, 8), + EMUFMT("rgb8", R8G8B8A8, UNORM, 4, 3, 8, 8), + REGFMT("rgba8", R8G8B8A8, UNORM, 4, 8), + REGFMT("r16", R16, UNORM, 1, 16), + REGFMT("rg16", R16G16, UNORM, 2, 16), + EMUFMT("rgb16", R16G16B16A16, UNORM, 4, 3, 16, 16), + REGFMT("rgba16", R16G16B16A16, UNORM, 4, 16), + + REGFMT("r8s", R8, SNORM, 1, 8), + REGFMT("rg8s", R8G8, SNORM, 2, 8), + REGFMT("rgba8s", R8G8B8A8, SNORM, 4, 8), + REGFMT("r16s", R16, SNORM, 1, 16), + REGFMT("rg16s", R16G16, SNORM, 2, 16), + REGFMT("rgba16s", R16G16B16A16, SNORM, 4, 16), + + REGFMT("r16hf", R16, FLOAT, 1, 16), + REGFMT("rg16hf", R16G16, FLOAT, 2, 16), + EMUFMT("rgb16hf", R16G16B16A16, FLOAT, 4, 3, 16, 16), + REGFMT("rgba16hf", R16G16B16A16, FLOAT, 4, 16), + REGFMT("r32f", R32, FLOAT, 1, 32), + REGFMT("rg32f", R32G32, FLOAT, 2, 32), + REGFMT("rgb32f", R32G32B32, FLOAT, 3, 32), + REGFMT("rgba32f", R32G32B32A32, FLOAT, 4, 32), + + EMUFMT("r16f", R16, FLOAT, 1, 1, 16, 32), + EMUFMT("rg16f", R16G16, FLOAT, 2, 2, 16, 32), + EMUFMT("rgb16f", R16G16B16A16, FLOAT, 4, 3, 16, 32), + EMUFMT("rgba16f", R16G16B16A16, FLOAT, 4, 4, 16, 32), + + REGFMT("r8u", R8, UINT, 1, 8), + REGFMT("rg8u", R8G8, UINT, 2, 8), + REGFMT("rgba8u", R8G8B8A8, UINT, 4, 8), + 
REGFMT("r16u", R16, UINT, 1, 16), + REGFMT("rg16u", R16G16, UINT, 2, 16), + REGFMT("rgba16u", R16G16B16A16, UINT, 4, 16), + REGFMT("r32u", R32, UINT, 1, 32), + REGFMT("rg32u", R32G32, UINT, 2, 32), + REGFMT("rgb32u", R32G32B32, UINT, 3, 32), + REGFMT("rgba32u", R32G32B32A32, UINT, 4, 32), + + REGFMT("r8i", R8, SINT, 1, 8), + REGFMT("rg8i", R8G8, SINT, 2, 8), + REGFMT("rgba8i", R8G8B8A8, SINT, 4, 8), + REGFMT("r16i", R16, SINT, 1, 16), + REGFMT("rg16i", R16G16, SINT, 2, 16), + REGFMT("rgba16i", R16G16B16A16, SINT, 4, 16), + REGFMT("r32i", R32, SINT, 1, 32), + REGFMT("rg32i", R32G32, SINT, 2, 32), + REGFMT("rgb32i", R32G32B32, SINT, 3, 32), + REGFMT("rgba32i", R32G32B32A32, SINT, 4, 32), + + FMT(0, "rgb10a2", R10G10B10A2, UNORM, 4, 4, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3)), + FMT(0, "rgb10a2u", R10G10B10A2, UINT, 4, 4, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3)), + + FMT(0, "bgra8", B8G8R8A8, UNORM, 4, 4, BITS( 8, 8, 8, 8), IDX(2, 1, 0, 3)), + FMT(0, "bgrx8", B8G8R8X8, UNORM, 3, 4, BITS( 8, 8, 8), IDX(2, 1, 0)), + FMT(0, "rg11b10f", R11G11B10, FLOAT, 3, 4, BITS(11, 11, 10), IDX(0, 1, 2)), + + // D3D11.1 16-bit formats (resurrected D3D9 formats) + FMT(1, "bgr565", B5G6R5, UNORM, 3, 2, BITS( 5, 6, 5), IDX(2, 1, 0)), + FMT(1, "bgr5a1", B5G5R5A1, UNORM, 4, 2, BITS( 5, 5, 5, 1), IDX(2, 1, 0, 3)), + FMT(1, "bgra4", B4G4R4A4, UNORM, 4, 2, BITS( 4, 4, 4, 4), IDX(2, 1, 0, 3)), + + {0} +}; +#undef BITS +#undef IDX +#undef REGFMT +#undef FMT + +void pl_d3d11_setup_formats(struct pl_gpu_t *gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + PL_ARRAY(pl_fmt) formats = {0}; + HRESULT hr; + + for (int i = 0; pl_d3d11_formats[i].dxfmt; i++) { + const struct d3d_format *d3d_fmt = &pl_d3d11_formats[i]; + + // The Direct3D 11.0 debug layer will segfault if CheckFormatSupport is + // called on a format it doesn't know about + if (pl_d3d11_formats[i].minor > p->minor) + continue; + + UINT sup = 0; + hr = ID3D11Device_CheckFormatSupport(p->dev, d3d_fmt->dxfmt, &sup); + if (FAILED(hr)) + continue; + + D3D11_FEATURE_DATA_FORMAT_SUPPORT2 sup2 = { .InFormat = d3d_fmt->dxfmt }; + ID3D11Device_CheckFeatureSupport(p->dev, D3D11_FEATURE_FORMAT_SUPPORT2, + ², sizeof(sup2)); + + struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, struct d3d_fmt *); + const struct d3d_format **fmtp = PL_PRIV(fmt); + *fmt = d3d_fmt->fmt; + *fmtp = d3d_fmt; + + // For sanity, clear the superfluous fields + for (int j = fmt->num_components; j < 4; j++) { + fmt->component_depth[j] = 0; + fmt->sample_order[j] = 0; + fmt->host_bits[j] = 0; + } + + static const struct { + enum pl_fmt_caps caps; + UINT sup; + UINT sup2; + } support[] = { + { + .caps = PL_FMT_CAP_SAMPLEABLE, + .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D, + }, + { + .caps = PL_FMT_CAP_STORABLE, + // SHADER_LOAD is for readonly images, which can use a SRV + .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D | + D3D11_FORMAT_SUPPORT_TYPED_UNORDERED_ACCESS_VIEW | + D3D11_FORMAT_SUPPORT_SHADER_LOAD, + .sup2 = D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE, + }, + { + .caps = PL_FMT_CAP_READWRITE, + .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D | + D3D11_FORMAT_SUPPORT_TYPED_UNORDERED_ACCESS_VIEW, + .sup2 = D3D11_FORMAT_SUPPORT2_UAV_TYPED_LOAD, + }, + { + .caps = PL_FMT_CAP_LINEAR, + .sup = D3D11_FORMAT_SUPPORT_TEXTURE2D | + D3D11_FORMAT_SUPPORT_SHADER_SAMPLE, + }, + { + .caps = PL_FMT_CAP_RENDERABLE, + .sup = D3D11_FORMAT_SUPPORT_RENDER_TARGET, + }, + { + .caps = PL_FMT_CAP_BLENDABLE, + .sup = D3D11_FORMAT_SUPPORT_RENDER_TARGET | + D3D11_FORMAT_SUPPORT_BLENDABLE, + }, + { + .caps = PL_FMT_CAP_VERTEX, + .sup = 
D3D11_FORMAT_SUPPORT_IA_VERTEX_BUFFER, + }, + { + .caps = PL_FMT_CAP_TEXEL_UNIFORM, + .sup = D3D11_FORMAT_SUPPORT_BUFFER | + D3D11_FORMAT_SUPPORT_SHADER_LOAD, + }, + { + .caps = PL_FMT_CAP_TEXEL_STORAGE, + // SHADER_LOAD is for readonly buffers, which can use a SRV + .sup = D3D11_FORMAT_SUPPORT_BUFFER | + D3D11_FORMAT_SUPPORT_TYPED_UNORDERED_ACCESS_VIEW | + D3D11_FORMAT_SUPPORT_SHADER_LOAD, + .sup2 = D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE, + }, + { + .caps = PL_FMT_CAP_HOST_READABLE, + .sup = D3D11_FORMAT_SUPPORT_CPU_LOCKABLE, + }, + }; + + for (int j = 0; j < PL_ARRAY_SIZE(support); j++) { + if ((sup & support[j].sup) == support[j].sup && + (sup2.OutFormatSupport2 & support[j].sup2) == support[j].sup2) + { + fmt->caps |= support[j].caps; + } + } + + // PL_FMT_CAP_STORABLE implies compute shaders, so don't set it if we + // don't have them + if (!gpu->glsl.compute) + fmt->caps &= ~PL_FMT_CAP_STORABLE; + + // PL_FMT_CAP_READWRITE implies PL_FMT_CAP_STORABLE + if (!(fmt->caps & PL_FMT_CAP_STORABLE)) + fmt->caps &= ~PL_FMT_CAP_READWRITE; + + // `fmt->gatherable` must have PL_FMT_CAP_SAMPLEABLE + if ((fmt->caps & PL_FMT_CAP_SAMPLEABLE) && + (sup & D3D11_FORMAT_SUPPORT_SHADER_GATHER)) + { + fmt->gatherable = true; + } + + // PL_FMT_CAP_BLITTABLE implies support for stretching, flipping and + // loose format conversion, which require a shader pass in D3D11 + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + // On >=FL11_0, we use a compute pass, which supports 1D and 3D + // textures + if (fmt->caps & PL_FMT_CAP_STORABLE) + fmt->caps |= PL_FMT_CAP_BLITTABLE; + } else { + // On <FL11_0 we use a raster pass + static const enum pl_fmt_caps req = PL_FMT_CAP_RENDERABLE | + PL_FMT_CAP_SAMPLEABLE; + if ((fmt->caps & req) == req) + fmt->caps |= PL_FMT_CAP_BLITTABLE; + } + + if (fmt->caps & (PL_FMT_CAP_VERTEX | PL_FMT_CAP_TEXEL_UNIFORM | + PL_FMT_CAP_TEXEL_STORAGE)) { + fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, "")); + pl_assert(fmt->glsl_type); + } + + if (fmt->caps & (PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE)) + fmt->glsl_format = pl_fmt_glsl_format(fmt, fmt->num_components); + + fmt->fourcc = pl_fmt_fourcc(fmt); + + // If no caps, D3D11 only supports this for things we don't care about + if (!fmt->caps) { + pl_free(fmt); + continue; + } + + PL_ARRAY_APPEND(gpu, formats, fmt); + } + + gpu->formats = formats.elem; + gpu->num_formats = formats.num; +} diff --git a/src/d3d11/formats.h b/src/d3d11/formats.h new file mode 100644 index 0000000..08336c0 --- /dev/null +++ b/src/d3d11/formats.h @@ -0,0 +1,36 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "common.h" + +struct d3d_format { + DXGI_FORMAT dxfmt; + int minor; // The D3D11 minor version number which supports this format + struct pl_fmt_t fmt; +}; + +extern const struct d3d_format pl_d3d11_formats[]; + +static inline DXGI_FORMAT fmt_to_dxgi(pl_fmt fmt) +{ + const struct d3d_format **fmtp = PL_PRIV(fmt); + return (*fmtp)->dxfmt; +} + +void pl_d3d11_setup_formats(struct pl_gpu_t *gpu); diff --git a/src/d3d11/gpu.c b/src/d3d11/gpu.c new file mode 100644 index 0000000..05a08a3 --- /dev/null +++ b/src/d3d11/gpu.c @@ -0,0 +1,685 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <initguid.h> +#include <windows.h> +#include <versionhelpers.h> + +#include "common.h" +#include "gpu.h" +#include "formats.h" +#include "glsl/spirv.h" + +#define DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES (0x8) + +struct timer_query { + ID3D11Query *ts_start; + ID3D11Query *ts_end; + ID3D11Query *disjoint; +}; + +struct pl_timer_t { + // Ring buffer of timer queries to use + int current; + int pending; + struct timer_query queries[16]; +}; + +void pl_d3d11_timer_start(pl_gpu gpu, pl_timer timer) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + if (!timer) + return; + struct timer_query *query = &timer->queries[timer->current]; + + // Create the query objects lazilly + if (!query->ts_start) { + D3D(ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_start)); + D3D(ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &query->ts_end)); + + // Measuring duration in D3D11 requires three queries: start and end + // timestamp queries, and a disjoint query containing a flag which says + // whether the timestamps are usable or if a discontinuity occurred + // between them, like a change in power state or clock speed. The + // disjoint query also contains the timer frequency, so the timestamps + // are useless without it. 
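+        // The elapsed time is only derived later, in d3d11_timer_query(), as
+        // (ts_end - ts_start) / Frequency (see timestamp_to_ns() below).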
+ D3D(ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP_DISJOINT }, &query->disjoint)); + } + + // Query the start timestamp + ID3D11DeviceContext_Begin(p->imm, (ID3D11Asynchronous *) query->disjoint); + ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->ts_start); + return; + +error: + SAFE_RELEASE(query->ts_start); + SAFE_RELEASE(query->ts_end); + SAFE_RELEASE(query->disjoint); +} + +void pl_d3d11_timer_end(pl_gpu gpu, pl_timer timer) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + + if (!timer) + return; + struct timer_query *query = &timer->queries[timer->current]; + + // Even if timer_start and timer_end are called in-order, timer_start might + // have failed to create the timer objects + if (!query->ts_start) + return; + + // Query the end timestamp + ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->ts_end); + ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) query->disjoint); + + // Advance to the next set of queries, for the next call to timer_start + timer->current++; + if (timer->current >= PL_ARRAY_SIZE(timer->queries)) + timer->current = 0; // Wrap around + + // Increment the number of pending queries, unless the ring buffer is full, + // in which case, timer->current now points to the oldest one, which will be + // dropped and reused + if (timer->pending < PL_ARRAY_SIZE(timer->queries)) + timer->pending++; +} + +static uint64_t timestamp_to_ns(uint64_t timestamp, uint64_t freq) +{ + static const uint64_t ns_per_s = 1000000000llu; + return timestamp / freq * ns_per_s + timestamp % freq * ns_per_s / freq; +} + +static uint64_t d3d11_timer_query(pl_gpu gpu, pl_timer timer) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + HRESULT hr; + + for (; timer->pending > 0; timer->pending--) { + int index = timer->current - timer->pending; + if (index < 0) + index += PL_ARRAY_SIZE(timer->queries); + struct timer_query *query = &timer->queries[index]; + + UINT64 start, end; + D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dj; + + // Fetch the results of each query, or on S_FALSE, return 0 to indicate + // the queries are still pending + D3D(hr = ID3D11DeviceContext_GetData(p->imm, + (ID3D11Asynchronous *) query->disjoint, &dj, sizeof(dj), + D3D11_ASYNC_GETDATA_DONOTFLUSH)); + if (hr == S_FALSE) + return 0; + D3D(hr = ID3D11DeviceContext_GetData(p->imm, + (ID3D11Asynchronous *) query->ts_end, &end, sizeof(end), + D3D11_ASYNC_GETDATA_DONOTFLUSH)); + if (hr == S_FALSE) + return 0; + D3D(hr = ID3D11DeviceContext_GetData(p->imm, + (ID3D11Asynchronous *) query->ts_start, &start, sizeof(start), + D3D11_ASYNC_GETDATA_DONOTFLUSH)); + if (hr == S_FALSE) + return 0; + + // There was a discontinuity during the queries, so a timestamp can't be + // produced. Skip it and try the next one. + if (dj.Disjoint || !dj.Frequency) + continue; + + // We got a result. Return it to the caller. 
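+        // (The returned value is clamped to at least 1 ns below, so that a
+        // completed measurement can never be confused with the 0 that means
+        // "still pending".)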
+ timer->pending--; + pl_d3d11_flush_message_queue(ctx, "After timer query"); + + uint64_t ns = timestamp_to_ns(end - start, dj.Frequency); + return PL_MAX(ns, 1); + + error: + // There was an error fetching the timer result, so skip it and try the + // next one + continue; + } + + // No more unprocessed results + return 0; +} + +static void d3d11_timer_destroy(pl_gpu gpu, pl_timer timer) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + for (int i = 0; i < PL_ARRAY_SIZE(timer->queries); i++) { + SAFE_RELEASE(timer->queries[i].ts_start); + SAFE_RELEASE(timer->queries[i].ts_end); + SAFE_RELEASE(timer->queries[i].disjoint); + } + + pl_d3d11_flush_message_queue(ctx, "After timer destroy"); + + pl_free(timer); +} + +static pl_timer d3d11_timer_create(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + if (!p->has_timestamp_queries) + return NULL; + + struct pl_timer_t *timer = pl_alloc_ptr(NULL, timer); + *timer = (struct pl_timer_t) {0}; + return timer; +} + +static int d3d11_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + // Vulkan-style binding, where all descriptors are in the same namespace, is + // required to use SPIRV-Cross' HLSL resource mapping API, which targets + // resources by binding number + return 0; +} + +static void d3d11_gpu_flush(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + ID3D11DeviceContext_Flush(p->imm); + + pl_d3d11_flush_message_queue(ctx, "After gpu flush"); +} + +static void d3d11_gpu_finish(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + HRESULT hr; + + if (p->finish_fence) { + p->finish_value++; + D3D(ID3D11Fence_SetEventOnCompletion(p->finish_fence, p->finish_value, + p->finish_event)); + ID3D11DeviceContext4_Signal(p->imm4, p->finish_fence, p->finish_value); + ID3D11DeviceContext_Flush(p->imm); + WaitForSingleObject(p->finish_event, INFINITE); + } else { + ID3D11DeviceContext_End(p->imm, (ID3D11Asynchronous *) p->finish_query); + + // D3D11 doesn't have blocking queries, but it does have blocking + // readback. As a performance hack to try to avoid polling, do a dummy + // copy/readback between two buffers. Hopefully this will block until + // all prior commands are finished. If it does, the first GetData call + // will return a result and we won't have to poll. 
+ pl_buf_copy(gpu, p->finish_buf_dst, 0, p->finish_buf_src, 0, sizeof(uint32_t)); + pl_buf_read(gpu, p->finish_buf_dst, 0, &(uint32_t) {0}, sizeof(uint32_t)); + + // Poll the event query until it completes + for (;;) { + BOOL idle; + D3D(hr = ID3D11DeviceContext_GetData(p->imm, + (ID3D11Asynchronous *) p->finish_query, &idle, sizeof(idle), 0)); + if (hr == S_OK && idle) + break; + Sleep(1); + } + } + + pl_d3d11_flush_message_queue(ctx, "After gpu finish"); + +error: + return; +} + +static bool d3d11_gpu_is_failed(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + if (ctx->is_failed) + return true; + + // GetDeviceRemovedReason returns S_OK if the device isn't removed + HRESULT hr = ID3D11Device_GetDeviceRemovedReason(p->dev); + if (FAILED(hr)) { + ctx->is_failed = true; + pl_d3d11_after_error(ctx, hr); + } + + return ctx->is_failed; +} + +static void d3d11_gpu_destroy(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + + pl_buf_destroy(gpu, &p->finish_buf_src); + pl_buf_destroy(gpu, &p->finish_buf_dst); + + // Release everything except the immediate context + SAFE_RELEASE(p->dev); + SAFE_RELEASE(p->dev1); + SAFE_RELEASE(p->dev5); + SAFE_RELEASE(p->imm1); + SAFE_RELEASE(p->imm4); + SAFE_RELEASE(p->vbuf.buf); + SAFE_RELEASE(p->ibuf.buf); + SAFE_RELEASE(p->rstate); + SAFE_RELEASE(p->dsstate); + for (int i = 0; i < PL_TEX_SAMPLE_MODE_COUNT; i++) { + for (int j = 0; j < PL_TEX_ADDRESS_MODE_COUNT; j++) { + SAFE_RELEASE(p->samplers[i][j]); + } + } + SAFE_RELEASE(p->finish_fence); + if (p->finish_event) + CloseHandle(p->finish_event); + SAFE_RELEASE(p->finish_query); + + // Destroy the immediate context synchronously so referenced objects don't + // show up in the leak check + if (p->imm) { + ID3D11DeviceContext_ClearState(p->imm); + ID3D11DeviceContext_Flush(p->imm); + SAFE_RELEASE(p->imm); + } + + pl_spirv_destroy(&p->spirv); + pl_free((void *) gpu); +} + +pl_d3d11 pl_d3d11_get(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->destroy == d3d11_gpu_destroy) { + struct pl_gpu_d3d11 *p = (struct pl_gpu_d3d11 *) impl; + return p->ctx->d3d11; + } + + return NULL; +} + +static bool load_d3d_compiler(pl_gpu gpu) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + HMODULE d3dcompiler = NULL; + + static const struct { + const wchar_t *name; + bool inbox; + } compiler_dlls[] = { + // Try the inbox D3DCompiler first (Windows 8.1 and up) + { .name = L"d3dcompiler_47.dll", .inbox = true }, + // Check for a packaged version of d3dcompiler_47.dll + { .name = L"d3dcompiler_47.dll" }, + // Try d3dcompiler_46.dll from the Windows 8 SDK + { .name = L"d3dcompiler_46.dll" }, + // Try d3dcompiler_43.dll from the June 2010 DirectX SDK + { .name = L"d3dcompiler_43.dll" }, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(compiler_dlls); i++) { + if (compiler_dlls[i].inbox) { + if (!IsWindows8Point1OrGreater()) + continue; + d3dcompiler = LoadLibraryExW(compiler_dlls[i].name, NULL, + LOAD_LIBRARY_SEARCH_SYSTEM32); + } else { + d3dcompiler = LoadLibraryW(compiler_dlls[i].name); + } + if (!d3dcompiler) + continue; + + p->D3DCompile = (void *) GetProcAddress(d3dcompiler, "D3DCompile"); + if (!p->D3DCompile) + return false; + p->d3d_compiler_ver = pl_get_dll_version(compiler_dlls[i].name); + + return true; + } + + return false; +} + +static struct pl_gpu_fns pl_fns_d3d11 = { + .tex_create = pl_d3d11_tex_create, + .tex_destroy = pl_d3d11_tex_destroy, + .tex_invalidate = pl_d3d11_tex_invalidate, + .tex_clear_ex = pl_d3d11_tex_clear_ex, + .tex_blit = 
pl_d3d11_tex_blit, + .tex_upload = pl_d3d11_tex_upload, + .tex_download = pl_d3d11_tex_download, + .buf_create = pl_d3d11_buf_create, + .buf_destroy = pl_d3d11_buf_destroy, + .buf_write = pl_d3d11_buf_write, + .buf_read = pl_d3d11_buf_read, + .buf_copy = pl_d3d11_buf_copy, + .desc_namespace = d3d11_desc_namespace, + .pass_create = pl_d3d11_pass_create, + .pass_destroy = pl_d3d11_pass_destroy, + .pass_run = pl_d3d11_pass_run, + .timer_create = d3d11_timer_create, + .timer_destroy = d3d11_timer_destroy, + .timer_query = d3d11_timer_query, + .gpu_flush = d3d11_gpu_flush, + .gpu_finish = d3d11_gpu_finish, + .gpu_is_failed = d3d11_gpu_is_failed, + .destroy = d3d11_gpu_destroy, +}; + +pl_gpu pl_gpu_create_d3d11(struct d3d11_ctx *ctx) +{ + pl_assert(ctx->dev); + IDXGIDevice1 *dxgi_dev = NULL; + IDXGIAdapter1 *adapter = NULL; + IDXGIAdapter4 *adapter4 = NULL; + bool success = false; + HRESULT hr; + + struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_gpu_d3d11); + gpu->log = ctx->log; + + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + uint32_t spirv_ver = PL_MIN(SPV_VERSION, PL_MAX_SPIRV_VER); + *p = (struct pl_gpu_d3d11) { + .ctx = ctx, + .impl = pl_fns_d3d11, + .dev = ctx->dev, + .spirv = pl_spirv_create(ctx->log, (struct pl_spirv_version) { + .env_version = pl_spirv_version_to_vulkan(spirv_ver), + .spv_version = spirv_ver, + }), + .vbuf.bind_flags = D3D11_BIND_VERTEX_BUFFER, + .ibuf.bind_flags = D3D11_BIND_INDEX_BUFFER, + }; + if (!p->spirv) + goto error; + + ID3D11Device_AddRef(p->dev); + ID3D11Device_GetImmediateContext(p->dev, &p->imm); + + // Check D3D11.1 interfaces + hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device1, + (void **) &p->dev1); + if (SUCCEEDED(hr)) { + p->minor = 1; + ID3D11Device1_GetImmediateContext1(p->dev1, &p->imm1); + } + + // Check D3D11.4 interfaces + hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device5, + (void **) &p->dev5); + if (SUCCEEDED(hr)) { + // There is no GetImmediateContext4 method + hr = ID3D11DeviceContext_QueryInterface(p->imm, &IID_ID3D11DeviceContext4, + (void **) &p->imm4); + if (SUCCEEDED(hr)) + p->minor = 4; + } + + PL_INFO(gpu, "Using Direct3D 11.%d runtime", p->minor); + + D3D(ID3D11Device_QueryInterface(p->dev, &IID_IDXGIDevice1, (void **) &dxgi_dev)); + D3D(IDXGIDevice1_GetParent(dxgi_dev, &IID_IDXGIAdapter1, (void **) &adapter)); + + DXGI_ADAPTER_DESC1 adapter_desc = {0}; + IDXGIAdapter1_GetDesc1(adapter, &adapter_desc); + + // No resource can be larger than max_res_size in bytes + unsigned int max_res_size = PL_CLAMP( + D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_B_TERM * adapter_desc.DedicatedVideoMemory, + D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_A_TERM * 1024u * 1024u, + D3D11_REQ_RESOURCE_SIZE_IN_MEGABYTES_EXPRESSION_C_TERM * 1024u * 1024u); + + gpu->glsl = (struct pl_glsl_version) { + .version = 450, + .vulkan = true, + }; + + gpu->limits = (struct pl_gpu_limits) { + .max_buf_size = max_res_size, + .max_ssbo_size = max_res_size, + .max_vbo_size = max_res_size, + .align_vertex_stride = 1, + + // Make up some values + .align_tex_xfer_offset = 32, + .align_tex_xfer_pitch = 1, + .fragment_queues = 1, + }; + + p->fl = ID3D11Device_GetFeatureLevel(p->dev); + + // If we're not using FL9_x, we can use the same suballocated buffer as a + // vertex buffer and index buffer + if (p->fl >= D3D_FEATURE_LEVEL_10_0) + p->vbuf.bind_flags |= D3D11_BIND_INDEX_BUFFER; + + if (p->fl >= D3D_FEATURE_LEVEL_10_0) { + gpu->limits.max_ubo_size = D3D11_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * CBUF_ELEM; + } else { + // 10level9 restriction: + 
// https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context + gpu->limits.max_ubo_size = 255 * CBUF_ELEM; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + gpu->limits.max_tex_1d_dim = D3D11_REQ_TEXTURE1D_U_DIMENSION; + gpu->limits.max_tex_2d_dim = D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION; + gpu->limits.max_tex_3d_dim = D3D11_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; + } else if (p->fl >= D3D_FEATURE_LEVEL_10_0) { + gpu->limits.max_tex_1d_dim = D3D10_REQ_TEXTURE1D_U_DIMENSION; + gpu->limits.max_tex_2d_dim = D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION; + gpu->limits.max_tex_3d_dim = D3D10_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; + } else if (p->fl >= D3D_FEATURE_LEVEL_9_3) { + gpu->limits.max_tex_2d_dim = D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION; + // Same limit as FL9_1 + gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; + } else { + gpu->limits.max_tex_2d_dim = D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION; + gpu->limits.max_tex_3d_dim = D3D_FL9_1_REQ_TEXTURE3D_U_V_OR_W_DIMENSION; + } + + if (p->fl >= D3D_FEATURE_LEVEL_10_0) { + gpu->limits.max_buffer_texels = + 1 << D3D11_REQ_BUFFER_RESOURCE_TEXEL_COUNT_2_TO_EXP; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + gpu->glsl.compute = true; + gpu->limits.compute_queues = 1; + // Set `gpu->limits.blittable_1d_3d`, since `pl_tex_blit_compute`, which + // is used to emulate blits on 11_0 and up, supports 1D and 3D textures + gpu->limits.blittable_1d_3d = true; + + gpu->glsl.max_shmem_size = D3D11_CS_TGSM_REGISTER_COUNT * sizeof(float); + gpu->glsl.max_group_threads = D3D11_CS_THREAD_GROUP_MAX_THREADS_PER_GROUP; + gpu->glsl.max_group_size[0] = D3D11_CS_THREAD_GROUP_MAX_X; + gpu->glsl.max_group_size[1] = D3D11_CS_THREAD_GROUP_MAX_Y; + gpu->glsl.max_group_size[2] = D3D11_CS_THREAD_GROUP_MAX_Z; + gpu->limits.max_dispatch[0] = gpu->limits.max_dispatch[1] = + gpu->limits.max_dispatch[2] = + D3D11_CS_DISPATCH_MAX_THREAD_GROUPS_PER_DIMENSION; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + // The offset limits are defined by HLSL: + // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4-po--sm5---asm- + gpu->glsl.min_gather_offset = -32; + gpu->glsl.max_gather_offset = 31; + } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) { + // SM4.1 has no gather4_po, so the offset must be specified by an + // immediate with a range of [-8, 7] + // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/gather4--sm4-1---asm- + // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sample--sm4---asm-#address-offset + gpu->glsl.min_gather_offset = -8; + gpu->glsl.max_gather_offset = 7; + } + + if (p->fl >= D3D_FEATURE_LEVEL_10_0) { + p->max_srvs = D3D11_COMMONSHADER_INPUT_RESOURCE_SLOT_COUNT; + } else { + // 10level9 restriction: + // https://docs.microsoft.com/en-us/windows/win32/direct3d11/d3d11-graphics-reference-10level9-context + p->max_srvs = 8; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_1) { + p->max_uavs = D3D11_1_UAV_SLOT_COUNT; + } else { + p->max_uavs = D3D11_PS_CS_UAV_REGISTER_COUNT; + } + + if (!load_d3d_compiler(gpu)) { + PL_FATAL(gpu, "Could not find D3DCompiler DLL"); + goto error; + } + PL_INFO(gpu, "D3DCompiler version: %u.%u.%u.%u", + p->d3d_compiler_ver.major, p->d3d_compiler_ver.minor, + p->d3d_compiler_ver.build, p->d3d_compiler_ver.revision); + + // Detect support for timestamp queries. Some FL9_x devices don't support them. 
+ hr = ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, NULL); + p->has_timestamp_queries = SUCCEEDED(hr); + + pl_d3d11_setup_formats(gpu); + + // The rasterizer state never changes, so create it here + D3D11_RASTERIZER_DESC rdesc = { + .FillMode = D3D11_FILL_SOLID, + .CullMode = D3D11_CULL_NONE, + .FrontCounterClockwise = FALSE, + .DepthClipEnable = TRUE, // Required for 10level9 + .ScissorEnable = TRUE, + }; + D3D(ID3D11Device_CreateRasterizerState(p->dev, &rdesc, &p->rstate)); + + // The depth stencil state never changes either, and we only set it to turn + // depth testing off so the debug layer doesn't complain about an unbound + // depth buffer + D3D11_DEPTH_STENCIL_DESC dsdesc = { + .DepthEnable = FALSE, + .DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL, + .DepthFunc = D3D11_COMPARISON_LESS, + .StencilReadMask = D3D11_DEFAULT_STENCIL_READ_MASK, + .StencilWriteMask = D3D11_DEFAULT_STENCIL_WRITE_MASK, + .FrontFace = { + .StencilFailOp = D3D11_STENCIL_OP_KEEP, + .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP, + .StencilPassOp = D3D11_STENCIL_OP_KEEP, + .StencilFunc = D3D11_COMPARISON_ALWAYS, + }, + .BackFace = { + .StencilFailOp = D3D11_STENCIL_OP_KEEP, + .StencilDepthFailOp = D3D11_STENCIL_OP_KEEP, + .StencilPassOp = D3D11_STENCIL_OP_KEEP, + .StencilFunc = D3D11_COMPARISON_ALWAYS, + }, + }; + D3D(ID3D11Device_CreateDepthStencilState(p->dev, &dsdesc, &p->dsstate)); + + // Initialize the samplers + for (int sample_mode = 0; sample_mode < PL_TEX_SAMPLE_MODE_COUNT; sample_mode++) { + for (int address_mode = 0; address_mode < PL_TEX_ADDRESS_MODE_COUNT; address_mode++) { + static const D3D11_TEXTURE_ADDRESS_MODE d3d_address_mode[] = { + [PL_TEX_ADDRESS_CLAMP] = D3D11_TEXTURE_ADDRESS_CLAMP, + [PL_TEX_ADDRESS_REPEAT] = D3D11_TEXTURE_ADDRESS_WRAP, + [PL_TEX_ADDRESS_MIRROR] = D3D11_TEXTURE_ADDRESS_MIRROR, + }; + static const D3D11_FILTER d3d_filter[] = { + [PL_TEX_SAMPLE_NEAREST] = D3D11_FILTER_MIN_MAG_MIP_POINT, + [PL_TEX_SAMPLE_LINEAR] = D3D11_FILTER_MIN_MAG_MIP_LINEAR, + }; + + D3D11_SAMPLER_DESC sdesc = { + .AddressU = d3d_address_mode[address_mode], + .AddressV = d3d_address_mode[address_mode], + .AddressW = d3d_address_mode[address_mode], + .ComparisonFunc = D3D11_COMPARISON_NEVER, + .MinLOD = 0, + .MaxLOD = D3D11_FLOAT32_MAX, + .MaxAnisotropy = 1, + .Filter = d3d_filter[sample_mode], + }; + D3D(ID3D11Device_CreateSamplerState(p->dev, &sdesc, + &p->samplers[sample_mode][address_mode])); + } + } + + hr = IDXGIAdapter1_QueryInterface(adapter, &IID_IDXGIAdapter4, + (void **) &adapter4); + if (SUCCEEDED(hr)) { + DXGI_ADAPTER_DESC3 adapter_desc3 = {0}; + IDXGIAdapter4_GetDesc3(adapter4, &adapter_desc3); + + p->has_monitored_fences = + adapter_desc3.Flags & DXGI_ADAPTER_FLAG3_SUPPORT_MONITORED_FENCES; + } + + // Try to create a D3D11.4 fence object to wait on in pl_gpu_finish() + if (p->dev5 && p->has_monitored_fences) { + hr = ID3D11Device5_CreateFence(p->dev5, 0, D3D11_FENCE_FLAG_NONE, + &IID_ID3D11Fence, + (void **) &p->finish_fence); + if (SUCCEEDED(hr)) { + p->finish_event = CreateEventW(NULL, FALSE, FALSE, NULL); + if (!p->finish_event) { + PL_ERR(gpu, "Failed to create finish() event"); + goto error; + } + } + } + + // If fences are not available, we will have to poll a event query instead + if (!p->finish_fence) { + // Buffers for dummy copy/readback (see d3d11_gpu_finish()) + p->finish_buf_src = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(uint32_t), + .drawable = true, // Make these vertex buffers for 10level9 + .initial_data = &(uint32_t) 
{0x11223344}, + )); + p->finish_buf_dst = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(uint32_t), + .host_readable = true, + .drawable = true, + )); + + D3D(ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_EVENT }, &p->finish_query)); + } + + pl_d3d11_flush_message_queue(ctx, "After gpu create"); + + success = true; +error: + SAFE_RELEASE(dxgi_dev); + SAFE_RELEASE(adapter); + SAFE_RELEASE(adapter4); + if (success) { + return pl_gpu_finalize(gpu); + } else { + d3d11_gpu_destroy(gpu); + return NULL; + } +} diff --git a/src/d3d11/gpu.h b/src/d3d11/gpu.h new file mode 100644 index 0000000..cbc706a --- /dev/null +++ b/src/d3d11/gpu.h @@ -0,0 +1,212 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdalign.h> +#include <d3d11_4.h> +#include <dxgi1_6.h> +#include <d3dcompiler.h> +#include <spirv_cross_c.h> + +#include "../gpu.h" +#include "../glsl/spirv.h" + +#include "common.h" +#include "utils.h" + +pl_gpu pl_gpu_create_d3d11(struct d3d11_ctx *ctx); + +// --- pl_gpu internal structs and helpers + +// Size of one constant in a constant buffer +#define CBUF_ELEM (sizeof(float[4])) + +struct d3d_stream_buf { + UINT bind_flags; + ID3D11Buffer *buf; + size_t size; + size_t used; + unsigned int align; +}; + +struct pl_gpu_d3d11 { + struct pl_gpu_fns impl; + struct d3d11_ctx *ctx; + ID3D11Device *dev; + ID3D11Device1 *dev1; + ID3D11Device5 *dev5; + ID3D11DeviceContext *imm; + ID3D11DeviceContext1 *imm1; + ID3D11DeviceContext4 *imm4; + + // The Direct3D 11 minor version number + int minor; + + pl_spirv spirv; + + pD3DCompile D3DCompile; + struct dll_version d3d_compiler_ver; + + // Device capabilities + D3D_FEATURE_LEVEL fl; + bool has_timestamp_queries; + bool has_monitored_fences; + + int max_srvs; + int max_uavs; + + // Streaming vertex and index buffers + struct d3d_stream_buf vbuf; + struct d3d_stream_buf ibuf; + + // Shared rasterizer state + ID3D11RasterizerState *rstate; + + // Shared depth-stencil state + ID3D11DepthStencilState *dsstate; + + // Array of ID3D11SamplerStates for every combination of sample/address modes + ID3D11SamplerState *samplers[PL_TEX_SAMPLE_MODE_COUNT][PL_TEX_ADDRESS_MODE_COUNT]; + + // Resources for finish() + ID3D11Fence *finish_fence; + uint64_t finish_value; + HANDLE finish_event; + ID3D11Query *finish_query; + pl_buf finish_buf_src; + pl_buf finish_buf_dst; +}; + +void pl_d3d11_setup_formats(struct pl_gpu_t *gpu); + +void pl_d3d11_timer_start(pl_gpu gpu, pl_timer timer); +void pl_d3d11_timer_end(pl_gpu gpu, pl_timer timer); + +struct pl_buf_d3d11 { + ID3D11Buffer *buf; + ID3D11Buffer *staging; + ID3D11ShaderResourceView *raw_srv; + ID3D11UnorderedAccessView *raw_uav; + ID3D11ShaderResourceView *texel_srv; + ID3D11UnorderedAccessView *texel_uav; + + char *data; + bool dirty; +}; + +void pl_d3d11_buf_destroy(pl_gpu gpu, pl_buf buf); +pl_buf 
pl_d3d11_buf_create(pl_gpu gpu, const struct pl_buf_params *params); +void pl_d3d11_buf_write(pl_gpu gpu, pl_buf buf, size_t offset, const void *data, + size_t size); +bool pl_d3d11_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, + size_t size); +void pl_d3d11_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, pl_buf src, + size_t src_offset, size_t size); + +// Ensure a buffer is up-to-date with its system memory mirror before it is used +void pl_d3d11_buf_resolve(pl_gpu gpu, pl_buf buf); + +struct pl_tex_d3d11 { + // res mirrors one of tex1d, tex2d or tex3d for convenience. It does not + // hold an additional reference to the texture object. + ID3D11Resource *res; + + ID3D11Texture1D *tex1d; + ID3D11Texture2D *tex2d; + ID3D11Texture3D *tex3d; + int array_slice; + + // Mirrors one of staging1d, staging2d, or staging3d, and doesn't hold a ref + ID3D11Resource *staging; + + // Staging textures for pl_tex_download + ID3D11Texture1D *staging1d; + ID3D11Texture2D *staging2d; + ID3D11Texture3D *staging3d; + + ID3D11ShaderResourceView *srv; + ID3D11RenderTargetView *rtv; + ID3D11UnorderedAccessView *uav; + + // for tex_upload/download fallback code + pl_fmt texel_fmt; +}; + +void pl_d3d11_tex_destroy(pl_gpu gpu, pl_tex tex); +pl_tex pl_d3d11_tex_create(pl_gpu gpu, const struct pl_tex_params *params); +void pl_d3d11_tex_invalidate(pl_gpu gpu, pl_tex tex); +void pl_d3d11_tex_clear_ex(pl_gpu gpu, pl_tex tex, + const union pl_clear_color color); +void pl_d3d11_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params); +bool pl_d3d11_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params); +bool pl_d3d11_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// Constant buffer layout used for gl_NumWorkGroups emulation +struct d3d_num_workgroups_buf { + alignas(CBUF_ELEM) uint32_t num_wgs[3]; +}; + +enum { + HLSL_BINDING_NOT_USED = -1, // Slot should always be bound as NULL + HLSL_BINDING_NUM_WORKGROUPS = -2, // Slot used for gl_NumWorkGroups emulation +}; + +// Represents a specific shader stage in a pl_pass (VS, PS, CS) +struct d3d_pass_stage { + // Lists for each resource type, to simplify binding in pl_pass_run. Indexes + // match the index of the arrays passed to the ID3D11DeviceContext methods. + // Entries are the index of pass->params.descriptors which should be bound + // in that position, or a HLSL_BINDING_* special value. 
+ PL_ARRAY(int) cbvs; + PL_ARRAY(int) srvs; + PL_ARRAY(int) samplers; +}; + +struct pl_pass_d3d11 { + ID3D11PixelShader *ps; + ID3D11VertexShader *vs; + ID3D11ComputeShader *cs; + ID3D11InputLayout *layout; + ID3D11BlendState *bstate; + + // gl_NumWorkGroups emulation + struct d3d_num_workgroups_buf last_num_wgs; + ID3D11Buffer *num_workgroups_buf; + bool num_workgroups_used; + + // Maximum binding number + int max_binding; + + struct d3d_pass_stage main; // PS and CS + struct d3d_pass_stage vertex; + + // List of resources, as in `struct pass_stage`, except UAVs are shared + // between all shader stages + PL_ARRAY(int) uavs; + + // Pre-allocated resource arrays to use in pl_pass_run + ID3D11Buffer **cbv_arr; + ID3D11ShaderResourceView **srv_arr; + ID3D11SamplerState **sampler_arr; + ID3D11UnorderedAccessView **uav_arr; +}; + +void pl_d3d11_pass_destroy(pl_gpu gpu, pl_pass pass); +const struct pl_pass_t *pl_d3d11_pass_create(pl_gpu gpu, + const struct pl_pass_params *params); +void pl_d3d11_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params); diff --git a/src/d3d11/gpu_buf.c b/src/d3d11/gpu_buf.c new file mode 100644 index 0000000..955e6e1 --- /dev/null +++ b/src/d3d11/gpu_buf.c @@ -0,0 +1,310 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "formats.h" + +void pl_d3d11_buf_destroy(pl_gpu gpu, pl_buf buf) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + + SAFE_RELEASE(buf_p->buf); + SAFE_RELEASE(buf_p->staging); + SAFE_RELEASE(buf_p->raw_srv); + SAFE_RELEASE(buf_p->raw_uav); + SAFE_RELEASE(buf_p->texel_srv); + SAFE_RELEASE(buf_p->texel_uav); + + pl_d3d11_flush_message_queue(ctx, "After buffer destroy"); + + pl_free((void *) buf); +} + +pl_buf pl_d3d11_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_d3d11); + buf->params = *params; + buf->params.initial_data = NULL; + + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + + D3D11_BUFFER_DESC desc = { .ByteWidth = params->size }; + + if (params->uniform && !params->format && + (params->storable || params->drawable)) + { + // TODO: Figure out what to do with these + PL_ERR(gpu, "Uniform buffers cannot share any other buffer type"); + goto error; + } + + // TODO: Distinguish between uniform buffers and texel uniform buffers. + // Currently we assume that if uniform and format are set, it's a texel + // buffer and NOT a uniform buffer. 
+ if (params->uniform && !params->format) { + desc.BindFlags |= D3D11_BIND_CONSTANT_BUFFER; + desc.ByteWidth = PL_ALIGN2(desc.ByteWidth, CBUF_ELEM); + } + if (params->uniform && params->format) { + desc.BindFlags |= D3D11_BIND_SHADER_RESOURCE; + } + if (params->storable) { + desc.BindFlags |= D3D11_BIND_UNORDERED_ACCESS + | D3D11_BIND_SHADER_RESOURCE; + desc.ByteWidth = PL_ALIGN2(desc.ByteWidth, sizeof(float)); + desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; + } + if (params->drawable) { + desc.BindFlags |= D3D11_BIND_VERTEX_BUFFER; + + // In FL9_x, a vertex buffer can't also be an index buffer, so index + // buffers are unsupported in FL9_x for now + if (p->fl > D3D_FEATURE_LEVEL_9_3) + desc.BindFlags |= D3D11_BIND_INDEX_BUFFER; + } + + char *data = NULL; + + // D3D11 doesn't allow partial constant buffer updates without special + // conditions. To support partial buffer updates, keep a mirror of the + // buffer data in system memory and upload the whole thing before the buffer + // is used. + // + // Note: We don't use a staging buffer for this because of Intel. + // https://github.com/mpv-player/mpv/issues/5293 + // https://crbug.com/593024 + if (params->uniform && !params->format && params->host_writable) { + data = pl_zalloc(buf, desc.ByteWidth); + buf_p->data = data; + } + + D3D11_SUBRESOURCE_DATA srdata = { 0 }; + if (params->initial_data) { + if (desc.ByteWidth != params->size) { + // If the size had to be rounded-up, uploading from + // params->initial_data is technically undefined behavior, so copy + // the initial data to an allocation first + if (!data) + data = pl_zalloc(buf, desc.ByteWidth); + srdata.pSysMem = data; + } else { + srdata.pSysMem = params->initial_data; + } + + if (data) + memcpy(data, params->initial_data, params->size); + } + + D3D(ID3D11Device_CreateBuffer(p->dev, &desc, + params->initial_data ? 
&srdata : NULL, + &buf_p->buf)); + + if (!buf_p->data) + pl_free(data); + + // Create raw views for PL_DESC_BUF_STORAGE + if (params->storable) { + // A SRV is used for PL_DESC_ACCESS_READONLY + D3D11_SHADER_RESOURCE_VIEW_DESC sdesc = { + .Format = DXGI_FORMAT_R32_TYPELESS, + .ViewDimension = D3D11_SRV_DIMENSION_BUFFEREX, + .BufferEx = { + .NumElements = + PL_ALIGN2(buf->params.size, sizeof(float)) / sizeof(float), + .Flags = D3D11_BUFFEREX_SRV_FLAG_RAW, + }, + }; + D3D(ID3D11Device_CreateShaderResourceView(p->dev, + (ID3D11Resource *) buf_p->buf, &sdesc, &buf_p->raw_srv)); + + // A UAV is used for all other access modes + D3D11_UNORDERED_ACCESS_VIEW_DESC udesc = { + .Format = DXGI_FORMAT_R32_TYPELESS, + .ViewDimension = D3D11_UAV_DIMENSION_BUFFER, + .Buffer = { + .NumElements = + PL_ALIGN2(buf->params.size, sizeof(float)) / sizeof(float), + .Flags = D3D11_BUFFER_UAV_FLAG_RAW, + }, + }; + D3D(ID3D11Device_CreateUnorderedAccessView(p->dev, + (ID3D11Resource *) buf_p->buf, &udesc, &buf_p->raw_uav)); + } + + // Create a typed SRV for PL_BUF_TEXEL_UNIFORM and PL_BUF_TEXEL_STORAGE + if (params->format) { + if (params->uniform) { + D3D11_SHADER_RESOURCE_VIEW_DESC sdesc = { + .Format = fmt_to_dxgi(params->format), + .ViewDimension = D3D11_SRV_DIMENSION_BUFFER, + .Buffer = { + .NumElements = + PL_ALIGN(buf->params.size, buf->params.format->texel_size) + / buf->params.format->texel_size, + }, + }; + D3D(ID3D11Device_CreateShaderResourceView(p->dev, + (ID3D11Resource *) buf_p->buf, &sdesc, &buf_p->texel_srv)); + } + + // Create a typed UAV for PL_BUF_TEXEL_STORAGE + if (params->storable) { + D3D11_UNORDERED_ACCESS_VIEW_DESC udesc = { + .Format = fmt_to_dxgi(buf->params.format), + .ViewDimension = D3D11_UAV_DIMENSION_BUFFER, + .Buffer = { + .NumElements = + PL_ALIGN(buf->params.size, buf->params.format->texel_size) + / buf->params.format->texel_size, + }, + }; + D3D(ID3D11Device_CreateUnorderedAccessView(p->dev, + (ID3D11Resource *) buf_p->buf, &udesc, &buf_p->texel_uav)); + } + } + + + if (!buf_p->data) { + // Create the staging buffer regardless of whether params->host_readable + // is set or not, so that buf_copy can copy to system-memory-backed + // buffers + // TODO: Consider sharing a big staging buffer for this, rather than + // having one staging buffer per buffer + desc.BindFlags = 0; + desc.MiscFlags = 0; + desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc.Usage = D3D11_USAGE_STAGING; + D3D(ID3D11Device_CreateBuffer(p->dev, &desc, NULL, &buf_p->staging)); + } + + pl_d3d11_flush_message_queue(ctx, "After buffer create"); + + return buf; + +error: + pl_d3d11_buf_destroy(gpu, buf); + return NULL; +} + +void pl_d3d11_buf_write(pl_gpu gpu, pl_buf buf, size_t offset, const void *data, + size_t size) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + + if (buf_p->data) { + memcpy(buf_p->data + offset, data, size); + buf_p->dirty = true; + } else { + ID3D11DeviceContext_UpdateSubresource(p->imm, + (ID3D11Resource *) buf_p->buf, 0, (&(D3D11_BOX) { + .left = offset, + .top = 0, + .front = 0, + .right = offset + size, + .bottom = 1, + .back = 1, + }), data, 0, 0); + } +} + +void pl_d3d11_buf_resolve(pl_gpu gpu, pl_buf buf) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + + if (!buf_p->data || !buf_p->dirty) + return; + + ID3D11DeviceContext_UpdateSubresource(p->imm, (ID3D11Resource *) buf_p->buf, + 0, NULL, buf_p->data, 0, 0); +} + +bool pl_d3d11_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, + size_t 
size) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + + // If there is a system-memory mirror of the buffer contents, use it + if (buf_p->data) { + memcpy(dest, buf_p->data + offset, size); + return true; + } + + ID3D11DeviceContext_CopyResource(p->imm, (ID3D11Resource *) buf_p->staging, + (ID3D11Resource *) buf_p->buf); + + D3D11_MAPPED_SUBRESOURCE lock; + D3D(ID3D11DeviceContext_Map(p->imm, (ID3D11Resource *) buf_p->staging, 0, + D3D11_MAP_READ, 0, &lock)); + + char *csrc = lock.pData; + memcpy(dest, csrc + offset, size); + + ID3D11DeviceContext_Unmap(p->imm, (ID3D11Resource *) buf_p->staging, 0); + + pl_d3d11_flush_message_queue(ctx, "After buffer read"); + + return true; + +error: + return false; +} + +void pl_d3d11_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, pl_buf src, + size_t src_offset, size_t size) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_buf_d3d11 *src_p = PL_PRIV(src); + struct pl_buf_d3d11 *dst_p = PL_PRIV(dst); + + // Handle system memory copies in case one or both of the buffers has a + // system memory mirror + if (src_p->data && dst_p->data) { + memcpy(dst_p->data + dst_offset, src_p->data + src_offset, size); + dst_p->dirty = true; + } else if (src_p->data) { + pl_d3d11_buf_write(gpu, dst, dst_offset, src_p->data + src_offset, size); + } else if (dst_p->data) { + if (pl_d3d11_buf_read(gpu, src, src_offset, dst_p->data + dst_offset, size)) { + dst_p->dirty = true; + } else { + PL_ERR(gpu, "Failed to read from GPU during buffer copy"); + } + } else { + ID3D11DeviceContext_CopySubresourceRegion(p->imm, + (ID3D11Resource *) dst_p->buf, 0, dst_offset, 0, 0, + (ID3D11Resource *) src_p->buf, 0, (&(D3D11_BOX) { + .left = src_offset, + .top = 0, + .front = 0, + .right = src_offset + size, + .bottom = 1, + .back = 1, + })); + } + + pl_d3d11_flush_message_queue(ctx, "After buffer copy"); +} diff --git a/src/d3d11/gpu_pass.c b/src/d3d11/gpu_pass.c new file mode 100644 index 0000000..0e46ccd --- /dev/null +++ b/src/d3d11/gpu_pass.c @@ -0,0 +1,1293 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "formats.h" +#include "glsl/spirv.h" +#include "../cache.h" + +struct stream_buf_slice { + const void *data; + unsigned int size; + unsigned int offset; +}; + +// Upload one or more slices of single-use data to a suballocated dynamic +// buffer. Only call this once per-buffer per-pass, since it will discard or +// reallocate the buffer when full. 
+static bool stream_buf_upload(pl_gpu gpu, struct d3d_stream_buf *stream, + struct stream_buf_slice *slices, int num_slices) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + unsigned int align = PL_DEF(stream->align, sizeof(float)); + + // Get total size, rounded up to the buffer's alignment + size_t size = 0; + for (int i = 0; i < num_slices; i++) + size += PL_ALIGN2(slices[i].size, align); + + if (size > gpu->limits.max_buf_size) { + PL_ERR(gpu, "Streaming buffer is too large"); + return -1; + } + + // If the data doesn't fit, realloc the buffer + if (size > stream->size) { + size_t new_size = stream->size; + // Arbitrary base size + if (!new_size) + new_size = 16 * 1024; + while (new_size < size) + new_size *= 2; + new_size = PL_MIN(new_size, gpu->limits.max_buf_size); + + ID3D11Buffer *new_buf; + D3D11_BUFFER_DESC vbuf_desc = { + .ByteWidth = new_size, + .Usage = D3D11_USAGE_DYNAMIC, + .BindFlags = stream->bind_flags, + .CPUAccessFlags = D3D11_CPU_ACCESS_WRITE, + }; + D3D(ID3D11Device_CreateBuffer(p->dev, &vbuf_desc, NULL, &new_buf)); + + SAFE_RELEASE(stream->buf); + stream->buf = new_buf; + stream->size = new_size; + stream->used = 0; + } + + bool discard = false; + size_t offset = stream->used; + if (offset + size > stream->size) { + // We reached the end of the buffer, so discard and wrap around + discard = true; + offset = 0; + } + + D3D11_MAPPED_SUBRESOURCE map = {0}; + UINT type = discard ? D3D11_MAP_WRITE_DISCARD : D3D11_MAP_WRITE_NO_OVERWRITE; + D3D(ID3D11DeviceContext_Map(p->imm, (ID3D11Resource *) stream->buf, 0, type, + 0, &map)); + + // Upload each slice + char *cdata = map.pData; + stream->used = offset; + for (int i = 0; i < num_slices; i++) { + slices[i].offset = stream->used; + memcpy(cdata + slices[i].offset, slices[i].data, slices[i].size); + stream->used += PL_ALIGN2(slices[i].size, align); + } + + ID3D11DeviceContext_Unmap(p->imm, (ID3D11Resource *) stream->buf, 0); + + return true; + +error: + return false; +} + +static const char *get_shader_target(pl_gpu gpu, enum glsl_shader_stage stage) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + switch (p->fl) { + default: + switch (stage) { + case GLSL_SHADER_VERTEX: return "vs_5_0"; + case GLSL_SHADER_FRAGMENT: return "ps_5_0"; + case GLSL_SHADER_COMPUTE: return "cs_5_0"; + } + break; + case D3D_FEATURE_LEVEL_10_1: + switch (stage) { + case GLSL_SHADER_VERTEX: return "vs_4_1"; + case GLSL_SHADER_FRAGMENT: return "ps_4_1"; + case GLSL_SHADER_COMPUTE: return "cs_4_1"; + } + break; + case D3D_FEATURE_LEVEL_10_0: + switch (stage) { + case GLSL_SHADER_VERTEX: return "vs_4_0"; + case GLSL_SHADER_FRAGMENT: return "ps_4_0"; + case GLSL_SHADER_COMPUTE: return "cs_4_0"; + } + break; + case D3D_FEATURE_LEVEL_9_3: + switch (stage) { + case GLSL_SHADER_VERTEX: return "vs_4_0_level_9_3"; + case GLSL_SHADER_FRAGMENT: return "ps_4_0_level_9_3"; + case GLSL_SHADER_COMPUTE: return NULL; + } + break; + case D3D_FEATURE_LEVEL_9_2: + case D3D_FEATURE_LEVEL_9_1: + switch (stage) { + case GLSL_SHADER_VERTEX: return "vs_4_0_level_9_1"; + case GLSL_SHADER_FRAGMENT: return "ps_4_0_level_9_1"; + case GLSL_SHADER_COMPUTE: return NULL; + } + break; + } + return NULL; +} + +static SpvExecutionModel stage_to_spv(enum glsl_shader_stage stage) +{ + static const SpvExecutionModel spv_execution_model[] = { + [GLSL_SHADER_VERTEX] = SpvExecutionModelVertex, + [GLSL_SHADER_FRAGMENT] = SpvExecutionModelFragment, + [GLSL_SHADER_COMPUTE] = SpvExecutionModelGLCompute, + }; + return spv_execution_model[stage]; +} + +#define SC(cmd) \ 
+ do { \ + spvc_result res = (cmd); \ + if (res != SPVC_SUCCESS) { \ + PL_ERR(gpu, "%s: %s (%d) (%s:%d)", \ + #cmd, sc ? spvc_context_get_last_error_string(sc) : "", \ + res, __FILE__, __LINE__); \ + goto error; \ + } \ + } while (0) + +// Some decorations, like SpvDecorationNonWritable, are actually found on the +// members of a buffer block, rather than the buffer block itself. If all +// members have a certain decoration, SPIRV-Cross considers it to apply to the +// buffer block too, which determines things like whether a SRV or UAV is used +// for an SSBO. This function checks if SPIRV-Cross considers a decoration to +// apply to a buffer block. +static spvc_result buffer_block_has_decoration(spvc_compiler sc_comp, + spvc_variable_id id, + SpvDecoration decoration, + bool *out) +{ + const SpvDecoration *decorations; + size_t num_decorations = 0; + + spvc_result res = spvc_compiler_get_buffer_block_decorations(sc_comp, id, + &decorations, &num_decorations); + if (res != SPVC_SUCCESS) + return res; + + for (size_t j = 0; j < num_decorations; j++) { + if (decorations[j] == decoration) { + *out = true; + return res; + } + } + + *out = false; + return res; +} + +static bool alloc_hlsl_reg_bindings(pl_gpu gpu, pl_pass pass, + struct d3d_pass_stage *pass_s, + spvc_context sc, + spvc_compiler sc_comp, + spvc_resources resources, + spvc_resource_type res_type, + enum glsl_shader_stage stage) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + const spvc_reflected_resource *res_list; + size_t res_count; + + SC(spvc_resources_get_resource_list_for_type(resources, res_type, + &res_list, &res_count)); + + // In a raster pass, one of the UAV slots is used by the runtime for the RTV + int uav_offset = stage == GLSL_SHADER_COMPUTE ? 
0 : 1; + int max_uavs = p->max_uavs - uav_offset; + + for (int i = 0; i < res_count; i++) { + unsigned int binding = spvc_compiler_get_decoration(sc_comp, + res_list[i].id, SpvDecorationBinding); + unsigned int descriptor_set = spvc_compiler_get_decoration(sc_comp, + res_list[i].id, SpvDecorationDescriptorSet); + if (descriptor_set != 0) + continue; + + pass_p->max_binding = PL_MAX(pass_p->max_binding, binding); + + spvc_hlsl_resource_binding hlslbind; + spvc_hlsl_resource_binding_init(&hlslbind); + hlslbind.stage = stage_to_spv(stage); + hlslbind.binding = binding; + hlslbind.desc_set = descriptor_set; + + bool has_cbv = false, has_sampler = false, has_srv = false, has_uav = false; + switch (res_type) { + case SPVC_RESOURCE_TYPE_UNIFORM_BUFFER: + has_cbv = true; + break; + case SPVC_RESOURCE_TYPE_STORAGE_BUFFER:; + bool non_writable_bb = false; + SC(buffer_block_has_decoration(sc_comp, res_list[i].id, + SpvDecorationNonWritable, &non_writable_bb)); + if (non_writable_bb) { + has_srv = true; + } else { + has_uav = true; + } + break; + case SPVC_RESOURCE_TYPE_STORAGE_IMAGE:; + bool non_writable = spvc_compiler_has_decoration(sc_comp, + res_list[i].id, SpvDecorationNonWritable); + if (non_writable) { + has_srv = true; + } else { + has_uav = true; + } + break; + case SPVC_RESOURCE_TYPE_SEPARATE_IMAGE: + has_srv = true; + break; + case SPVC_RESOURCE_TYPE_SAMPLED_IMAGE:; + spvc_type type = spvc_compiler_get_type_handle(sc_comp, + res_list[i].type_id); + SpvDim dimension = spvc_type_get_image_dimension(type); + // Uniform texel buffers are technically sampled images, but they + // aren't sampled from, so don't allocate a sampler + if (dimension != SpvDimBuffer) + has_sampler = true; + has_srv = true; + break; + default: + break; + } + + if (has_cbv) { + hlslbind.cbv.register_binding = pass_s->cbvs.num; + PL_ARRAY_APPEND(pass, pass_s->cbvs, binding); + if (pass_s->cbvs.num > D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT) { + PL_ERR(gpu, "Too many constant buffers in shader"); + goto error; + } + } + + if (has_sampler) { + hlslbind.sampler.register_binding = pass_s->samplers.num; + PL_ARRAY_APPEND(pass, pass_s->samplers, binding); + if (pass_s->samplers.num > D3D11_COMMONSHADER_SAMPLER_SLOT_COUNT) { + PL_ERR(gpu, "Too many samplers in shader"); + goto error; + } + } + + if (has_srv) { + hlslbind.srv.register_binding = pass_s->srvs.num; + PL_ARRAY_APPEND(pass, pass_s->srvs, binding); + if (pass_s->srvs.num > p->max_srvs) { + PL_ERR(gpu, "Too many SRVs in shader"); + goto error; + } + } + + if (has_uav) { + // UAV registers are shared between the vertex and fragment shaders + // in a raster pass, so check if the UAV for this resource has + // already been allocated + bool uav_bound = false; + for (int j = 0; j < pass_p->uavs.num; j++) { + if (pass_p->uavs.elem[j] == binding) { + uav_bound = true; + break; + } + } + + if (!uav_bound) { + hlslbind.uav.register_binding = pass_p->uavs.num + uav_offset; + PL_ARRAY_APPEND(pass, pass_p->uavs, binding); + if (pass_p->uavs.num > max_uavs) { + PL_ERR(gpu, "Too many UAVs in shader"); + goto error; + } + } + } + + SC(spvc_compiler_hlsl_add_resource_binding(sc_comp, &hlslbind)); + } + + return true; +error: + return false; +} + +static const char *shader_names[] = { + [GLSL_SHADER_VERTEX] = "vertex", + [GLSL_SHADER_FRAGMENT] = "fragment", + [GLSL_SHADER_COMPUTE] = "compute", +}; + +static ID3DBlob *shader_compile_glsl(pl_gpu gpu, pl_pass pass, + struct d3d_pass_stage *pass_s, + enum glsl_shader_stage stage, + const char *glsl) +{ + struct pl_gpu_d3d11 *p = 
PL_PRIV(gpu); + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + void *tmp = pl_tmp(NULL); + spvc_context sc = NULL; + spvc_compiler sc_comp = NULL; + const char *hlsl = NULL; + ID3DBlob *out = NULL; + ID3DBlob *errors = NULL; + HRESULT hr; + + pl_clock_t start = pl_clock_now(); + pl_str spirv = pl_spirv_compile_glsl(p->spirv, tmp, gpu->glsl, stage, glsl); + if (!spirv.len) + goto error; + + pl_clock_t after_glsl = pl_clock_now(); + pl_log_cpu_time(gpu->log, start, after_glsl, "translating GLSL to SPIR-V"); + + SC(spvc_context_create(&sc)); + + spvc_parsed_ir sc_ir; + SC(spvc_context_parse_spirv(sc, (SpvId *) spirv.buf, + spirv.len / sizeof(SpvId), &sc_ir)); + + SC(spvc_context_create_compiler(sc, SPVC_BACKEND_HLSL, sc_ir, + SPVC_CAPTURE_MODE_TAKE_OWNERSHIP, + &sc_comp)); + + spvc_compiler_options sc_opts; + SC(spvc_compiler_create_compiler_options(sc_comp, &sc_opts)); + + int sc_shader_model; + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + sc_shader_model = 50; + } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) { + sc_shader_model = 41; + } else { + sc_shader_model = 40; + } + + SC(spvc_compiler_options_set_uint(sc_opts, + SPVC_COMPILER_OPTION_HLSL_SHADER_MODEL, sc_shader_model)); + + // Unlike Vulkan and OpenGL, in D3D11, the clip-space is "flipped" with + // respect to framebuffer-space. In other words, if you render to a pixel at + // (0, -1), you have to sample from (0, 1) to get the value back. We unflip + // it by setting the following option, which inserts the equivalent of + // `gl_Position.y = -gl_Position.y` into the vertex shader + if (stage == GLSL_SHADER_VERTEX) { + SC(spvc_compiler_options_set_bool(sc_opts, + SPVC_COMPILER_OPTION_FLIP_VERTEX_Y, SPVC_TRUE)); + } + + // Bind readonly images and imageBuffers as SRVs. This is done because a lot + // of hardware (especially FL11_x hardware) has very poor format support for + // reading values from UAVs. It allows the common case of readonly and + // writeonly images to support more formats, though the less common case of + // readwrite images still requires format support for UAV loads (represented + // by the PL_FMT_CAP_READWRITE cap in libplacebo.) + // + // Note that setting this option comes at the cost of GLSL support. Readonly + // and readwrite images are the same type in GLSL, but SRV and UAV bound + // textures are different types in HLSL, so for example, a GLSL function + // with an image parameter may fail to compile as HLSL if it's called with a + // readonly image and a readwrite image at different call sites. 
+ SC(spvc_compiler_options_set_bool(sc_opts, + SPVC_COMPILER_OPTION_HLSL_NONWRITABLE_UAV_TEXTURE_AS_SRV, SPVC_TRUE)); + + SC(spvc_compiler_install_compiler_options(sc_comp, sc_opts)); + + spvc_set active = NULL; + SC(spvc_compiler_get_active_interface_variables(sc_comp, &active)); + spvc_resources resources = NULL; + SC(spvc_compiler_create_shader_resources_for_active_variables( + sc_comp, &resources, active)); + + // Allocate HLSL registers for each resource type + alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources, + SPVC_RESOURCE_TYPE_SAMPLED_IMAGE, stage); + alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources, + SPVC_RESOURCE_TYPE_SEPARATE_IMAGE, stage); + alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources, + SPVC_RESOURCE_TYPE_UNIFORM_BUFFER, stage); + alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources, + SPVC_RESOURCE_TYPE_STORAGE_BUFFER, stage); + alloc_hlsl_reg_bindings(gpu, pass, pass_s, sc, sc_comp, resources, + SPVC_RESOURCE_TYPE_STORAGE_IMAGE, stage); + + if (stage == GLSL_SHADER_COMPUTE) { + // Check if the gl_NumWorkGroups builtin is used. If it is, we have to + // emulate it with a constant buffer, so allocate it a CBV register. + spvc_variable_id num_workgroups_id = + spvc_compiler_hlsl_remap_num_workgroups_builtin(sc_comp); + if (num_workgroups_id) { + pass_p->num_workgroups_used = true; + + spvc_hlsl_resource_binding binding; + spvc_hlsl_resource_binding_init(&binding); + binding.stage = stage_to_spv(stage); + binding.binding = pass_p->max_binding + 1; + + // Allocate a CBV register for the buffer + binding.cbv.register_binding = pass_s->cbvs.num; + PL_ARRAY_APPEND(pass, pass_s->cbvs, HLSL_BINDING_NUM_WORKGROUPS); + if (pass_s->cbvs.num > + D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT) { + PL_ERR(gpu, "Not enough constant buffer slots for gl_NumWorkGroups"); + goto error; + } + + spvc_compiler_set_decoration(sc_comp, num_workgroups_id, + SpvDecorationDescriptorSet, 0); + spvc_compiler_set_decoration(sc_comp, num_workgroups_id, + SpvDecorationBinding, binding.binding); + + SC(spvc_compiler_hlsl_add_resource_binding(sc_comp, &binding)); + } + } + + SC(spvc_compiler_compile(sc_comp, &hlsl)); + + pl_clock_t after_spvc = pl_clock_now(); + pl_log_cpu_time(gpu->log, after_glsl, after_spvc, "translating SPIR-V to HLSL"); + + hr = p->D3DCompile(hlsl, strlen(hlsl), NULL, NULL, NULL, "main", + get_shader_target(gpu, stage), + D3DCOMPILE_SKIP_VALIDATION | D3DCOMPILE_OPTIMIZATION_LEVEL3, 0, &out, + &errors); + if (FAILED(hr)) { + SAFE_RELEASE(out); + PL_ERR(gpu, "D3DCompile failed: %s\n%.*s", pl_hresult_to_str(hr), + (int) ID3D10Blob_GetBufferSize(errors), + (char *) ID3D10Blob_GetBufferPointer(errors)); + goto error; + } + + pl_log_cpu_time(gpu->log, after_spvc, pl_clock_now(), "translating HLSL to DXBC"); + +error:; + if (hlsl) { + int level = out ? 
PL_LOG_DEBUG : PL_LOG_ERR; + PL_MSG(gpu, level, "%s shader HLSL source:", shader_names[stage]); + pl_msg_source(gpu->log, level, hlsl); + } + + if (sc) + spvc_context_destroy(sc); + SAFE_RELEASE(errors); + pl_free(tmp); + return out; +} + +struct d3d11_cache_header { + uint64_t hash; + bool num_workgroups_used; + int num_main_cbvs; + int num_main_srvs; + int num_main_samplers; + int num_vertex_cbvs; + int num_vertex_srvs; + int num_vertex_samplers; + int num_uavs; + size_t vert_bc_len; + size_t frag_bc_len; + size_t comp_bc_len; +}; + +static inline uint64_t pass_cache_signature(pl_gpu gpu, uint64_t *key, + const struct pl_pass_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + + uint64_t hash = CACHE_KEY_D3D_DXBC; // seed to uniquely identify d3d11 shaders + + pl_hash_merge(&hash, pl_str0_hash(params->glsl_shader)); + if (params->type == PL_PASS_RASTER) + pl_hash_merge(&hash, pl_str0_hash(params->vertex_shader)); + + // store hash based on the shader bodys as the lookup key + if (key) + *key = hash; + + // and add the compiler version information into the verification signature + pl_hash_merge(&hash, p->spirv->signature); + + unsigned spvc_major, spvc_minor, spvc_patch; + spvc_get_version(&spvc_major, &spvc_minor, &spvc_patch); + + pl_hash_merge(&hash, spvc_major); + pl_hash_merge(&hash, spvc_minor); + pl_hash_merge(&hash, spvc_patch); + + pl_hash_merge(&hash, ((uint64_t)p->d3d_compiler_ver.major << 48) + | ((uint64_t)p->d3d_compiler_ver.minor << 32) + | ((uint64_t)p->d3d_compiler_ver.build << 16) + | (uint64_t)p->d3d_compiler_ver.revision); + pl_hash_merge(&hash, p->fl); + + return hash; +} + +static inline size_t cache_payload_size(struct d3d11_cache_header *header) +{ + size_t required = (header->num_main_cbvs + header->num_main_srvs + + header->num_main_samplers + header->num_vertex_cbvs + + header->num_vertex_srvs + header->num_vertex_samplers + + header->num_uavs) * sizeof(int) + header->vert_bc_len + + header->frag_bc_len + header->comp_bc_len; + + return required; +} + +static bool d3d11_use_cached_program(pl_gpu gpu, struct pl_pass_t *pass, + const struct pl_pass_params *params, + pl_cache_obj *obj, uint64_t *out_sig, + pl_str *vert_bc, pl_str *frag_bc, pl_str *comp_bc) +{ + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + const pl_cache gpu_cache = pl_gpu_cache(gpu); + if (!gpu_cache) + return false; + + *out_sig = pass_cache_signature(gpu, &obj->key, params); + if (!pl_cache_get(gpu_cache, obj)) + return false; + + pl_str cache = (pl_str) { obj->data, obj->size }; + if (cache.len < sizeof(struct d3d11_cache_header)) + return false; + + struct d3d11_cache_header *header = (struct d3d11_cache_header *) cache.buf; + cache = pl_str_drop(cache, sizeof(*header)); + + if (header->hash != *out_sig) + return false; + + // determine required cache size before reading anything + size_t required = cache_payload_size(header); + + if (cache.len < required) + return false; + + pass_p->num_workgroups_used = header->num_workgroups_used; + +#define GET_ARRAY(object, name, num_elems) \ + do { \ + PL_ARRAY_MEMDUP(pass, (object)->name, cache.buf, num_elems); \ + cache = pl_str_drop(cache, num_elems * sizeof(*(object)->name.elem)); \ + } while (0) + +#define GET_STAGE_ARRAY(stage, name) \ + GET_ARRAY(&pass_p->stage, name, header->num_##stage##_##name) + + GET_STAGE_ARRAY(main, cbvs); + GET_STAGE_ARRAY(main, srvs); + GET_STAGE_ARRAY(main, samplers); + GET_STAGE_ARRAY(vertex, cbvs); + GET_STAGE_ARRAY(vertex, srvs); + GET_STAGE_ARRAY(vertex, samplers); + GET_ARRAY(pass_p, uavs, 
header->num_uavs); + +#define GET_SHADER(ptr) \ + do { \ + if (ptr) \ + *ptr = pl_str_take(cache, header->ptr##_len); \ + cache = pl_str_drop(cache, header->ptr##_len); \ + } while (0) + + GET_SHADER(vert_bc); + GET_SHADER(frag_bc); + GET_SHADER(comp_bc); + + return true; +} + +static void d3d11_update_program_cache(pl_gpu gpu, struct pl_pass_t *pass, + uint64_t key, uint64_t sig, + const pl_str *vs_str, const pl_str *ps_str, + const pl_str *cs_str) +{ + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + const pl_cache gpu_cache = pl_gpu_cache(gpu); + if (!gpu_cache) + return; + + struct d3d11_cache_header header = { + .hash = sig, + .num_workgroups_used = pass_p->num_workgroups_used, + .num_main_cbvs = pass_p->main.cbvs.num, + .num_main_srvs = pass_p->main.srvs.num, + .num_main_samplers = pass_p->main.samplers.num, + .num_vertex_cbvs = pass_p->vertex.cbvs.num, + .num_vertex_srvs = pass_p->vertex.srvs.num, + .num_vertex_samplers = pass_p->vertex.samplers.num, + .num_uavs = pass_p->uavs.num, + .vert_bc_len = vs_str ? vs_str->len : 0, + .frag_bc_len = ps_str ? ps_str->len : 0, + .comp_bc_len = cs_str ? cs_str->len : 0, + }; + + size_t cache_size = sizeof(header) + cache_payload_size(&header); + pl_str cache = {0}; + pl_str_append(NULL, &cache, (pl_str){ (uint8_t *) &header, sizeof(header) }); + +#define WRITE_ARRAY(name) pl_str_append(NULL, &cache, \ + (pl_str){ (uint8_t *) pass_p->name.elem, \ + sizeof(*pass_p->name.elem) * pass_p->name.num }) + WRITE_ARRAY(main.cbvs); + WRITE_ARRAY(main.srvs); + WRITE_ARRAY(main.samplers); + WRITE_ARRAY(vertex.cbvs); + WRITE_ARRAY(vertex.srvs); + WRITE_ARRAY(vertex.samplers); + WRITE_ARRAY(uavs); + + if (vs_str) + pl_str_append(NULL, &cache, *vs_str); + + if (ps_str) + pl_str_append(NULL, &cache, *ps_str); + + if (cs_str) + pl_str_append(NULL, &cache, *cs_str); + + pl_assert(cache_size == cache.len); + pl_cache_str(gpu_cache, key, &cache); +} + +void pl_d3d11_pass_destroy(pl_gpu gpu, pl_pass pass) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + + SAFE_RELEASE(pass_p->vs); + SAFE_RELEASE(pass_p->ps); + SAFE_RELEASE(pass_p->cs); + SAFE_RELEASE(pass_p->layout); + SAFE_RELEASE(pass_p->bstate); + SAFE_RELEASE(pass_p->num_workgroups_buf); + + pl_d3d11_flush_message_queue(ctx, "After pass destroy"); + + pl_free((void *) pass); +} + +static bool pass_create_raster(pl_gpu gpu, struct pl_pass_t *pass, + const struct pl_pass_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + ID3DBlob *vs_blob = NULL; + pl_str vs_str = {0}; + ID3DBlob *ps_blob = NULL; + pl_str ps_str = {0}; + D3D11_INPUT_ELEMENT_DESC *in_descs = NULL; + pl_cache_obj obj = {0}; + uint64_t sig = 0; + bool success = false; + + if (d3d11_use_cached_program(gpu, pass, params, &obj, &sig, &vs_str, &ps_str, NULL)) + PL_DEBUG(gpu, "Using cached DXBC shaders"); + + pl_assert((vs_str.len == 0) == (ps_str.len == 0)); + if (vs_str.len == 0) { + vs_blob = shader_compile_glsl(gpu, pass, &pass_p->vertex, + GLSL_SHADER_VERTEX, params->vertex_shader); + if (!vs_blob) + goto error; + + vs_str = (pl_str) { + .buf = ID3D10Blob_GetBufferPointer(vs_blob), + .len = ID3D10Blob_GetBufferSize(vs_blob), + }; + + ps_blob = shader_compile_glsl(gpu, pass, &pass_p->main, + GLSL_SHADER_FRAGMENT, params->glsl_shader); + if (!ps_blob) + goto error; + + ps_str = (pl_str) { + .buf = ID3D10Blob_GetBufferPointer(ps_blob), + .len = ID3D10Blob_GetBufferSize(ps_blob), + }; + 
} + + D3D(ID3D11Device_CreateVertexShader(p->dev, vs_str.buf, vs_str.len, NULL, + &pass_p->vs)); + + D3D(ID3D11Device_CreatePixelShader(p->dev, ps_str.buf, ps_str.len, NULL, + &pass_p->ps)); + + in_descs = pl_calloc_ptr(pass, params->num_vertex_attribs, in_descs); + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct pl_vertex_attrib *va = ¶ms->vertex_attribs[i]; + + in_descs[i] = (D3D11_INPUT_ELEMENT_DESC) { + // The semantic name doesn't mean much and is just used to verify + // the input description matches the shader. SPIRV-Cross always + // uses TEXCOORD, so we should too. + .SemanticName = "TEXCOORD", + .SemanticIndex = va->location, + .AlignedByteOffset = va->offset, + .Format = fmt_to_dxgi(va->fmt), + }; + } + D3D(ID3D11Device_CreateInputLayout(p->dev, in_descs, + params->num_vertex_attribs, vs_str.buf, vs_str.len, &pass_p->layout)); + + static const D3D11_BLEND blend_options[] = { + [PL_BLEND_ZERO] = D3D11_BLEND_ZERO, + [PL_BLEND_ONE] = D3D11_BLEND_ONE, + [PL_BLEND_SRC_ALPHA] = D3D11_BLEND_SRC_ALPHA, + [PL_BLEND_ONE_MINUS_SRC_ALPHA] = D3D11_BLEND_INV_SRC_ALPHA, + }; + + D3D11_BLEND_DESC bdesc = { + .RenderTarget[0] = { + .RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL, + }, + }; + if (params->blend_params) { + bdesc.RenderTarget[0] = (D3D11_RENDER_TARGET_BLEND_DESC) { + .BlendEnable = TRUE, + .SrcBlend = blend_options[params->blend_params->src_rgb], + .DestBlend = blend_options[params->blend_params->dst_rgb], + .BlendOp = D3D11_BLEND_OP_ADD, + .SrcBlendAlpha = blend_options[params->blend_params->src_alpha], + .DestBlendAlpha = blend_options[params->blend_params->dst_alpha], + .BlendOpAlpha = D3D11_BLEND_OP_ADD, + .RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL, + }; + } + D3D(ID3D11Device_CreateBlendState(p->dev, &bdesc, &pass_p->bstate)); + + d3d11_update_program_cache(gpu, pass, obj.key, sig, &vs_str, &ps_str, NULL); + + success = true; +error: + SAFE_RELEASE(vs_blob); + SAFE_RELEASE(ps_blob); + pl_cache_obj_free(&obj); + pl_free(in_descs); + return success; +} + +static bool pass_create_compute(pl_gpu gpu, struct pl_pass_t *pass, + const struct pl_pass_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + ID3DBlob *cs_blob = NULL; + pl_str cs_str = {0}; + pl_cache_obj obj = {0}; + uint64_t sig = 0; + bool success = false; + + if (d3d11_use_cached_program(gpu, pass, params, &obj, &sig, NULL, NULL, &cs_str)) + PL_DEBUG(gpu, "Using cached DXBC shader"); + + if (cs_str.len == 0) { + cs_blob = shader_compile_glsl(gpu, pass, &pass_p->main, + GLSL_SHADER_COMPUTE, params->glsl_shader); + if (!cs_blob) + goto error; + + cs_str = (pl_str) { + .buf = ID3D10Blob_GetBufferPointer(cs_blob), + .len = ID3D10Blob_GetBufferSize(cs_blob), + }; + } + + D3D(ID3D11Device_CreateComputeShader(p->dev, cs_str.buf, cs_str.len, NULL, + &pass_p->cs)); + + if (pass_p->num_workgroups_used) { + D3D11_BUFFER_DESC bdesc = { + .BindFlags = D3D11_BIND_CONSTANT_BUFFER, + .ByteWidth = sizeof(pass_p->last_num_wgs), + }; + D3D(ID3D11Device_CreateBuffer(p->dev, &bdesc, NULL, + &pass_p->num_workgroups_buf)); + } + + d3d11_update_program_cache(gpu, pass, obj.key, sig, NULL, NULL, &cs_str); + + success = true; +error: + pl_cache_obj_free(&obj); + SAFE_RELEASE(cs_blob); + return success; +} + +const struct pl_pass_t *pl_d3d11_pass_create(pl_gpu gpu, + const struct pl_pass_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + struct pl_pass_t *pass = 
pl_zalloc_obj(NULL, pass, struct pl_pass_d3d11); + pass->params = pl_pass_params_copy(pass, params); + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + *pass_p = (struct pl_pass_d3d11) { + .max_binding = -1, + }; + + if (params->type == PL_PASS_COMPUTE) { + if (!pass_create_compute(gpu, pass, params)) + goto error; + } else { + if (!pass_create_raster(gpu, pass, params)) + goto error; + } + + // Pre-allocate resource arrays to use in pl_pass_run + pass_p->cbv_arr = pl_calloc(pass, + PL_MAX(pass_p->main.cbvs.num, pass_p->vertex.cbvs.num), + sizeof(*pass_p->cbv_arr)); + pass_p->srv_arr = pl_calloc(pass, + PL_MAX(pass_p->main.srvs.num, pass_p->vertex.srvs.num), + sizeof(*pass_p->srv_arr)); + pass_p->sampler_arr = pl_calloc(pass, + PL_MAX(pass_p->main.samplers.num, pass_p->vertex.samplers.num), + sizeof(*pass_p->sampler_arr)); + pass_p->uav_arr = pl_calloc(pass, pass_p->uavs.num, sizeof(*pass_p->uav_arr)); + + // Find the highest binding number used in `params->descriptors` if we + // haven't found it already. (If the shader was compiled fresh rather than + // loaded from cache, `pass_p->max_binding` should already be set.) + if (pass_p->max_binding == -1) { + for (int i = 0; i < params->num_descriptors; i++) { + pass_p->max_binding = PL_MAX(pass_p->max_binding, + params->descriptors[i].binding); + } + } + + // Build a mapping from binding numbers to descriptor array indexes + int *binding_map = pl_calloc_ptr(pass, pass_p->max_binding + 1, binding_map); + for (int i = 0; i <= pass_p->max_binding; i++) + binding_map[i] = HLSL_BINDING_NOT_USED; + for (int i = 0; i < params->num_descriptors; i++) + binding_map[params->descriptors[i].binding] = i; + +#define MAP_RESOURCES(array) \ + do { \ + for (int i = 0; i < array.num; i++) { \ + if (array.elem[i] > pass_p->max_binding) { \ + array.elem[i] = HLSL_BINDING_NOT_USED; \ + } else if (array.elem[i] >= 0) { \ + array.elem[i] = binding_map[array.elem[i]]; \ + } \ + } \ + } while (0) + + // During shader compilation (or after loading a compiled shader from cache) + // the entries of the following resource lists are shader binding numbers, + // however, it's more efficient for `pl_pass_run` if they refer to indexes + // of the `params->descriptors` array instead, so remap them here + MAP_RESOURCES(pass_p->main.cbvs); + MAP_RESOURCES(pass_p->main.samplers); + MAP_RESOURCES(pass_p->main.srvs); + MAP_RESOURCES(pass_p->vertex.cbvs); + MAP_RESOURCES(pass_p->vertex.samplers); + MAP_RESOURCES(pass_p->vertex.srvs); + MAP_RESOURCES(pass_p->uavs); + pl_free(binding_map); + + pl_d3d11_flush_message_queue(ctx, "After pass create"); + + return pass; + +error: + pl_d3d11_pass_destroy(gpu, pass); + return NULL; +} + +// Shared logic between VS, PS and CS for filling the resource arrays that are +// passed to ID3D11DeviceContext methods +static void fill_resources(pl_gpu gpu, pl_pass pass, + struct d3d_pass_stage *pass_s, + const struct pl_pass_run_params *params, + ID3D11Buffer **cbvs, ID3D11ShaderResourceView **srvs, + ID3D11SamplerState **samplers) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + + for (int i = 0; i < pass_s->cbvs.num; i++) { + int binding = pass_s->cbvs.elem[i]; + if (binding == HLSL_BINDING_NUM_WORKGROUPS) { + cbvs[i] = pass_p->num_workgroups_buf; + continue; + } else if (binding < 0) { + cbvs[i] = NULL; + continue; + } + + pl_buf buf = params->desc_bindings[binding].object; + pl_d3d11_buf_resolve(gpu, buf); + struct pl_buf_d3d11 *buf_p = PL_PRIV(buf); + cbvs[i] = buf_p->buf; + } + + for (int i = 0; i < 
pass_s->srvs.num; i++) {
+        int binding = pass_s->srvs.elem[i];
+        if (binding < 0) {
+            srvs[i] = NULL;
+            continue;
+        }
+
+        pl_tex tex;
+        struct pl_tex_d3d11 *tex_p;
+        pl_buf buf;
+        struct pl_buf_d3d11 *buf_p;
+        switch (pass->params.descriptors[binding].type) {
+        case PL_DESC_SAMPLED_TEX:
+        case PL_DESC_STORAGE_IMG:
+            tex = params->desc_bindings[binding].object;
+            tex_p = PL_PRIV(tex);
+            srvs[i] = tex_p->srv;
+            break;
+        case PL_DESC_BUF_STORAGE:
+            buf = params->desc_bindings[binding].object;
+            buf_p = PL_PRIV(buf);
+            srvs[i] = buf_p->raw_srv;
+            break;
+        case PL_DESC_BUF_TEXEL_UNIFORM:
+        case PL_DESC_BUF_TEXEL_STORAGE:
+            buf = params->desc_bindings[binding].object;
+            buf_p = PL_PRIV(buf);
+            srvs[i] = buf_p->texel_srv;
+            break;
+        default:
+            break;
+        }
+    }
+
+    for (int i = 0; i < pass_s->samplers.num; i++) {
+        int binding = pass_s->samplers.elem[i];
+        if (binding < 0) {
+            samplers[i] = NULL;
+            continue;
+        }
+
+        struct pl_desc_binding *db = &params->desc_bindings[binding];
+        samplers[i] = p->samplers[db->sample_mode][db->address_mode];
+    }
+}
+
+static void fill_uavs(pl_pass pass, const struct pl_pass_run_params *params,
+                      ID3D11UnorderedAccessView **uavs)
+{
+    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+
+    for (int i = 0; i < pass_p->uavs.num; i++) {
+        int binding = pass_p->uavs.elem[i];
+        if (binding < 0) {
+            uavs[i] = NULL;
+            continue;
+        }
+
+        pl_tex tex;
+        struct pl_tex_d3d11 *tex_p;
+        pl_buf buf;
+        struct pl_buf_d3d11 *buf_p;
+        switch (pass->params.descriptors[binding].type) {
+        case PL_DESC_BUF_STORAGE:
+            buf = params->desc_bindings[binding].object;
+            buf_p = PL_PRIV(buf);
+            uavs[i] = buf_p->raw_uav;
+            break;
+        case PL_DESC_STORAGE_IMG:
+            tex = params->desc_bindings[binding].object;
+            tex_p = PL_PRIV(tex);
+            uavs[i] = tex_p->uav;
+            break;
+        case PL_DESC_BUF_TEXEL_STORAGE:
+            buf = params->desc_bindings[binding].object;
+            buf_p = PL_PRIV(buf);
+            uavs[i] = buf_p->texel_uav;
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+static void pass_run_raster(pl_gpu gpu, const struct pl_pass_run_params *params)
+{
+    struct pl_gpu_d3d11 *p = PL_PRIV(gpu);
+    pl_pass pass = params->pass;
+    struct pl_pass_d3d11 *pass_p = PL_PRIV(pass);
+
+    if (p->fl <= D3D_FEATURE_LEVEL_9_3 && params->index_buf) {
+        // Index buffers are unsupported because we can't tell if they are an
+        // index buffer or a vertex buffer on creation, and FL9_x allows only
+        // one binding type per-buffer
+        PL_ERR(gpu, "Index buffers are unsupported in FL9_x");
+        return;
+    }
+
+    if (p->fl <= D3D_FEATURE_LEVEL_9_1 && params->index_data &&
+        params->index_fmt != PL_INDEX_UINT16)
+    {
+        PL_ERR(gpu, "32-bit index format is unsupported in FL9_1");
+        return;
+    }
+
+    // Figure out how much vertex/index data to upload, if any
+    size_t vertex_alloc = params->vertex_data ? pl_vertex_buf_size(params) : 0;
+    size_t index_alloc = params->index_data ? pl_index_buf_size(params) : 0;
+
+    static const DXGI_FORMAT index_fmts[PL_INDEX_FORMAT_COUNT] = {
+        [PL_INDEX_UINT16] = DXGI_FORMAT_R16_UINT,
+        [PL_INDEX_UINT32] = DXGI_FORMAT_R32_UINT,
+    };
+
+    // Upload vertex data. On >=FL10_0 we use the same buffer for index data, so
+    // upload that too.
+    bool share_vertex_index_buf = p->fl > D3D_FEATURE_LEVEL_9_3;
+    if (vertex_alloc || (share_vertex_index_buf && index_alloc)) {
+        struct stream_buf_slice slices[] = {
+            { .data = params->vertex_data, .size = vertex_alloc },
+            { .data = params->index_data, .size = index_alloc },
+        };
+
+        if (!stream_buf_upload(gpu, &p->vbuf, slices,
+                               share_vertex_index_buf ?
2 : 1)) { + PL_ERR(gpu, "Failed to upload vertex data"); + return; + } + + if (vertex_alloc) { + ID3D11DeviceContext_IASetVertexBuffers(p->imm, 0, 1, &p->vbuf.buf, + &(UINT) { pass->params.vertex_stride }, &slices[0].offset); + } + if (share_vertex_index_buf && index_alloc) { + ID3D11DeviceContext_IASetIndexBuffer(p->imm, p->vbuf.buf, + index_fmts[params->index_fmt], slices[1].offset); + } + } + + // Upload index data for <=FL9_3, which must be in its own buffer + if (!share_vertex_index_buf && index_alloc) { + struct stream_buf_slice slices[] = { + { .data = params->index_data, .size = index_alloc }, + }; + + if (!stream_buf_upload(gpu, &p->ibuf, slices, PL_ARRAY_SIZE(slices))) { + PL_ERR(gpu, "Failed to upload index data"); + return; + } + + ID3D11DeviceContext_IASetIndexBuffer(p->imm, p->ibuf.buf, + index_fmts[params->index_fmt], slices[0].offset); + } + + if (params->vertex_buf) { + struct pl_buf_d3d11 *buf_p = PL_PRIV(params->vertex_buf); + ID3D11DeviceContext_IASetVertexBuffers(p->imm, 0, 1, &buf_p->buf, + &(UINT) { pass->params.vertex_stride }, + &(UINT) { params->buf_offset }); + } + + if (params->index_buf) { + struct pl_buf_d3d11 *buf_p = PL_PRIV(params->index_buf); + ID3D11DeviceContext_IASetIndexBuffer(p->imm, buf_p->buf, + index_fmts[params->index_fmt], params->index_offset); + } + + ID3D11DeviceContext_IASetInputLayout(p->imm, pass_p->layout); + + static const D3D_PRIMITIVE_TOPOLOGY prim_topology[] = { + [PL_PRIM_TRIANGLE_LIST] = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, + [PL_PRIM_TRIANGLE_STRIP] = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP, + }; + ID3D11DeviceContext_IASetPrimitiveTopology(p->imm, + prim_topology[pass->params.vertex_type]); + + ID3D11DeviceContext_VSSetShader(p->imm, pass_p->vs, NULL, 0); + + ID3D11Buffer **cbvs = pass_p->cbv_arr; + ID3D11ShaderResourceView **srvs = pass_p->srv_arr; + ID3D11SamplerState **samplers = pass_p->sampler_arr; + ID3D11UnorderedAccessView **uavs = pass_p->uav_arr; + + // Set vertex shader resources. The device context is called conditionally + // because the debug layer complains if these are called with 0 resources. 
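+    // (Annotation: cbvs/srvs/samplers/uavs point into the scratch arrays
+    // pre-allocated in pl_d3d11_pass_create, which are sized with PL_MAX over
+    // the vertex and main stages, so the same storage is reused for both
+    // stages within this single pass run.)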
+ fill_resources(gpu, pass, &pass_p->vertex, params, cbvs, srvs, samplers); + if (pass_p->vertex.cbvs.num) + ID3D11DeviceContext_VSSetConstantBuffers(p->imm, 0, pass_p->vertex.cbvs.num, cbvs); + if (pass_p->vertex.srvs.num) + ID3D11DeviceContext_VSSetShaderResources(p->imm, 0, pass_p->vertex.srvs.num, srvs); + if (pass_p->vertex.samplers.num) + ID3D11DeviceContext_VSSetSamplers(p->imm, 0, pass_p->vertex.samplers.num, samplers); + + ID3D11DeviceContext_RSSetState(p->imm, p->rstate); + ID3D11DeviceContext_RSSetViewports(p->imm, 1, (&(D3D11_VIEWPORT) { + .TopLeftX = params->viewport.x0, + .TopLeftY = params->viewport.y0, + .Width = pl_rect_w(params->viewport), + .Height = pl_rect_h(params->viewport), + .MinDepth = 0, + .MaxDepth = 1, + })); + ID3D11DeviceContext_RSSetScissorRects(p->imm, 1, (&(D3D11_RECT) { + .left = params->scissors.x0, + .top = params->scissors.y0, + .right = params->scissors.x1, + .bottom = params->scissors.y1, + })); + + ID3D11DeviceContext_PSSetShader(p->imm, pass_p->ps, NULL, 0); + + // Set pixel shader resources + fill_resources(gpu, pass, &pass_p->main, params, cbvs, srvs, samplers); + if (pass_p->main.cbvs.num) + ID3D11DeviceContext_PSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs); + if (pass_p->main.srvs.num) + ID3D11DeviceContext_PSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs); + if (pass_p->main.samplers.num) + ID3D11DeviceContext_PSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers); + + ID3D11DeviceContext_OMSetBlendState(p->imm, pass_p->bstate, NULL, + D3D11_DEFAULT_SAMPLE_MASK); + ID3D11DeviceContext_OMSetDepthStencilState(p->imm, p->dsstate, 0); + + fill_uavs(pass, params, uavs); + + struct pl_tex_d3d11 *target_p = PL_PRIV(params->target); + ID3D11DeviceContext_OMSetRenderTargetsAndUnorderedAccessViews( + p->imm, 1, &target_p->rtv, NULL, 1, pass_p->uavs.num, uavs, NULL); + + if (params->index_data || params->index_buf) { + ID3D11DeviceContext_DrawIndexed(p->imm, params->vertex_count, 0, 0); + } else { + ID3D11DeviceContext_Draw(p->imm, params->vertex_count, 0); + } + + // Unbind everything. It's easier to do this than to actually track state, + // and if we leave the RTV bound, it could trip up D3D's conflict checker. + // Also, apparently unbinding SRVs can prevent a 10level9 bug? 
+ // https://docs.microsoft.com/en-us/windows/win32/direct3d11/overviews-direct3d-11-devices-downlevel-prevent-null-srvs + for (int i = 0; i < PL_MAX(pass_p->main.cbvs.num, pass_p->vertex.cbvs.num); i++) + cbvs[i] = NULL; + for (int i = 0; i < PL_MAX(pass_p->main.srvs.num, pass_p->vertex.srvs.num); i++) + srvs[i] = NULL; + for (int i = 0; i < PL_MAX(pass_p->main.samplers.num, pass_p->vertex.samplers.num); i++) + samplers[i] = NULL; + for (int i = 0; i < pass_p->uavs.num; i++) + uavs[i] = NULL; + if (pass_p->vertex.cbvs.num) + ID3D11DeviceContext_VSSetConstantBuffers(p->imm, 0, pass_p->vertex.cbvs.num, cbvs); + if (pass_p->vertex.srvs.num) + ID3D11DeviceContext_VSSetShaderResources(p->imm, 0, pass_p->vertex.srvs.num, srvs); + if (pass_p->vertex.samplers.num) + ID3D11DeviceContext_VSSetSamplers(p->imm, 0, pass_p->vertex.samplers.num, samplers); + if (pass_p->main.cbvs.num) + ID3D11DeviceContext_PSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs); + if (pass_p->main.srvs.num) + ID3D11DeviceContext_PSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs); + if (pass_p->main.samplers.num) + ID3D11DeviceContext_PSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers); + ID3D11DeviceContext_OMSetRenderTargetsAndUnorderedAccessViews( + p->imm, 0, NULL, NULL, 1, pass_p->uavs.num, uavs, NULL); +} + +static void pass_run_compute(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + pl_pass pass = params->pass; + struct pl_pass_d3d11 *pass_p = PL_PRIV(pass); + + // Update gl_NumWorkGroups emulation buffer if necessary + if (pass_p->num_workgroups_used) { + bool needs_update = false; + for (int i = 0; i < 3; i++) { + if (pass_p->last_num_wgs.num_wgs[i] != params->compute_groups[i]) + needs_update = true; + pass_p->last_num_wgs.num_wgs[i] = params->compute_groups[i]; + } + + if (needs_update) { + ID3D11DeviceContext_UpdateSubresource(p->imm, + (ID3D11Resource *) pass_p->num_workgroups_buf, 0, NULL, + &pass_p->last_num_wgs, 0, 0); + } + } + + ID3D11DeviceContext_CSSetShader(p->imm, pass_p->cs, NULL, 0); + + ID3D11Buffer **cbvs = pass_p->cbv_arr; + ID3D11ShaderResourceView **srvs = pass_p->srv_arr; + ID3D11UnorderedAccessView **uavs = pass_p->uav_arr; + ID3D11SamplerState **samplers = pass_p->sampler_arr; + + fill_resources(gpu, pass, &pass_p->main, params, cbvs, srvs, samplers); + fill_uavs(pass, params, uavs); + + if (pass_p->main.cbvs.num) + ID3D11DeviceContext_CSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs); + if (pass_p->main.srvs.num) + ID3D11DeviceContext_CSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs); + if (pass_p->main.samplers.num) + ID3D11DeviceContext_CSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers); + if (pass_p->uavs.num) + ID3D11DeviceContext_CSSetUnorderedAccessViews(p->imm, 0, pass_p->uavs.num, uavs, NULL); + + ID3D11DeviceContext_Dispatch(p->imm, params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + + // Unbind everything + for (int i = 0; i < pass_p->main.cbvs.num; i++) + cbvs[i] = NULL; + for (int i = 0; i < pass_p->main.srvs.num; i++) + srvs[i] = NULL; + for (int i = 0; i < pass_p->main.samplers.num; i++) + samplers[i] = NULL; + for (int i = 0; i < pass_p->uavs.num; i++) + uavs[i] = NULL; + if (pass_p->main.cbvs.num) + ID3D11DeviceContext_CSSetConstantBuffers(p->imm, 0, pass_p->main.cbvs.num, cbvs); + if (pass_p->main.srvs.num) + ID3D11DeviceContext_CSSetShaderResources(p->imm, 0, pass_p->main.srvs.num, srvs); + if (pass_p->main.samplers.num) + 
ID3D11DeviceContext_CSSetSamplers(p->imm, 0, pass_p->main.samplers.num, samplers); + if (pass_p->uavs.num) + ID3D11DeviceContext_CSSetUnorderedAccessViews(p->imm, 0, pass_p->uavs.num, uavs, NULL); +} + +void pl_d3d11_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + pl_pass pass = params->pass; + + pl_d3d11_timer_start(gpu, params->timer); + + if (pass->params.type == PL_PASS_COMPUTE) { + pass_run_compute(gpu, params); + } else { + pass_run_raster(gpu, params); + } + + pl_d3d11_timer_end(gpu, params->timer); + pl_d3d11_flush_message_queue(ctx, "After pass run"); +} diff --git a/src/d3d11/gpu_tex.c b/src/d3d11/gpu_tex.c new file mode 100644 index 0000000..d63fc17 --- /dev/null +++ b/src/d3d11/gpu_tex.c @@ -0,0 +1,745 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "formats.h" + +static inline UINT tex_subresource(pl_tex tex) +{ + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + return tex_p->array_slice >= 0 ? tex_p->array_slice : 0; +} + +static bool tex_init(pl_gpu gpu, pl_tex tex) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + // View formats may be omitted when they match the texture format, but for + // simplicity's sake we always set it. It will match the texture format for + // textures created with tex_create, but it can be different for video + // textures wrapped with pl_d3d11_wrap. 
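+    // (Illustrative example, not exhaustive: a D3D11 video decoder surface is
+    // typically a DXGI_FORMAT_NV12 Texture2D array, and pl_d3d11_wrap can be
+    // given an R8_UNORM or R8G8_UNORM view format via pl_d3d11_wrap_params.fmt
+    // for the luma/chroma planes, in which case the view format set here
+    // intentionally differs from the underlying resource format.)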
+ DXGI_FORMAT fmt = fmt_to_dxgi(tex->params.format); + + if (tex->params.sampleable || tex->params.storable) { + D3D11_SHADER_RESOURCE_VIEW_DESC srvdesc = { + .Format = fmt, + }; + switch (pl_tex_params_dimension(tex->params)) { + case 1: + if (tex_p->array_slice >= 0) { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1DARRAY; + srvdesc.Texture1DArray.MipLevels = 1; + srvdesc.Texture1DArray.FirstArraySlice = tex_p->array_slice; + srvdesc.Texture1DArray.ArraySize = 1; + } else { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D; + srvdesc.Texture1D.MipLevels = 1; + } + break; + case 2: + if (tex_p->array_slice >= 0) { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2DARRAY; + srvdesc.Texture2DArray.MipLevels = 1; + srvdesc.Texture2DArray.FirstArraySlice = tex_p->array_slice; + srvdesc.Texture2DArray.ArraySize = 1; + } else { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + srvdesc.Texture2D.MipLevels = 1; + } + break; + case 3: + // D3D11 does not have Texture3D arrays + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D; + srvdesc.Texture3D.MipLevels = 1; + break; + } + D3D(ID3D11Device_CreateShaderResourceView(p->dev, tex_p->res, &srvdesc, + &tex_p->srv)); + } + + if (tex->params.renderable) { + D3D11_RENDER_TARGET_VIEW_DESC rtvdesc = { + .Format = fmt, + }; + switch (pl_tex_params_dimension(tex->params)) { + case 1: + if (tex_p->array_slice >= 0) { + rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE1DARRAY; + rtvdesc.Texture1DArray.FirstArraySlice = tex_p->array_slice; + rtvdesc.Texture1DArray.ArraySize = 1; + } else { + rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE1D; + } + break; + case 2: + if (tex_p->array_slice >= 0) { + rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2DARRAY; + rtvdesc.Texture2DArray.FirstArraySlice = tex_p->array_slice; + rtvdesc.Texture2DArray.ArraySize = 1; + } else { + rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; + } + break; + case 3: + // D3D11 does not have Texture3D arrays + rtvdesc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE3D; + rtvdesc.Texture3D.WSize = -1; + break; + } + D3D(ID3D11Device_CreateRenderTargetView(p->dev, tex_p->res, &rtvdesc, + &tex_p->rtv)); + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0 && tex->params.storable) { + D3D11_UNORDERED_ACCESS_VIEW_DESC uavdesc = { + .Format = fmt, + }; + switch (pl_tex_params_dimension(tex->params)) { + case 1: + if (tex_p->array_slice >= 0) { + uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE1DARRAY; + uavdesc.Texture1DArray.FirstArraySlice = tex_p->array_slice; + uavdesc.Texture1DArray.ArraySize = 1; + } else { + uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE1D; + } + break; + case 2: + if (tex_p->array_slice >= 0) { + uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2DARRAY; + uavdesc.Texture2DArray.FirstArraySlice = tex_p->array_slice; + uavdesc.Texture2DArray.ArraySize = 1; + } else { + uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D; + } + break; + case 3: + // D3D11 does not have Texture3D arrays + uavdesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE3D; + uavdesc.Texture3D.WSize = -1; + break; + } + D3D(ID3D11Device_CreateUnorderedAccessView(p->dev, tex_p->res, &uavdesc, + &tex_p->uav)); + } + + return true; +error: + return false; +} + +void pl_d3d11_tex_destroy(pl_gpu gpu, pl_tex tex) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + SAFE_RELEASE(tex_p->srv); + SAFE_RELEASE(tex_p->rtv); + SAFE_RELEASE(tex_p->uav); + SAFE_RELEASE(tex_p->res); + 
SAFE_RELEASE(tex_p->staging); + + pl_d3d11_flush_message_queue(ctx, "After texture destroy"); + + pl_free((void *) tex); +} + +pl_tex pl_d3d11_tex_create(pl_gpu gpu, const struct pl_tex_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_d3d11); + tex->params = *params; + tex->params.initial_data = NULL; + tex->sampler_type = PL_SAMPLER_NORMAL; + + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + DXGI_FORMAT dxfmt = fmt_to_dxgi(params->format); + + D3D11_USAGE usage = D3D11_USAGE_DEFAULT; + D3D11_BIND_FLAG bind_flags = 0; + + if (params->format->emulated) { + tex_p->texel_fmt = pl_find_fmt(gpu, params->format->type, 1, 0, + params->format->host_bits[0], + PL_FMT_CAP_TEXEL_UNIFORM); + + if (!tex_p->texel_fmt) { + PL_ERR(gpu, "Failed picking texel format for emulated texture!"); + goto error; + } + + tex->params.storable = true; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + // On >=FL11_0, blit emulation needs image storage + tex->params.storable |= params->blit_src || params->blit_dst; + + // Blit emulation can use a sampler for linear filtering during stretch + if ((tex->params.format->caps & PL_FMT_CAP_LINEAR) && params->blit_src) + tex->params.sampleable = true; + } else { + // On <FL11_0, blit emulation uses a render pass + tex->params.sampleable |= params->blit_src; + tex->params.renderable |= params->blit_dst; + } + + if (tex->params.sampleable) + bind_flags |= D3D11_BIND_SHADER_RESOURCE; + if (tex->params.renderable) + bind_flags |= D3D11_BIND_RENDER_TARGET; + if (p->fl >= D3D_FEATURE_LEVEL_11_0 && tex->params.storable) + bind_flags |= D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS; + + // Apparently IMMUTABLE textures are efficient, so try to infer whether we + // can use one + if (params->initial_data && !params->format->emulated && + !tex->params.renderable && !tex->params.storable && !params->host_writable) + { + usage = D3D11_USAGE_IMMUTABLE; + } + + // In FL9_x, resources with only D3D11_BIND_SHADER_RESOURCE can't be copied + // from GPU-accessible memory to CPU-accessible memory. The only other bind + // flag we set on this FL is D3D11_BIND_RENDER_TARGET, so set it. 
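+    // (Annotation: pl_d3d11_tex_download further below relies on this, since
+    // it copies the texture into the CPU-accessible STAGING copy created later
+    // in this function and then maps it with D3D11_MAP_READ.)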
+ if (p->fl <= D3D_FEATURE_LEVEL_9_3 && tex->params.host_readable) + bind_flags |= D3D11_BIND_RENDER_TARGET; + + // In FL9_x, when using DEFAULT or IMMUTABLE, BindFlags cannot be zero + if (p->fl <= D3D_FEATURE_LEVEL_9_3 && !bind_flags) + bind_flags |= D3D11_BIND_SHADER_RESOURCE; + + D3D11_SUBRESOURCE_DATA data; + D3D11_SUBRESOURCE_DATA *pdata = NULL; + if (params->initial_data && !params->format->emulated) { + data = (D3D11_SUBRESOURCE_DATA) { + .pSysMem = params->initial_data, + .SysMemPitch = params->w * params->format->texel_size, + }; + if (params->d) + data.SysMemSlicePitch = data.SysMemPitch * params->h; + pdata = &data; + } + + switch (pl_tex_params_dimension(*params)) { + case 1:; + D3D11_TEXTURE1D_DESC desc1d = { + .Width = params->w, + .MipLevels = 1, + .ArraySize = 1, + .Format = dxfmt, + .Usage = usage, + .BindFlags = bind_flags, + }; + D3D(ID3D11Device_CreateTexture1D(p->dev, &desc1d, pdata, &tex_p->tex1d)); + tex_p->res = (ID3D11Resource *)tex_p->tex1d; + + // Create a staging texture with CPU access for pl_tex_download() + if (params->host_readable) { + desc1d.BindFlags = 0; + desc1d.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc1d.Usage = D3D11_USAGE_STAGING; + + D3D(ID3D11Device_CreateTexture1D(p->dev, &desc1d, NULL, + &tex_p->staging1d)); + tex_p->staging = (ID3D11Resource *) tex_p->staging1d; + } + break; + case 2:; + D3D11_TEXTURE2D_DESC desc2d = { + .Width = params->w, + .Height = params->h, + .MipLevels = 1, + .ArraySize = 1, + .SampleDesc.Count = 1, + .Format = dxfmt, + .Usage = usage, + .BindFlags = bind_flags, + }; + D3D(ID3D11Device_CreateTexture2D(p->dev, &desc2d, pdata, &tex_p->tex2d)); + tex_p->res = (ID3D11Resource *)tex_p->tex2d; + + // Create a staging texture with CPU access for pl_tex_download() + if (params->host_readable) { + desc2d.BindFlags = 0; + desc2d.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc2d.Usage = D3D11_USAGE_STAGING; + + D3D(ID3D11Device_CreateTexture2D(p->dev, &desc2d, NULL, + &tex_p->staging2d)); + tex_p->staging = (ID3D11Resource *) tex_p->staging2d; + } + break; + case 3:; + D3D11_TEXTURE3D_DESC desc3d = { + .Width = params->w, + .Height = params->h, + .Depth = params->d, + .MipLevels = 1, + .Format = dxfmt, + .Usage = usage, + .BindFlags = bind_flags, + }; + D3D(ID3D11Device_CreateTexture3D(p->dev, &desc3d, pdata, &tex_p->tex3d)); + tex_p->res = (ID3D11Resource *)tex_p->tex3d; + + // Create a staging texture with CPU access for pl_tex_download() + if (params->host_readable) { + desc3d.BindFlags = 0; + desc3d.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + desc3d.Usage = D3D11_USAGE_STAGING; + + D3D(ID3D11Device_CreateTexture3D(p->dev, &desc3d, NULL, + &tex_p->staging3d)); + tex_p->staging = (ID3D11Resource *) tex_p->staging3d; + } + break; + default: + pl_unreachable(); + } + + tex_p->array_slice = -1; + + if (!tex_init(gpu, tex)) + goto error; + + if (params->initial_data && params->format->emulated) { + struct pl_tex_transfer_params ul_params = { + .tex = tex, + .ptr = (void *) params->initial_data, + .rc = { 0, 0, 0, params->w, params->h, params->d }, + }; + + // Since we re-use GPU helpers which require writable images, just fake it + bool writable = tex->params.host_writable; + tex->params.host_writable = true; + if (!pl_tex_upload(gpu, &ul_params)) + goto error; + tex->params.host_writable = writable; + } + + pl_d3d11_flush_message_queue(ctx, "After texture create"); + + return tex; + +error: + pl_d3d11_tex_destroy(gpu, tex); + return NULL; +} + +pl_tex pl_d3d11_wrap(pl_gpu gpu, const struct pl_d3d11_wrap_params *params) +{ + struct 
pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_d3d11); + tex->sampler_type = PL_SAMPLER_NORMAL; + + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + DXGI_FORMAT fmt = DXGI_FORMAT_UNKNOWN; + D3D11_USAGE usage = D3D11_USAGE_DEFAULT; + D3D11_BIND_FLAG bind_flags = 0; + UINT mip_levels = 1; + UINT array_size = 1; + UINT sample_count = 1; + + D3D11_RESOURCE_DIMENSION type; + ID3D11Resource_GetType(params->tex, &type); + + switch (type) { + case D3D11_RESOURCE_DIMENSION_TEXTURE1D: + D3D(ID3D11Resource_QueryInterface(params->tex, &IID_ID3D11Texture1D, + (void **) &tex_p->tex1d)); + tex_p->res = (ID3D11Resource *) tex_p->tex1d; + + D3D11_TEXTURE1D_DESC desc1d; + ID3D11Texture1D_GetDesc(tex_p->tex1d, &desc1d); + + tex->params.w = desc1d.Width; + mip_levels = desc1d.MipLevels; + array_size = desc1d.ArraySize; + fmt = desc1d.Format; + usage = desc1d.Usage; + bind_flags = desc1d.BindFlags; + break; + + case D3D11_RESOURCE_DIMENSION_TEXTURE2D: + D3D(ID3D11Resource_QueryInterface(params->tex, &IID_ID3D11Texture2D, + (void **) &tex_p->tex2d)); + tex_p->res = (ID3D11Resource *) tex_p->tex2d; + + D3D11_TEXTURE2D_DESC desc2d; + ID3D11Texture2D_GetDesc(tex_p->tex2d, &desc2d); + + tex->params.w = desc2d.Width; + tex->params.h = desc2d.Height; + mip_levels = desc2d.MipLevels; + array_size = desc2d.ArraySize; + fmt = desc2d.Format; + sample_count = desc2d.SampleDesc.Count; + usage = desc2d.Usage; + bind_flags = desc2d.BindFlags; + + // Allow the format and size of 2D textures to be overridden to support + // shader views of video resources + if (params->fmt) { + fmt = params->fmt; + tex->params.w = params->w; + tex->params.h = params->h; + } + + break; + + case D3D11_RESOURCE_DIMENSION_TEXTURE3D: + D3D(ID3D11Resource_QueryInterface(params->tex, &IID_ID3D11Texture3D, + (void **) &tex_p->tex3d)); + tex_p->res = (ID3D11Resource *) tex_p->tex3d; + + D3D11_TEXTURE3D_DESC desc3d; + ID3D11Texture3D_GetDesc(tex_p->tex3d, &desc3d); + + tex->params.w = desc3d.Width; + tex->params.h = desc3d.Height; + tex->params.d = desc3d.Depth; + mip_levels = desc3d.MipLevels; + fmt = desc3d.Format; + usage = desc3d.Usage; + bind_flags = desc3d.BindFlags; + break; + + case D3D11_RESOURCE_DIMENSION_UNKNOWN: + case D3D11_RESOURCE_DIMENSION_BUFFER: + PL_ERR(gpu, "Resource is not suitable to wrap"); + goto error; + } + + if (mip_levels != 1) { + PL_ERR(gpu, "Mipmapped textures not supported for wrapping"); + goto error; + } + if (sample_count != 1) { + PL_ERR(gpu, "Multisampled textures not supported for wrapping"); + goto error; + } + if (usage != D3D11_USAGE_DEFAULT) { + PL_ERR(gpu, "Resource is not D3D11_USAGE_DEFAULT"); + goto error; + } + + if (array_size > 1) { + if (params->array_slice < 0 || params->array_slice >= array_size) { + PL_ERR(gpu, "array_slice out of range"); + goto error; + } + tex_p->array_slice = params->array_slice; + } else { + tex_p->array_slice = -1; + } + + if (bind_flags & D3D11_BIND_SHADER_RESOURCE) { + tex->params.sampleable = true; + + // Blit emulation uses a render pass on <FL11_0 + if (p->fl < D3D_FEATURE_LEVEL_11_0) + tex->params.blit_src = true; + } + if (bind_flags & D3D11_BIND_RENDER_TARGET) { + tex->params.renderable = true; + + // Blit emulation uses a render pass on <FL11_0 + if (p->fl < D3D_FEATURE_LEVEL_11_0) + tex->params.blit_dst = true; + } + static const D3D11_BIND_FLAG storable_flags = + D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE; + if ((bind_flags & storable_flags) == storable_flags) { + 
tex->params.storable = true; + + // Blit emulation uses image storage on >=FL11_0. A feature level check + // isn't required because <FL11_0 doesn't have storable images. + tex->params.blit_src = tex->params.blit_dst = true; + } + + for (int i = 0; i < gpu->num_formats; i++) { + DXGI_FORMAT target_fmt = fmt_to_dxgi(gpu->formats[i]); + if (fmt == target_fmt) { + tex->params.format = gpu->formats[i]; + break; + } + } + if (!tex->params.format) { + PL_ERR(gpu, "Could not find a suitable pl_fmt for wrapped resource"); + goto error; + } + + if (!tex_init(gpu, tex)) + goto error; + + pl_d3d11_flush_message_queue(ctx, "After texture wrap"); + + return tex; + +error: + pl_d3d11_tex_destroy(gpu, tex); + return NULL; +} + +void pl_d3d11_tex_invalidate(pl_gpu gpu, pl_tex tex) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + // Resource discarding requires D3D11.1 + if (!p->imm1) + return; + + // Prefer discarding a view to discarding the whole resource. The reason + // for this is that a pl_tex can refer to a single member of a texture + // array. Discarding the SRV, RTV or UAV should only discard that member. + if (tex_p->rtv) { + ID3D11DeviceContext1_DiscardView(p->imm1, (ID3D11View *) tex_p->rtv); + } else if (tex_p->uav) { + ID3D11DeviceContext1_DiscardView(p->imm1, (ID3D11View *) tex_p->uav); + } else if (tex_p->srv) { + ID3D11DeviceContext1_DiscardView(p->imm1, (ID3D11View *) tex_p->srv); + } else if (tex_p->array_slice < 0) { + // If there are no views, only discard if the ID3D11Resource is not a + // texture array + ID3D11DeviceContext1_DiscardResource(p->imm1, tex_p->res); + } + + pl_d3d11_flush_message_queue(ctx, "After texture invalidate"); +} + +void pl_d3d11_tex_clear_ex(pl_gpu gpu, pl_tex tex, + const union pl_clear_color color) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + + if (tex->params.format->type == PL_FMT_UINT) { + if (tex_p->uav) { + ID3D11DeviceContext_ClearUnorderedAccessViewUint(p->imm, tex_p->uav, + color.u); + } else { + float c[4] = { color.u[0], color.u[1], color.u[2], color.u[3] }; + ID3D11DeviceContext_ClearRenderTargetView(p->imm, tex_p->rtv, c); + } + + } else if (tex->params.format->type == PL_FMT_SINT) { + if (tex_p->uav) { + ID3D11DeviceContext_ClearUnorderedAccessViewUint(p->imm, tex_p->uav, + (const uint32_t *)color.i); + } else { + float c[4] = { color.i[0], color.i[1], color.i[2], color.i[3] }; + ID3D11DeviceContext_ClearRenderTargetView(p->imm, tex_p->rtv, c); + } + + } else if (tex_p->rtv) { + ID3D11DeviceContext_ClearRenderTargetView(p->imm, tex_p->rtv, color.f); + } else { + ID3D11DeviceContext_ClearUnorderedAccessViewFloat(p->imm, tex_p->uav, color.f); + } + + pl_d3d11_flush_message_queue(ctx, "After texture clear"); +} + +#define pl_rect3d_to_box(rc) \ + ((D3D11_BOX) { \ + .left = rc.x0, .top = rc.y0, .front = rc.z0, \ + .right = rc.x1, .bottom = rc.y1, .back = rc.z1, \ + }) + +void pl_d3d11_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + struct pl_tex_d3d11 *src_p = PL_PRIV(params->src); + DXGI_FORMAT src_fmt = fmt_to_dxgi(params->src->params.format); + struct pl_tex_d3d11 *dst_p = PL_PRIV(params->dst); + DXGI_FORMAT dst_fmt = fmt_to_dxgi(params->dst->params.format); + + // If the blit operation doesn't require flipping, scaling or format + // conversion, we can use CopySubresourceRegion + pl_rect3d src_rc 
= params->src_rc, dst_rc = params->dst_rc; + if (pl_rect3d_eq(src_rc, dst_rc) && src_fmt == dst_fmt) { + pl_rect3d rc = params->src_rc; + pl_rect3d_normalize(&rc); + + ID3D11DeviceContext_CopySubresourceRegion(p->imm, dst_p->res, + tex_subresource(params->dst), rc.x0, rc.y0, rc.z0, src_p->res, + tex_subresource(params->src), &pl_rect3d_to_box(rc)); + } else if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + if (!pl_tex_blit_compute(gpu, params)) + PL_ERR(gpu, "Failed compute shader fallback blit"); + } else { + pl_tex_blit_raster(gpu, params); + } + + pl_d3d11_flush_message_queue(ctx, "After texture blit"); +} + +bool pl_d3d11_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + struct pl_tex_transfer_params *slices = NULL; + bool ret = false; + + pl_d3d11_timer_start(gpu, params->timer); + + if (fmt->emulated) { + + int num_slices = pl_tex_transfer_slices(gpu, tex_p->texel_fmt, params, &slices); + for (int i = 0; i < num_slices; i++) { + // Copy the source data buffer into an intermediate buffer + pl_buf tbuf = pl_buf_create(gpu, pl_buf_params( + .memory_type = PL_BUF_MEM_DEVICE, + .format = tex_p->texel_fmt, + .size = pl_tex_transfer_size(&slices[i]), + .initial_data = slices[i].ptr, + .storable = true, + )); + + if (!tbuf) { + PL_ERR(gpu, "Failed creating buffer for tex upload fallback!"); + goto error; + } + + slices[i].ptr = NULL; + slices[i].buf = tbuf; + slices[i].buf_offset = 0; + bool ok = pl_tex_upload_texel(gpu, &slices[i]); + pl_buf_destroy(gpu, &tbuf); + if (!ok) + goto error; + } + + } else { + + ID3D11DeviceContext_UpdateSubresource(p->imm, tex_p->res, + tex_subresource(tex), &pl_rect3d_to_box(params->rc), params->ptr, + params->row_pitch, params->depth_pitch); + + } + + ret = true; + +error: + pl_d3d11_timer_end(gpu, params->timer); + pl_d3d11_flush_message_queue(ctx, "After texture upload"); + + pl_free(slices); + return ret; +} + +bool pl_d3d11_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + struct pl_gpu_d3d11 *p = PL_PRIV(gpu); + struct d3d11_ctx *ctx = p->ctx; + const struct pl_tex_t *tex = params->tex; + pl_fmt fmt = tex->params.format; + struct pl_tex_d3d11 *tex_p = PL_PRIV(tex); + struct pl_tex_transfer_params *slices = NULL; + bool ret = false; + + if (!tex_p->staging) + return false; + + pl_d3d11_timer_start(gpu, params->timer); + + if (fmt->emulated) { + + pl_buf tbuf = NULL; + int num_slices = pl_tex_transfer_slices(gpu, tex_p->texel_fmt, params, &slices); + for (int i = 0; i < num_slices; i++) { + const size_t slice_size = pl_tex_transfer_size(&slices[i]); + bool ok = pl_buf_recreate(gpu, &tbuf, pl_buf_params( + .storable = true, + .size = slice_size, + .memory_type = PL_BUF_MEM_DEVICE, + .format = tex_p->texel_fmt, + .host_readable = true, + )); + + if (!ok) { + PL_ERR(gpu, "Failed creating buffer for tex download fallback!"); + goto error; + } + + void *ptr = slices[i].ptr; + slices[i].ptr = NULL; + slices[i].buf = tbuf; + slices[i].buf_offset = 0; + + // Download into an intermediate buffer first + ok = pl_tex_download_texel(gpu, &slices[i]); + ok = ok && pl_buf_read(gpu, tbuf, 0, ptr, slice_size); + if (!ok) { + pl_buf_destroy(gpu, &tbuf); + goto error; + } + } + pl_buf_destroy(gpu, &tbuf); + + } else { + + ID3D11DeviceContext_CopySubresourceRegion(p->imm, + (ID3D11Resource *) tex_p->staging, 0, params->rc.x0, params->rc.y0, + params->rc.z0, 
tex_p->res, tex_subresource(tex), + &pl_rect3d_to_box(params->rc)); + + D3D11_MAPPED_SUBRESOURCE lock; + D3D(ID3D11DeviceContext_Map(p->imm, (ID3D11Resource *) tex_p->staging, 0, + D3D11_MAP_READ, 0, &lock)); + + char *cdst = params->ptr; + char *csrc = lock.pData; + size_t line_size = pl_rect_w(params->rc) * tex->params.format->texel_size; + for (int z = 0; z < pl_rect_d(params->rc); z++) { + for (int y = 0; y < pl_rect_h(params->rc); y++) { + memcpy(cdst + z * params->depth_pitch + y * params->row_pitch, + csrc + (params->rc.z0 + z) * lock.DepthPitch + + (params->rc.y0 + y) * lock.RowPitch + params->rc.x0, + line_size); + } + } + + ID3D11DeviceContext_Unmap(p->imm, (ID3D11Resource*)tex_p->staging, 0); + } + + ret = true; + +error: + pl_d3d11_timer_end(gpu, params->timer); + pl_d3d11_flush_message_queue(ctx, "After texture download"); + + pl_free(slices); + return ret; +} diff --git a/src/d3d11/meson.build b/src/d3d11/meson.build new file mode 100644 index 0000000..d4c4b44 --- /dev/null +++ b/src/d3d11/meson.build @@ -0,0 +1,41 @@ +d3d11 = get_option('d3d11') +d3d11_header = cc.check_header('d3d11.h', required: false) # needed publicly +d3d11_headers_extra = [ # needed internally + cc.check_header('d3d11_4.h', required: d3d11), + cc.check_header('dxgi1_6.h', required: d3d11), +] +d3d11_deps = [ + dependency('spirv-cross-c-shared', version: '>=0.29.0', required: d3d11), + cc.find_library('version', required: d3d11), +] + +d3d11 = d3d11.require(d3d11_header) +foreach h : d3d11_headers_extra + d3d11 = d3d11.require(h) +endforeach +foreach d : d3d11_deps + d3d11 = d3d11.require(d.found()) +endforeach + +components.set('d3d11', d3d11.allowed()) +if d3d11.allowed() + conf_internal.set('PL_HAVE_DXGI_DEBUG', + cc.has_header_symbol('dxgidebug.h', 'IID_IDXGIInfoQueue')) + conf_internal.set('PL_HAVE_DXGI_DEBUG_D3D11', + cc.has_header_symbol('d3d11sdklayers.h', 'DXGI_DEBUG_D3D11')) + add_project_arguments(['-DCOBJMACROS'], language: ['c', 'cpp']) + build_deps += declare_dependency(dependencies: d3d11_deps) + tests += 'd3d11.c' + sources += [ + 'd3d11/context.c', + 'd3d11/formats.c', + 'd3d11/gpu.c', + 'd3d11/gpu_buf.c', + 'd3d11/gpu_tex.c', + 'd3d11/gpu_pass.c', + 'd3d11/swapchain.c', + 'd3d11/utils.c', + ] +elif d3d11_header + sources += 'd3d11/stubs.c' +endif diff --git a/src/d3d11/stubs.c b/src/d3d11/stubs.c new file mode 100644 index 0000000..b3f259c --- /dev/null +++ b/src/d3d11/stubs.c @@ -0,0 +1,56 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "../common.h" +#include "log.h" + +#include <libplacebo/d3d11.h> + +const struct pl_d3d11_params pl_d3d11_default_params = { PL_D3D11_DEFAULTS }; + +pl_d3d11 pl_d3d11_create(pl_log log, const struct pl_d3d11_params *params) +{ + pl_fatal(log, "libplacebo compiled without D3D11 support!"); + return NULL; +} + +void pl_d3d11_destroy(pl_d3d11 *pd3d11) +{ + pl_d3d11 d3d11 = *pd3d11; + pl_assert(!d3d11); +} + +pl_d3d11 pl_d3d11_get(pl_gpu gpu) +{ + return NULL; +} + +pl_swapchain pl_d3d11_create_swapchain(pl_d3d11 d3d11, + const struct pl_d3d11_swapchain_params *params) +{ + pl_unreachable(); +} + +IDXGISwapChain *pl_d3d11_swapchain_unwrap(pl_swapchain sw) +{ + pl_unreachable(); +} + +pl_tex pl_d3d11_wrap(pl_gpu gpu, const struct pl_d3d11_wrap_params *params) +{ + pl_unreachable(); +} diff --git a/src/d3d11/swapchain.c b/src/d3d11/swapchain.c new file mode 100644 index 0000000..8a53632 --- /dev/null +++ b/src/d3d11/swapchain.c @@ -0,0 +1,667 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <windows.h> +#include <versionhelpers.h> +#include <math.h> + +#include "gpu.h" +#include "swapchain.h" +#include "utils.h" + +struct d3d11_csp_mapping { + DXGI_COLOR_SPACE_TYPE d3d11_csp; + DXGI_FORMAT d3d11_fmt; + struct pl_color_space out_csp; +}; + +static struct d3d11_csp_mapping map_pl_csp_to_d3d11(const struct pl_color_space *hint, + bool use_8bit_sdr) +{ + if (pl_color_space_is_hdr(hint) && + hint->transfer != PL_COLOR_TRC_LINEAR) + { + struct pl_color_space pl_csp = pl_color_space_hdr10; + pl_csp.hdr = (struct pl_hdr_metadata) { + // Whitelist only values that we support signalling metadata for + .prim = hint->hdr.prim, + .min_luma = hint->hdr.min_luma, + .max_luma = hint->hdr.max_luma, + .max_cll = hint->hdr.max_cll, + .max_fall = hint->hdr.max_fall, + }; + + return (struct d3d11_csp_mapping){ + .d3d11_csp = DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020, + .d3d11_fmt = DXGI_FORMAT_R10G10B10A2_UNORM, + .out_csp = pl_csp, + }; + } else if (pl_color_primaries_is_wide_gamut(hint->primaries) || + hint->transfer == PL_COLOR_TRC_LINEAR) + { + // scRGB a la VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT, + // so could be utilized for HDR/wide gamut content as well + // with content that goes beyond 0.0-1.0. + return (struct d3d11_csp_mapping){ + .d3d11_csp = DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709, + .d3d11_fmt = DXGI_FORMAT_R16G16B16A16_FLOAT, + .out_csp = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_LINEAR, + } + }; + } + + return (struct d3d11_csp_mapping){ + .d3d11_csp = DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709, + .d3d11_fmt = use_8bit_sdr ? 
DXGI_FORMAT_R8G8B8A8_UNORM : + DXGI_FORMAT_R10G10B10A2_UNORM, + .out_csp = pl_color_space_monitor, + }; +} + +struct priv { + struct pl_sw_fns impl; + + struct d3d11_ctx *ctx; + IDXGISwapChain *swapchain; + pl_tex backbuffer; + + // Currently requested or applied swap chain configuration. + // Affected by received colorspace hints. + struct d3d11_csp_mapping csp_map; + + // Whether a swapchain backbuffer format reconfiguration has been + // requested by means of an additional resize action. + bool update_swapchain_format; + + // Whether 10-bit backbuffer format is disabled for SDR content. + bool disable_10bit_sdr; + + // Fallback to 8-bit RGB was triggered due to lack of compatiblity + bool fallback_8bit_rgb; +}; + +static void d3d11_sw_destroy(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + + pl_tex_destroy(sw->gpu, &p->backbuffer); + SAFE_RELEASE(p->swapchain); + pl_free((void *) sw); +} + +static int d3d11_sw_latency(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + + UINT max_latency; + IDXGIDevice1_GetMaximumFrameLatency(ctx->dxgi_dev, &max_latency); + return max_latency; +} + +static pl_tex get_backbuffer(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + ID3D11Texture2D *backbuffer = NULL; + pl_tex tex = NULL; + + D3D(IDXGISwapChain_GetBuffer(p->swapchain, 0, &IID_ID3D11Texture2D, + (void **) &backbuffer)); + + tex = pl_d3d11_wrap(sw->gpu, pl_d3d11_wrap_params( + .tex = (ID3D11Resource *) backbuffer, + )); + +error: + SAFE_RELEASE(backbuffer); + return tex; +} + +static bool d3d11_sw_resize(pl_swapchain sw, int *width, int *height) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + + DXGI_SWAP_CHAIN_DESC desc = {0}; + IDXGISwapChain_GetDesc(p->swapchain, &desc); + int w = PL_DEF(*width, desc.BufferDesc.Width); + int h = PL_DEF(*height, desc.BufferDesc.Height); + bool format_changed = p->csp_map.d3d11_fmt != desc.BufferDesc.Format; + if (format_changed) { + PL_INFO(ctx, "Attempting to reconfigure swap chain format: %s -> %s", + pl_get_dxgi_format_name(desc.BufferDesc.Format), + pl_get_dxgi_format_name(p->csp_map.d3d11_fmt)); + } + + if (w != desc.BufferDesc.Width || h != desc.BufferDesc.Height || + format_changed) + { + if (p->backbuffer) { + PL_ERR(sw, "Tried resizing the swapchain while a frame was in " + "progress! 
Please submit the current frame first."); + return false; + } + + HRESULT hr = IDXGISwapChain_ResizeBuffers(p->swapchain, 0, w, h, + p->csp_map.d3d11_fmt, desc.Flags); + + if (hr == E_INVALIDARG && p->csp_map.d3d11_fmt != DXGI_FORMAT_R8G8B8A8_UNORM) + { + PL_WARN(sw, "Reconfiguring the swapchain failed, re-trying with R8G8B8A8_UNORM fallback."); + D3D(IDXGISwapChain_ResizeBuffers(p->swapchain, 0, w, h, + DXGI_FORMAT_R8G8B8A8_UNORM, desc.Flags)); + + // re-configure the colorspace to 8-bit RGB SDR fallback + p->csp_map = map_pl_csp_to_d3d11(&pl_color_space_unknown, true); + p->fallback_8bit_rgb = true; + } + else if (FAILED(hr)) + { + PL_ERR(sw, "Reconfiguring the swapchain failed with error: %s", pl_hresult_to_str(hr)); + return false; + } + } + + *width = w; + *height = h; + p->update_swapchain_format = false; + return true; + +error: + return false; +} + +static bool d3d11_sw_start_frame(pl_swapchain sw, + struct pl_swapchain_frame *out_frame) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + + if (ctx->is_failed) + return false; + if (p->backbuffer) { + PL_ERR(sw, "Attempted calling `pl_swapchain_start_frame` while a frame " + "was already in progress! Call `pl_swapchain_submit_frame` first."); + return false; + } + + if (p->update_swapchain_format) { + int w = 0, h = 0; + if (!d3d11_sw_resize(sw, &w, &h)) + return false; + } + + p->backbuffer = get_backbuffer(sw); + if (!p->backbuffer) + return false; + + int bits = 0; + pl_fmt fmt = p->backbuffer->params.format; + for (int i = 0; i < fmt->num_components; i++) + bits = PL_MAX(bits, fmt->component_depth[i]); + + *out_frame = (struct pl_swapchain_frame) { + .fbo = p->backbuffer, + .flipped = false, + .color_repr = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, + .alpha = PL_ALPHA_UNKNOWN, + .bits = { + .sample_depth = bits, + .color_depth = bits, + }, + }, + .color_space = p->csp_map.out_csp, + }; + + return true; +} + +static bool d3d11_sw_submit_frame(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + + // Release the backbuffer. We shouldn't hold onto it unnecessarily, because + // it prevents external code from resizing the swapchain, which we'd + // otherwise support just fine. 
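+    // (Annotation: the expected caller loop per frame is roughly
+    //     pl_swapchain_start_frame(sw, &frame);  // wraps the backbuffer
+    //     ... render into frame.fbo ...
+    //     pl_swapchain_submit_frame(sw);         // drops the wrapper again
+    //     pl_swapchain_swap_buffers(sw);         // IDXGISwapChain::Present
+    //  so the wrapper texture only lives for the duration of one frame and is
+    //  simply re-created from GetBuffer on the next pl_swapchain_start_frame.)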
+ pl_tex_destroy(sw->gpu, &p->backbuffer); + + return !ctx->is_failed; +} + +static void d3d11_sw_swap_buffers(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + + // Present can fail with a device removed error + D3D(IDXGISwapChain_Present(p->swapchain, 1, 0)); + +error: + return; +} + +static DXGI_HDR_METADATA_HDR10 set_hdr10_metadata(const struct pl_hdr_metadata *hdr) +{ + return (DXGI_HDR_METADATA_HDR10) { + .RedPrimary = { roundf(hdr->prim.red.x * 50000), + roundf(hdr->prim.red.y * 50000) }, + .GreenPrimary = { roundf(hdr->prim.green.x * 50000), + roundf(hdr->prim.green.y * 50000) }, + .BluePrimary = { roundf(hdr->prim.blue.x * 50000), + roundf(hdr->prim.blue.y * 50000) }, + .WhitePoint = { roundf(hdr->prim.white.x * 50000), + roundf(hdr->prim.white.y * 50000) }, + .MaxMasteringLuminance = roundf(hdr->max_luma), + .MinMasteringLuminance = roundf(hdr->min_luma * 10000), + .MaxContentLightLevel = roundf(hdr->max_cll), + .MaxFrameAverageLightLevel = roundf(hdr->max_fall), + }; +} + +static bool set_swapchain_metadata(struct d3d11_ctx *ctx, + IDXGISwapChain3 *swapchain3, + struct d3d11_csp_mapping *csp_map) +{ + IDXGISwapChain4 *swapchain4 = NULL; + bool ret = false; + bool is_hdr = pl_color_space_is_hdr(&csp_map->out_csp); + DXGI_HDR_METADATA_HDR10 hdr10 = is_hdr ? + set_hdr10_metadata(&csp_map->out_csp.hdr) : (DXGI_HDR_METADATA_HDR10){ 0 }; + + D3D(IDXGISwapChain3_SetColorSpace1(swapchain3, csp_map->d3d11_csp)); + + // if we succeeded to set the color space, it's good enough, + // since older versions of Windows 10 will not have swapchain v4 available. + ret = true; + + if (FAILED(IDXGISwapChain3_QueryInterface(swapchain3, &IID_IDXGISwapChain4, + (void **)&swapchain4))) + { + PL_TRACE(ctx, "v4 swap chain interface is not available, skipping HDR10 " + "metadata configuration."); + goto error; + } + + D3D(IDXGISwapChain4_SetHDRMetaData(swapchain4, + is_hdr ? + DXGI_HDR_METADATA_TYPE_HDR10 : + DXGI_HDR_METADATA_TYPE_NONE, + is_hdr ? sizeof(hdr10) : 0, + is_hdr ? 
&hdr10 : NULL)); + + goto success; + +error: + csp_map->out_csp.hdr = (struct pl_hdr_metadata) { 0 }; +success: + SAFE_RELEASE(swapchain4); + return ret; +} + +static bool d3d11_format_supported(struct d3d11_ctx *ctx, DXGI_FORMAT fmt) +{ + UINT sup = 0; + UINT wanted_sup = + D3D11_FORMAT_SUPPORT_TEXTURE2D | D3D11_FORMAT_SUPPORT_DISPLAY | + D3D11_FORMAT_SUPPORT_SHADER_SAMPLE | D3D11_FORMAT_SUPPORT_RENDER_TARGET | + D3D11_FORMAT_SUPPORT_BLENDABLE; + + D3D(ID3D11Device_CheckFormatSupport(ctx->dev, fmt, &sup)); + + return (sup & wanted_sup) == wanted_sup; + +error: + return false; +} + +static bool d3d11_csp_supported(struct d3d11_ctx *ctx, + IDXGISwapChain3 *swapchain3, + DXGI_COLOR_SPACE_TYPE color_space) +{ + UINT csp_support_flags = 0; + + D3D(IDXGISwapChain3_CheckColorSpaceSupport(swapchain3, + color_space, + &csp_support_flags)); + + return (csp_support_flags & DXGI_SWAP_CHAIN_COLOR_SPACE_SUPPORT_FLAG_PRESENT); + +error: + return false; +} + +static void update_swapchain_color_config(pl_swapchain sw, + const struct pl_color_space *csp, + bool is_internal) +{ + struct priv *p = PL_PRIV(sw); + struct d3d11_ctx *ctx = p->ctx; + IDXGISwapChain3 *swapchain3 = NULL; + struct d3d11_csp_mapping old_map = p->csp_map; + + // ignore config changes in fallback mode + if (p->fallback_8bit_rgb) + goto cleanup; + + HRESULT hr = IDXGISwapChain_QueryInterface(p->swapchain, &IID_IDXGISwapChain3, + (void **)&swapchain3); + if (FAILED(hr)) { + PL_TRACE(ctx, "v3 swap chain interface is not available, skipping " + "color space configuration."); + swapchain3 = NULL; + } + + // Lack of swap chain v3 means we cannot control swap chain color space; + // Only effective formats are the 8 and 10 bit RGB ones. + struct d3d11_csp_mapping csp_map = + map_pl_csp_to_d3d11(swapchain3 ? csp : &pl_color_space_unknown, + p->disable_10bit_sdr); + + if (p->csp_map.d3d11_fmt == csp_map.d3d11_fmt && + p->csp_map.d3d11_csp == csp_map.d3d11_csp && + pl_color_space_equal(&p->csp_map.out_csp, &csp_map.out_csp)) + goto cleanup; + + PL_INFO(ctx, "%s swap chain configuration%s: format: %s, color space: %s.", + is_internal ? "Initial" : "New", + is_internal ? "" : " received from hint", + pl_get_dxgi_format_name(csp_map.d3d11_fmt), + pl_get_dxgi_csp_name(csp_map.d3d11_csp)); + + bool fmt_supported = d3d11_format_supported(ctx, csp_map.d3d11_fmt); + bool csp_supported = swapchain3 ? + d3d11_csp_supported(ctx, swapchain3, csp_map.d3d11_csp) : true; + if (!fmt_supported || !csp_supported) { + PL_ERR(ctx, "New swap chain configuration was deemed not supported: " + "format: %s, color space: %s. Failling back to 8bit RGB.", + fmt_supported ? "supported" : "unsupported", + csp_supported ? 
"supported" : "unsupported"); + // fall back to 8bit sRGB if requested configuration is not supported + csp_map = map_pl_csp_to_d3d11(&pl_color_space_unknown, true); + } + + p->csp_map = csp_map; + p->update_swapchain_format = true; + + if (!swapchain3) + goto cleanup; + + if (!set_swapchain_metadata(ctx, swapchain3, &p->csp_map)) { + // format succeeded, but color space configuration failed + p->csp_map = old_map; + p->csp_map.d3d11_fmt = csp_map.d3d11_fmt; + } + + pl_d3d11_flush_message_queue(ctx, "After colorspace hint"); + +cleanup: + SAFE_RELEASE(swapchain3); +} + +static void d3d11_sw_colorspace_hint(pl_swapchain sw, + const struct pl_color_space *csp) +{ + update_swapchain_color_config(sw, csp, false); +} + +IDXGISwapChain *pl_d3d11_swapchain_unwrap(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + IDXGISwapChain_AddRef(p->swapchain); + return p->swapchain; +} + +static const struct pl_sw_fns d3d11_swapchain = { + .destroy = d3d11_sw_destroy, + .latency = d3d11_sw_latency, + .resize = d3d11_sw_resize, + .colorspace_hint = d3d11_sw_colorspace_hint, + .start_frame = d3d11_sw_start_frame, + .submit_frame = d3d11_sw_submit_frame, + .swap_buffers = d3d11_sw_swap_buffers, +}; + +static HRESULT create_swapchain_1_2(struct d3d11_ctx *ctx, + IDXGIFactory2 *factory, const struct pl_d3d11_swapchain_params *params, + bool flip, UINT width, UINT height, DXGI_FORMAT format, + IDXGISwapChain **swapchain_out) +{ + IDXGISwapChain *swapchain = NULL; + IDXGISwapChain1 *swapchain1 = NULL; + HRESULT hr; + + DXGI_SWAP_CHAIN_DESC1 desc = { + .Width = width, + .Height = height, + .Format = format, + .SampleDesc.Count = 1, + .BufferUsage = DXGI_USAGE_SHADER_INPUT | DXGI_USAGE_RENDER_TARGET_OUTPUT, + .Flags = params->flags, + }; + + if (ID3D11Device_GetFeatureLevel(ctx->dev) >= D3D_FEATURE_LEVEL_11_0) + desc.BufferUsage |= DXGI_USAGE_UNORDERED_ACCESS; + + if (flip) { + UINT max_latency; + IDXGIDevice1_GetMaximumFrameLatency(ctx->dxgi_dev, &max_latency); + + // Make sure we have at least enough buffers to allow `max_latency` + // frames in-flight at once, plus one frame for the frontbuffer + desc.BufferCount = max_latency + 1; + + if (IsWindows10OrGreater()) { + desc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD; + } else { + desc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL; + } + + desc.BufferCount = PL_MIN(desc.BufferCount, DXGI_MAX_SWAP_CHAIN_BUFFERS); + } else { + desc.SwapEffect = DXGI_SWAP_EFFECT_DISCARD; + desc.BufferCount = 1; + } + + if (params->window) { + hr = IDXGIFactory2_CreateSwapChainForHwnd(factory, (IUnknown *) ctx->dev, + params->window, &desc, NULL, NULL, &swapchain1); + } else if (params->core_window) { + hr = IDXGIFactory2_CreateSwapChainForCoreWindow(factory, + (IUnknown *) ctx->dev, params->core_window, &desc, NULL, &swapchain1); + } else { + hr = IDXGIFactory2_CreateSwapChainForComposition(factory, + (IUnknown *) ctx->dev, &desc, NULL, &swapchain1); + } + if (FAILED(hr)) + goto done; + hr = IDXGISwapChain1_QueryInterface(swapchain1, &IID_IDXGISwapChain, + (void **) &swapchain); + if (FAILED(hr)) + goto done; + + *swapchain_out = swapchain; + swapchain = NULL; + +done: + SAFE_RELEASE(swapchain1); + SAFE_RELEASE(swapchain); + return hr; +} + +static HRESULT create_swapchain_1_1(struct d3d11_ctx *ctx, + IDXGIFactory1 *factory, const struct pl_d3d11_swapchain_params *params, + UINT width, UINT height, DXGI_FORMAT format, IDXGISwapChain **swapchain_out) +{ + DXGI_SWAP_CHAIN_DESC desc = { + .BufferDesc = { + .Width = width, + .Height = height, + .Format = format, + }, + .SampleDesc.Count 
= 1, + .BufferUsage = DXGI_USAGE_SHADER_INPUT | DXGI_USAGE_RENDER_TARGET_OUTPUT, + .BufferCount = 1, + .OutputWindow = params->window, + .Windowed = TRUE, + .SwapEffect = DXGI_SWAP_EFFECT_DISCARD, + .Flags = params->flags, + }; + + return IDXGIFactory1_CreateSwapChain(factory, (IUnknown *) ctx->dev, &desc, + swapchain_out); +} + +static IDXGISwapChain *create_swapchain(struct d3d11_ctx *ctx, + const struct pl_d3d11_swapchain_params *params, DXGI_FORMAT format) +{ + IDXGIDevice1 *dxgi_dev = NULL; + IDXGIAdapter1 *adapter = NULL; + IDXGIFactory1 *factory = NULL; + IDXGIFactory2 *factory2 = NULL; + IDXGISwapChain *swapchain = NULL; + bool success = false; + HRESULT hr; + + D3D(ID3D11Device_QueryInterface(ctx->dev, &IID_IDXGIDevice1, + (void **) &dxgi_dev)); + D3D(IDXGIDevice1_GetParent(dxgi_dev, &IID_IDXGIAdapter1, (void **) &adapter)); + D3D(IDXGIAdapter1_GetParent(adapter, &IID_IDXGIFactory1, (void **) &factory)); + + hr = IDXGIFactory1_QueryInterface(factory, &IID_IDXGIFactory2, + (void **) &factory2); + if (FAILED(hr)) + factory2 = NULL; + + bool flip = factory2 && !params->blit; + UINT width = PL_DEF(params->width, 1); + UINT height = PL_DEF(params->height, 1); + + // If both width and height are unset, the default size is the window size + if (params->window && params->width == 0 && params->height == 0) { + RECT rc; + if (GetClientRect(params->window, &rc)) { + width = PL_DEF(rc.right - rc.left, 1); + height = PL_DEF(rc.bottom - rc.top, 1); + } + } + + // Return here to retry creating the swapchain + do { + if (factory2) { + // Create a DXGI 1.2+ (Windows 8+) swap chain if possible + hr = create_swapchain_1_2(ctx, factory2, params, flip, width, + height, format, &swapchain); + } else { + // Fall back to DXGI 1.1 (Windows 7) + hr = create_swapchain_1_1(ctx, factory, params, width, height, + format, &swapchain); + } + if (SUCCEEDED(hr)) + break; + + pl_d3d11_after_error(ctx, hr); + if (flip) { + PL_DEBUG(ctx, "Failed to create flip-model swapchain, trying bitblt"); + flip = false; + continue; + } + + PL_FATAL(ctx, "Failed to create swapchain: %s", pl_hresult_to_str(hr)); + goto error; + } while (true); + + // Prevent DXGI from making changes to the window, otherwise it will hook + // the Alt+Enter keystroke and make it trigger an ugly transition to + // legacy exclusive fullscreen mode. + IDXGIFactory_MakeWindowAssociation(factory, params->window, + DXGI_MWA_NO_WINDOW_CHANGES | DXGI_MWA_NO_ALT_ENTER | + DXGI_MWA_NO_PRINT_SCREEN); + + success = true; +error: + if (!success) + SAFE_RELEASE(swapchain); + SAFE_RELEASE(factory2); + SAFE_RELEASE(factory); + SAFE_RELEASE(adapter); + SAFE_RELEASE(dxgi_dev); + return swapchain; +} + +pl_swapchain pl_d3d11_create_swapchain(pl_d3d11 d3d11, + const struct pl_d3d11_swapchain_params *params) +{ + struct d3d11_ctx *ctx = PL_PRIV(d3d11); + pl_gpu gpu = d3d11->gpu; + bool success = false; + + struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv); + struct priv *p = PL_PRIV(sw); + *sw = (struct pl_swapchain_t) { + .log = gpu->log, + .gpu = gpu, + }; + *p = (struct priv) { + .impl = d3d11_swapchain, + .ctx = ctx, + // default to standard 8 or 10 bit RGB, unset pl_color_space + .csp_map = { + .d3d11_fmt = params->disable_10bit_sdr ? + DXGI_FORMAT_R8G8B8A8_UNORM : + (d3d11_format_supported(ctx, DXGI_FORMAT_R10G10B10A2_UNORM) ? 
+ DXGI_FORMAT_R10G10B10A2_UNORM : DXGI_FORMAT_R8G8B8A8_UNORM), + }, + .disable_10bit_sdr = params->disable_10bit_sdr, + }; + + if (params->swapchain) { + p->swapchain = params->swapchain; + IDXGISwapChain_AddRef(params->swapchain); + } else { + p->swapchain = create_swapchain(ctx, params, p->csp_map.d3d11_fmt); + if (!p->swapchain) + goto error; + } + + DXGI_SWAP_CHAIN_DESC scd = {0}; + IDXGISwapChain_GetDesc(p->swapchain, &scd); + if (scd.SwapEffect == DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL || + scd.SwapEffect == DXGI_SWAP_EFFECT_FLIP_DISCARD) { + PL_INFO(gpu, "Using flip-model presentation"); + } else { + PL_INFO(gpu, "Using bitblt-model presentation"); + } + + p->csp_map.d3d11_fmt = scd.BufferDesc.Format; + + update_swapchain_color_config(sw, &pl_color_space_unknown, true); + + success = true; +error: + if (!success) { + PL_FATAL(gpu, "Failed to create Direct3D 11 swapchain"); + d3d11_sw_destroy(sw); + sw = NULL; + } + return sw; +} diff --git a/src/d3d11/utils.c b/src/d3d11/utils.c new file mode 100644 index 0000000..47154b5 --- /dev/null +++ b/src/d3d11/utils.c @@ -0,0 +1,500 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <string.h> + +#include "utils.h" + +// D3D11.3 message IDs, not present in mingw-w64 v9 +#define D3D11_MESSAGE_ID_CREATE_FENCE (0x30020c) +#define D3D11_MESSAGE_ID_DESTROY_FENCE (0x30020a) + +#ifdef PL_HAVE_DXGI_DEBUG +static enum pl_log_level log_level_override(unsigned int id) +{ + switch (id) { + // These warnings can happen when a pl_timer is used too often before a + // blocking pl_swapchain_swap_buffers() or pl_gpu_finish(), overflowing + // its internal ring buffer and causing older query objects to be reused + // before their results are read. This is expected behavior, so reduce + // the log level to PL_LOG_TRACE to prevent log spam. + case D3D11_MESSAGE_ID_QUERY_BEGIN_ABANDONING_PREVIOUS_RESULTS: + case D3D11_MESSAGE_ID_QUERY_END_ABANDONING_PREVIOUS_RESULTS: + return PL_LOG_TRACE; + + // D3D11 writes log messages every time an object is created or + // destroyed. That results in a lot of log spam, so force PL_LOG_TRACE. 
+#define OBJ_LIFETIME_MESSAGES(obj) \
+    case D3D11_MESSAGE_ID_CREATE_ ## obj: \
+    case D3D11_MESSAGE_ID_DESTROY_ ## obj
+
+    OBJ_LIFETIME_MESSAGES(CONTEXT):
+    OBJ_LIFETIME_MESSAGES(BUFFER):
+    OBJ_LIFETIME_MESSAGES(TEXTURE1D):
+    OBJ_LIFETIME_MESSAGES(TEXTURE2D):
+    OBJ_LIFETIME_MESSAGES(TEXTURE3D):
+    OBJ_LIFETIME_MESSAGES(SHADERRESOURCEVIEW):
+    OBJ_LIFETIME_MESSAGES(RENDERTARGETVIEW):
+    OBJ_LIFETIME_MESSAGES(DEPTHSTENCILVIEW):
+    OBJ_LIFETIME_MESSAGES(VERTEXSHADER):
+    OBJ_LIFETIME_MESSAGES(HULLSHADER):
+    OBJ_LIFETIME_MESSAGES(DOMAINSHADER):
+    OBJ_LIFETIME_MESSAGES(GEOMETRYSHADER):
+    OBJ_LIFETIME_MESSAGES(PIXELSHADER):
+    OBJ_LIFETIME_MESSAGES(INPUTLAYOUT):
+    OBJ_LIFETIME_MESSAGES(SAMPLER):
+    OBJ_LIFETIME_MESSAGES(BLENDSTATE):
+    OBJ_LIFETIME_MESSAGES(DEPTHSTENCILSTATE):
+    OBJ_LIFETIME_MESSAGES(RASTERIZERSTATE):
+    OBJ_LIFETIME_MESSAGES(QUERY):
+    OBJ_LIFETIME_MESSAGES(PREDICATE):
+    OBJ_LIFETIME_MESSAGES(COUNTER):
+    OBJ_LIFETIME_MESSAGES(COMMANDLIST):
+    OBJ_LIFETIME_MESSAGES(CLASSINSTANCE):
+    OBJ_LIFETIME_MESSAGES(CLASSLINKAGE):
+    OBJ_LIFETIME_MESSAGES(COMPUTESHADER):
+    OBJ_LIFETIME_MESSAGES(UNORDEREDACCESSVIEW):
+    OBJ_LIFETIME_MESSAGES(VIDEODECODER):
+    OBJ_LIFETIME_MESSAGES(VIDEOPROCESSORENUM):
+    OBJ_LIFETIME_MESSAGES(VIDEOPROCESSOR):
+    OBJ_LIFETIME_MESSAGES(DECODEROUTPUTVIEW):
+    OBJ_LIFETIME_MESSAGES(PROCESSORINPUTVIEW):
+    OBJ_LIFETIME_MESSAGES(PROCESSOROUTPUTVIEW):
+    OBJ_LIFETIME_MESSAGES(DEVICECONTEXTSTATE):
+    OBJ_LIFETIME_MESSAGES(FENCE):
+        return PL_LOG_TRACE;
+
+#undef OBJ_LIFETIME_MESSAGES
+
+    // Don't force the log level of any other messages. It will be mapped
+    // from the D3D severity code instead.
+    default:
+        return PL_LOG_NONE;
+    }
+}
+#endif
+
+void pl_d3d11_flush_message_queue(struct d3d11_ctx *ctx, const char *header)
+{
+#ifdef PL_HAVE_DXGI_DEBUG
+    if (!ctx->iqueue)
+        return;
+
+    static const enum pl_log_level severity_map[] = {
+        [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_CORRUPTION] = PL_LOG_FATAL,
+        [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR] = PL_LOG_ERR,
+        [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_WARNING] = PL_LOG_WARN,
+        [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_INFO] = PL_LOG_DEBUG,
+        [DXGI_INFO_QUEUE_MESSAGE_SEVERITY_MESSAGE] = PL_LOG_DEBUG,
+    };
+
+    enum pl_log_level header_printed = PL_LOG_NONE;
+
+    // After the storage limit is reached and ID3D11InfoQueue::ClearStoredMessages
+    // is called, the message counter seems to be initialized to -1, which is a
+    // very large number if read as uint64_t. Any subsequent call to
+    // ID3D11InfoQueue::GetNumStoredMessages will then be off by one.
+    // Use ID3D11InfoQueue_GetNumStoredMessagesAllowedByRetrievalFilter without
+    // any filter set, which seems to be unaffected by this bug and returns the
+    // correct number of messages.
+    // IDXGIInfoQueue seems to be unaffected, but keep the same method of retrieval.
+    uint64_t messages = IDXGIInfoQueue_GetNumStoredMessagesAllowedByRetrievalFilters(ctx->iqueue, DXGI_DEBUG_ALL);
+
+    // Just to be on the safe side, also check for the mentioned -1 value...
+ if (!messages || messages == UINT64_C(-1)) + return; + + uint64_t discarded = + IDXGIInfoQueue_GetNumMessagesDiscardedByMessageCountLimit(ctx->iqueue, DXGI_DEBUG_ALL); + if (discarded > ctx->last_discarded) { + PL_WARN(ctx, "%s:", header); + header_printed = PL_LOG_WARN; + + // Notify number of messages skipped due to the message count limit + PL_WARN(ctx, " (skipped %"PRIu64" debug layer messages)", + discarded - ctx->last_discarded); + ctx->last_discarded = discarded; + } + + // Copy debug layer messages to libplacebo's log output + for (uint64_t i = 0; i < messages; i++) { + SIZE_T len; + if (FAILED(IDXGIInfoQueue_GetMessage(ctx->iqueue, DXGI_DEBUG_ALL, i, NULL, &len))) + goto error; + + pl_grow((void *) ctx->d3d11, &ctx->dxgi_msg, len); + DXGI_INFO_QUEUE_MESSAGE *dxgi_msg = ctx->dxgi_msg; + + if (FAILED(IDXGIInfoQueue_GetMessage(ctx->iqueue, DXGI_DEBUG_ALL, i, dxgi_msg, &len))) + goto error; + + enum pl_log_level level = PL_LOG_NONE; + if (IsEqualGUID(&dxgi_msg->Producer, &DXGI_DEBUG_D3D11)) + level = log_level_override(dxgi_msg->ID); + if (level == PL_LOG_NONE) + level = severity_map[dxgi_msg->Severity]; + + if (pl_msg_test(ctx->log, level)) { + // If the header hasn't been printed, or it was printed for a lower + // log level than the current message, print it (again) + if (header_printed == PL_LOG_NONE || header_printed > level) { + PL_MSG(ctx, level, "%s:", header); + pl_log_stack_trace(ctx->log, level); + header_printed = level; + } + + PL_MSG(ctx, level, " %d: %.*s", (int) dxgi_msg->ID, + (int) dxgi_msg->DescriptionByteLength, dxgi_msg->pDescription); + } + + if (dxgi_msg->Severity <= DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR) + pl_debug_abort(); + } + +error: + IDXGIInfoQueue_ClearStoredMessages(ctx->iqueue, DXGI_DEBUG_ALL); +#endif +} + +HRESULT pl_d3d11_check_device_removed(struct d3d11_ctx *ctx, HRESULT hr) +{ + // This can be called before we have a device + if (!ctx->dev) + return hr; + + switch (hr) { + case DXGI_ERROR_DEVICE_HUNG: + case DXGI_ERROR_DEVICE_RESET: + case DXGI_ERROR_DRIVER_INTERNAL_ERROR: + ctx->is_failed = true; + break; + case D3DDDIERR_DEVICEREMOVED: + case DXGI_ERROR_DEVICE_REMOVED: + hr = ID3D11Device_GetDeviceRemovedReason(ctx->dev); + ctx->is_failed = true; + break; + } + if (ctx->is_failed) + PL_ERR(ctx, "Device lost!"); + return hr; +} + +HRESULT pl_d3d11_after_error(struct d3d11_ctx *ctx, HRESULT hr) +{ + hr = pl_d3d11_check_device_removed(ctx, hr); + pl_d3d11_flush_message_queue(ctx, "After error"); + return hr; +} + +struct dll_version pl_get_dll_version(const wchar_t *name) +{ + void *data = NULL; + struct dll_version ret = {0}; + + DWORD size = GetFileVersionInfoSizeW(name, &(DWORD) {0}); + if (!size) + goto error; + data = pl_alloc(NULL, size); + + if (!GetFileVersionInfoW(name, 0, size, data)) + goto error; + + VS_FIXEDFILEINFO *ffi; + UINT ffi_len; + if (!VerQueryValueW(data, L"\\", (void**)&ffi, &ffi_len)) + goto error; + if (ffi_len < sizeof(*ffi)) + goto error; + + ret = (struct dll_version) { + .major = HIWORD(ffi->dwFileVersionMS), + .minor = LOWORD(ffi->dwFileVersionMS), + .build = HIWORD(ffi->dwFileVersionLS), + .revision = LOWORD(ffi->dwFileVersionLS), + }; + +error: + pl_free(data); + return ret; +} + +wchar_t *pl_from_utf8(void *ctx, const char *str) +{ + int count = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0); + pl_assert(count > 0); + wchar_t *ret = pl_calloc_ptr(ctx, count, ret); + MultiByteToWideChar(CP_UTF8, 0, str, -1, ret, count); + return ret; +} + +char *pl_to_utf8(void *ctx, const wchar_t *str) +{ + int count = 
WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL); + pl_assert(count > 0); + char *ret = pl_calloc_ptr(ctx, count, ret); + WideCharToMultiByte(CP_UTF8, 0, str, -1, ret, count, NULL, NULL); + return ret; +} + +static const char *hresult_str(HRESULT hr) +{ + switch (hr) { +#define CASE(name) case name: return #name + CASE(S_OK); + CASE(S_FALSE); + CASE(E_ABORT); + CASE(E_ACCESSDENIED); + CASE(E_FAIL); + CASE(E_HANDLE); + CASE(E_INVALIDARG); + CASE(E_NOINTERFACE); + CASE(E_NOTIMPL); + CASE(E_OUTOFMEMORY); + CASE(E_POINTER); + CASE(E_UNEXPECTED); + + CASE(DXGI_ERROR_ACCESS_DENIED); + CASE(DXGI_ERROR_ACCESS_LOST); + CASE(DXGI_ERROR_CANNOT_PROTECT_CONTENT); + CASE(DXGI_ERROR_DEVICE_HUNG); + CASE(DXGI_ERROR_DEVICE_REMOVED); + CASE(DXGI_ERROR_DEVICE_RESET); + CASE(DXGI_ERROR_DRIVER_INTERNAL_ERROR); + CASE(DXGI_ERROR_FRAME_STATISTICS_DISJOINT); + CASE(DXGI_ERROR_GRAPHICS_VIDPN_SOURCE_IN_USE); + CASE(DXGI_ERROR_INVALID_CALL); + CASE(DXGI_ERROR_MORE_DATA); + CASE(DXGI_ERROR_NAME_ALREADY_EXISTS); + CASE(DXGI_ERROR_NONEXCLUSIVE); + CASE(DXGI_ERROR_NOT_CURRENTLY_AVAILABLE); + CASE(DXGI_ERROR_NOT_FOUND); + CASE(DXGI_ERROR_REMOTE_CLIENT_DISCONNECTED); + CASE(DXGI_ERROR_REMOTE_OUTOFMEMORY); + CASE(DXGI_ERROR_RESTRICT_TO_OUTPUT_STALE); + CASE(DXGI_ERROR_SDK_COMPONENT_MISSING); + CASE(DXGI_ERROR_SESSION_DISCONNECTED); + CASE(DXGI_ERROR_UNSUPPORTED); + CASE(DXGI_ERROR_WAIT_TIMEOUT); + CASE(DXGI_ERROR_WAS_STILL_DRAWING); +#undef CASE + + default: + return "Unknown error"; + } +} + +static char *format_error(void *ctx, DWORD error) +{ + wchar_t *wstr; + if (!FormatMessageW(FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, NULL, error, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPWSTR)&wstr, 0, NULL)) + { + return NULL; + } + + // Trim any trailing newline from the message + for (int i = wcslen(wstr) - 1; i >= 0; i--) { + if (wstr[i] != '\r' && wstr[i] != '\n') { + wstr[i + 1] = '\0'; + break; + } + } + + char *str = pl_to_utf8(ctx, wstr); + LocalFree(wstr); + return str; +} + +char *pl_hresult_to_str_buf(char *buf, size_t buf_size, HRESULT hr) +{ + char *fmsg = format_error(NULL, hr); + const char *code = hresult_str(hr); + if (fmsg) { + snprintf(buf, buf_size, "%s (%s, 0x%08lx)", fmsg, code, hr); + } else { + snprintf(buf, buf_size, "%s, 0x%08lx", code, hr); + } + pl_free(fmsg); + return buf; +} + +#define D3D11_DXGI_ENUM(prefix, define) { case prefix ## define: return #define; } + +const char *pl_get_dxgi_format_name(DXGI_FORMAT fmt) +{ + switch (fmt) { + D3D11_DXGI_ENUM(DXGI_FORMAT_, UNKNOWN); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32A32_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32B32_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16B16A16_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32G32_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, 
R32G8X24_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, D32_FLOAT_S8X24_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_FLOAT_X8X24_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, X32_TYPELESS_G8X24_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10A2_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10A2_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10A2_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R11G11B10_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8B8A8_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16G16_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, D32_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R32_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R24G8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, D24_UNORM_S8_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R24_UNORM_X8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, X24_TYPELESS_G8_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_FLOAT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, D16_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R16_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_UINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8_SINT); + D3D11_DXGI_ENUM(DXGI_FORMAT_, A8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R1_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R9G9B9E5_SHAREDEXP); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R8G8_B8G8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, G8R8_G8B8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC1_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC1_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC1_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC2_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC2_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC2_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC3_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC3_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC3_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC4_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC4_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC4_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC5_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC5_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC5_SNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B5G6R5_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B5G5R5A1_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8A8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8X8_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, R10G10B10_XR_BIAS_A2_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8A8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8A8_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B8G8R8X8_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, 
B8G8R8X8_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC6H_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC6H_UF16); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC6H_SF16); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC7_TYPELESS); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC7_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, BC7_UNORM_SRGB); + D3D11_DXGI_ENUM(DXGI_FORMAT_, AYUV); + D3D11_DXGI_ENUM(DXGI_FORMAT_, Y410); + D3D11_DXGI_ENUM(DXGI_FORMAT_, Y416); + D3D11_DXGI_ENUM(DXGI_FORMAT_, NV12); + D3D11_DXGI_ENUM(DXGI_FORMAT_, P010); + D3D11_DXGI_ENUM(DXGI_FORMAT_, P016); + D3D11_DXGI_ENUM(DXGI_FORMAT_, 420_OPAQUE); + D3D11_DXGI_ENUM(DXGI_FORMAT_, YUY2); + D3D11_DXGI_ENUM(DXGI_FORMAT_, Y210); + D3D11_DXGI_ENUM(DXGI_FORMAT_, Y216); + D3D11_DXGI_ENUM(DXGI_FORMAT_, NV11); + D3D11_DXGI_ENUM(DXGI_FORMAT_, AI44); + D3D11_DXGI_ENUM(DXGI_FORMAT_, IA44); + D3D11_DXGI_ENUM(DXGI_FORMAT_, P8); + D3D11_DXGI_ENUM(DXGI_FORMAT_, A8P8); + D3D11_DXGI_ENUM(DXGI_FORMAT_, B4G4R4A4_UNORM); + D3D11_DXGI_ENUM(DXGI_FORMAT_, P208); + D3D11_DXGI_ENUM(DXGI_FORMAT_, V208); + D3D11_DXGI_ENUM(DXGI_FORMAT_, V408); + D3D11_DXGI_ENUM(DXGI_FORMAT_, FORCE_UINT); + } + + return "<unknown>"; +} + +const char *pl_get_dxgi_csp_name(DXGI_COLOR_SPACE_TYPE csp) +{ + switch ((int) csp) { + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G22_NONE_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G10_NONE_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G22_NONE_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G22_NONE_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RESERVED); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_NONE_P709_X601); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_LEFT_P601); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_LEFT_P601); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_LEFT_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_LEFT_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_LEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_G22_LEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G2084_NONE_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G2084_LEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G2084_NONE_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G22_TOPLEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G2084_TOPLEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_FULL_G22_NONE_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_GHLG_TOPLEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_FULL_GHLG_TOPLEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G24_NONE_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, RGB_STUDIO_G24_NONE_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G24_LEFT_P709); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G24_LEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, YCBCR_STUDIO_G24_TOPLEFT_P2020); + D3D11_DXGI_ENUM(DXGI_COLOR_SPACE_, CUSTOM); + } + + return "<unknown>"; +} diff --git a/src/d3d11/utils.h b/src/d3d11/utils.h new file mode 100644 index 0000000..86b4072 --- /dev/null +++ b/src/d3d11/utils.h @@ -0,0 +1,88 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+#define DXGI_COLOR_SPACE_RGB_STUDIO_G24_NONE_P709 ((DXGI_COLOR_SPACE_TYPE)20)
+#define DXGI_COLOR_SPACE_RGB_STUDIO_G24_NONE_P2020 ((DXGI_COLOR_SPACE_TYPE)21)
+#define DXGI_COLOR_SPACE_YCBCR_STUDIO_G24_LEFT_P709 ((DXGI_COLOR_SPACE_TYPE)22)
+#define DXGI_COLOR_SPACE_YCBCR_STUDIO_G24_LEFT_P2020 ((DXGI_COLOR_SPACE_TYPE)23)
+#define DXGI_COLOR_SPACE_YCBCR_STUDIO_G24_TOPLEFT_P2020 ((DXGI_COLOR_SPACE_TYPE)24)
+
+// Flush debug messages from D3D11's info queue to libplacebo's log output.
+// Should be called regularly.
+void pl_d3d11_flush_message_queue(struct d3d11_ctx *ctx, const char *header);
+
+// Some D3D11 functions can fail with a set of HRESULT codes which indicate the
+// device has been removed. This is equivalent to libplacebo's gpu_is_failed
+// state and indicates that the pl_gpu needs to be recreated. This function
+// checks for one of those HRESULTs, sets the failed state, and returns a
+// specific HRESULT that indicates why the device was removed (e.g. GPU hang,
+// driver crash, etc.)
+HRESULT pl_d3d11_check_device_removed(struct d3d11_ctx *ctx, HRESULT hr);
+
+// Helper function for the D3D() macro, though it can be called directly when
+// handling D3D11 errors if the D3D() macro isn't suitable for some reason.
+// Calls `pl_d3d11_check_device_removed` and `pl_d3d11_flush_message_queue`, and
+// returns the specific HRESULT from `pl_d3d11_check_device_removed` for logging
+// purposes.
+HRESULT pl_d3d11_after_error(struct d3d11_ctx *ctx, HRESULT hr);
+
+// Convenience macro for running DXGI/D3D11 functions and performing appropriate
+// actions on failure. Can also be used for any HRESULT-returning function.
+#define D3D(call) \
+    do { \
+        HRESULT hr_ = (call); \
+        if (FAILED(hr_)) { \
+            hr_ = pl_d3d11_after_error(ctx, hr_); \
+            PL_ERR(ctx, "%s: %s (%s:%d)", #call, pl_hresult_to_str(hr_), \
+                   __FILE__, __LINE__); \
+            goto error; \
+        } \
+    } while (0);
+
+// Conditionally release a COM interface and set the pointer to NULL
+#define SAFE_RELEASE(iface) \
+    do { \
+        if (iface) \
+            (iface)->lpVtbl->Release(iface); \
+        (iface) = NULL; \
+    } while (0)
+
+struct dll_version {
+    uint16_t major;
+    uint16_t minor;
+    uint16_t build;
+    uint16_t revision;
+};
+
+// Get the version number of a DLL. This calls GetFileVersionInfoW, which should
+// call LoadLibraryExW internally, so it should get the same copy of the DLL
+// that is loaded into memory if there is a copy in System32 and a copy in the
+// %PATH% or application directory.
+struct dll_version pl_get_dll_version(const wchar_t *name);
+
+wchar_t *pl_from_utf8(void *ctx, const char *str);
+char *pl_to_utf8(void *ctx, const wchar_t *str);
+
+#define pl_hresult_to_str(hr) pl_hresult_to_str_buf((char[256]){0}, 256, (hr))
+char *pl_hresult_to_str_buf(char *buf, size_t buf_size, HRESULT hr);
+
+const char *pl_get_dxgi_csp_name(DXGI_COLOR_SPACE_TYPE csp);
+const char *pl_get_dxgi_format_name(DXGI_FORMAT fmt);
diff --git a/src/dispatch.c b/src/dispatch.c
new file mode 100644
index 0000000..308dd56
--- /dev/null
+++ b/src/dispatch.c
@@ -0,0 +1,1615 @@
+/*
+ * This file is part of libplacebo.
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "common.h" +#include "log.h" +#include "shaders.h" +#include "dispatch.h" +#include "gpu.h" +#include "pl_thread.h" + +// Maximum number of passes to keep around at once. If full, passes older than +// MIN_AGE are evicted to make room. (Failing that, the passes array doubles) +#define MAX_PASSES 100 +#define MIN_AGE 10 + +enum { + TMP_PRELUDE, // GLSL version, global definitions, etc. + TMP_MAIN, // main GLSL shader body + TMP_VERT_HEAD, // vertex shader inputs/outputs + TMP_VERT_BODY, // vertex shader body + TMP_COUNT, +}; + +struct pl_dispatch_t { + pl_mutex lock; + pl_log log; + pl_gpu gpu; + uint8_t current_ident; + uint8_t current_index; + bool dynamic_constants; + int max_passes; + + void (*info_callback)(void *, const struct pl_dispatch_info *); + void *info_priv; + + PL_ARRAY(pl_shader) shaders; // to avoid re-allocations + PL_ARRAY(struct pass *) passes; // compiled passes + + // temporary buffers to help avoid re_allocations during pass creation + PL_ARRAY(const struct pl_buffer_var *) buf_tmp; + pl_str_builder tmp[TMP_COUNT]; + uint8_t *ubo_tmp; +}; + +enum pass_var_type { + PASS_VAR_NONE = 0, + PASS_VAR_GLOBAL, // regular/global uniforms + PASS_VAR_UBO, // uniform buffers + PASS_VAR_PUSHC // push constants +}; + +// Cached metadata about a variable's effective placement / update method +struct pass_var { + int index; // for pl_var_update + enum pass_var_type type; + struct pl_var_layout layout; + void *cached_data; +}; + +struct pass { + uint64_t signature; + pl_pass pass; + int last_index; + + // contains cached data and update metadata, same order as pl_shader + struct pass_var *vars; + int num_var_locs; + + // for uniform buffer updates + struct pl_shader_desc ubo_desc; // temporary + int ubo_index; + pl_buf ubo; + + // Cached pl_pass_run_params. 
This will also contain mutable allocations + // for the push constants, descriptor bindings (including the binding for + // the UBO pre-filled), vertex array and variable updates + struct pl_pass_run_params run_params; + + // for pl_dispatch_info + pl_timer timer; + uint64_t ts_last; + uint64_t ts_peak; + uint64_t ts_sum; + uint64_t samples[PL_ARRAY_SIZE(((struct pl_dispatch_info *) NULL)->samples)]; + int ts_idx; +}; + +static void pass_destroy(pl_dispatch dp, struct pass *pass) +{ + if (!pass) + return; + + pl_buf_destroy(dp->gpu, &pass->ubo); + pl_pass_destroy(dp->gpu, &pass->pass); + pl_timer_destroy(dp->gpu, &pass->timer); + pl_free(pass); +} + +pl_dispatch pl_dispatch_create(pl_log log, pl_gpu gpu) +{ + struct pl_dispatch_t *dp = pl_zalloc_ptr(NULL, dp); + pl_mutex_init(&dp->lock); + dp->log = log; + dp->gpu = gpu; + dp->max_passes = MAX_PASSES; + for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++) + dp->tmp[i] = pl_str_builder_alloc(dp); + + return dp; +} + +void pl_dispatch_destroy(pl_dispatch *ptr) +{ + pl_dispatch dp = *ptr; + if (!dp) + return; + + for (int i = 0; i < dp->passes.num; i++) + pass_destroy(dp, dp->passes.elem[i]); + for (int i = 0; i < dp->shaders.num; i++) + pl_shader_free(&dp->shaders.elem[i]); + + pl_mutex_destroy(&dp->lock); + pl_free(dp); + *ptr = NULL; +} + +pl_shader pl_dispatch_begin_ex(pl_dispatch dp, bool unique) +{ + pl_mutex_lock(&dp->lock); + + struct pl_shader_params params = { + .id = unique ? dp->current_ident++ : 0, + .gpu = dp->gpu, + .index = dp->current_index, + .dynamic_constants = dp->dynamic_constants, + }; + + pl_shader sh = NULL; + PL_ARRAY_POP(dp->shaders, &sh); + pl_mutex_unlock(&dp->lock); + + if (sh) { + pl_shader_reset(sh, ¶ms); + return sh; + } + + return pl_shader_alloc(dp->log, ¶ms); +} + +void pl_dispatch_mark_dynamic(pl_dispatch dp, bool dynamic) +{ + dp->dynamic_constants = dynamic; +} + +void pl_dispatch_callback(pl_dispatch dp, void *priv, + void (*cb)(void *priv, const struct pl_dispatch_info *)) +{ + dp->info_callback = cb; + dp->info_priv = priv; +} + +pl_shader pl_dispatch_begin(pl_dispatch dp) +{ + return pl_dispatch_begin_ex(dp, false); +} + +static bool add_pass_var(pl_dispatch dp, void *tmp, struct pass *pass, + struct pl_pass_params *params, + const struct pl_shader_var *sv, struct pass_var *pv, + bool greedy) +{ + pl_gpu gpu = dp->gpu; + if (pv->type) + return true; + + // Try not to use push constants for "large" values like matrices in the + // first pass, since this is likely to exceed the VGPR/pushc size budgets + bool try_pushc = greedy || (sv->var.dim_m == 1 && sv->var.dim_a == 1) || sv->dynamic; + if (try_pushc && gpu->glsl.vulkan && gpu->limits.max_pushc_size) { + pv->layout = pl_std430_layout(params->push_constants_size, &sv->var); + size_t new_size = pv->layout.offset + pv->layout.size; + if (new_size <= gpu->limits.max_pushc_size) { + params->push_constants_size = new_size; + pv->type = PASS_VAR_PUSHC; + return true; + } + } + + // If we haven't placed all PCs yet, don't place anything else, since + // we want to try and fit more stuff into PCs before "giving up" + if (!greedy) + return true; + + int num_locs = sv->var.dim_v * sv->var.dim_m * sv->var.dim_a; + bool can_var = pass->num_var_locs + num_locs <= gpu->limits.max_variable_comps; + + // Attempt using uniform buffer next. The GLSL version 440 check is due + // to explicit offsets on UBO entries. 
In theory we could leave away + // the offsets and support UBOs for older GL as well, but this is a nice + // safety net for driver bugs (and also rules out potentially buggy drivers) + // Also avoid UBOs for highly dynamic stuff since that requires synchronizing + // the UBO writes every frame + bool try_ubo = !can_var || !sv->dynamic; + if (try_ubo && gpu->glsl.version >= 440 && gpu->limits.max_ubo_size) { + if (sh_buf_desc_append(tmp, gpu, &pass->ubo_desc, &pv->layout, sv->var)) { + pv->type = PASS_VAR_UBO; + return true; + } + } + + // Otherwise, use global uniforms + if (can_var) { + pv->type = PASS_VAR_GLOBAL; + pv->index = params->num_variables; + pv->layout = pl_var_host_layout(0, &sv->var); + PL_ARRAY_APPEND_RAW(tmp, params->variables, params->num_variables, sv->var); + pass->num_var_locs += num_locs; + return true; + } + + // Ran out of variable binding methods. The most likely scenario in which + // this can happen is if we're using a GPU that does not support global + // input vars and we've exhausted the UBO size limits. + PL_ERR(dp, "Unable to add input variable: possibly exhausted " + "variable count / UBO size limits?"); + return false; +} + +#define ADD(b, ...) pl_str_builder_addf(b, __VA_ARGS__) +#define ADD_CAT(b, cat) pl_str_builder_concat(b, cat) +#define ADD_CONST(b, s) pl_str_builder_const_str(b, s) + +static void add_var(pl_str_builder body, const struct pl_var *var) +{ + const char *type = pl_var_glsl_type_name(*var); + if (var->dim_a > 1) { + ADD(body, "%s "$"[%d];\n", type, sh_ident_unpack(var->name), var->dim_a); + } else { + ADD(body, "%s "$";\n", type, sh_ident_unpack(var->name)); + } +} + +static int cmp_buffer_var(const void *pa, const void *pb) +{ + const struct pl_buffer_var * const *a = pa, * const *b = pb; + return PL_CMP((*a)->layout.offset, (*b)->layout.offset); +} + +static void add_buffer_vars(pl_dispatch dp, void *tmp, pl_str_builder body, + const struct pl_buffer_var *vars, int num) +{ + // Sort buffer vars by offset + PL_ARRAY_RESIZE(dp, dp->buf_tmp, num); + for (int i = 0; i < num; i++) + dp->buf_tmp.elem[i] = &vars[i]; + qsort(dp->buf_tmp.elem, num, sizeof(&vars[0]), cmp_buffer_var); + + ADD(body, "{\n"); + for (int i = 0; i < num; i++) { + const struct pl_buffer_var *bv = dp->buf_tmp.elem[i]; + // Add an explicit offset wherever possible + if (dp->gpu->glsl.version >= 440) + ADD(body, " layout(offset=%zu) ", bv->layout.offset); + add_var(body, &bv->var); + } + ADD(body, "};\n"); +} + +struct generate_params { + void *tmp; + pl_shader sh; + struct pass *pass; + struct pl_pass_params *pass_params; + ident_t out_mat; + ident_t out_off; + int vert_idx; +}; + +static void generate_shaders(pl_dispatch dp, + const struct generate_params *params, + pl_str_builder *out_vert_builder, + pl_str_builder *out_glsl_builder) +{ + pl_gpu gpu = dp->gpu; + pl_shader sh = params->sh; + void *tmp = params->tmp; + struct pass *pass = params->pass; + struct pl_pass_params *pass_params = params->pass_params; + pl_str_builder shader_body = sh_finalize_internal(sh); + + pl_str_builder pre = dp->tmp[TMP_PRELUDE]; + ADD(pre, "#version %d%s\n", gpu->glsl.version, + (gpu->glsl.gles && gpu->glsl.version > 100) ? 
" es" : ""); + if (pass_params->type == PL_PASS_COMPUTE) + ADD(pre, "#extension GL_ARB_compute_shader : enable\n"); + + // Enable this unconditionally if the GPU supports it, since we have no way + // of knowing whether subgroups are being used or not + if (gpu->glsl.subgroup_size) { + ADD(pre, "#extension GL_KHR_shader_subgroup_basic : enable \n" + "#extension GL_KHR_shader_subgroup_vote : enable \n" + "#extension GL_KHR_shader_subgroup_arithmetic : enable \n" + "#extension GL_KHR_shader_subgroup_ballot : enable \n" + "#extension GL_KHR_shader_subgroup_shuffle : enable \n" + "#extension GL_KHR_shader_subgroup_clustered : enable \n" + "#extension GL_KHR_shader_subgroup_quad : enable \n"); + } + + // Enable all extensions needed for different types of input + bool has_ssbo = false, has_ubo = false, has_img = false, has_texel = false, + has_ext = false, has_nofmt = false, has_gather = false; + for (int i = 0; i < sh->descs.num; i++) { + switch (sh->descs.elem[i].desc.type) { + case PL_DESC_BUF_UNIFORM: has_ubo = true; break; + case PL_DESC_BUF_STORAGE: has_ssbo = true; break; + case PL_DESC_BUF_TEXEL_UNIFORM: has_texel = true; break; + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = sh->descs.elem[i].binding.object; + has_nofmt |= !buf->params.format->glsl_format; + has_texel = true; + break; + } + case PL_DESC_STORAGE_IMG: { + pl_tex tex = sh->descs.elem[i].binding.object; + has_nofmt |= !tex->params.format->glsl_format; + has_img = true; + break; + } + case PL_DESC_SAMPLED_TEX: { + pl_tex tex = sh->descs.elem[i].binding.object; + has_gather |= tex->params.format->gatherable; + switch (tex->sampler_type) { + case PL_SAMPLER_NORMAL: break; + case PL_SAMPLER_RECT: break; + case PL_SAMPLER_EXTERNAL: has_ext = true; break; + case PL_SAMPLER_TYPE_COUNT: pl_unreachable(); + } + break; + } + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + + if (has_img) + ADD(pre, "#extension GL_ARB_shader_image_load_store : enable\n"); + if (has_ubo) + ADD(pre, "#extension GL_ARB_uniform_buffer_object : enable\n"); + if (has_ssbo) + ADD(pre, "#extension GL_ARB_shader_storage_buffer_object : enable\n"); + if (has_texel) + ADD(pre, "#extension GL_ARB_texture_buffer_object : enable\n"); + if (has_ext) { + if (gpu->glsl.version >= 300) { + ADD(pre, "#extension GL_OES_EGL_image_external_essl3 : enable\n"); + } else { + ADD(pre, "#extension GL_OES_EGL_image_external : enable\n"); + } + } + if (has_nofmt) + ADD(pre, "#extension GL_EXT_shader_image_load_formatted : enable\n"); + if (has_gather) + ADD(pre, "#extension GL_ARB_texture_gather : enable\n"); + + if (gpu->glsl.gles) { + // Use 32-bit precision for floats if possible + ADD(pre, "#ifdef GL_FRAGMENT_PRECISION_HIGH \n" + "precision highp float; \n" + "#else \n" + "precision mediump float; \n" + "#endif \n"); + + // Always use 16-bit precision for samplers + ADD(pre, "precision mediump sampler2D; \n"); + if (gpu->limits.max_tex_1d_dim) + ADD(pre, "precision mediump sampler1D; \n"); + if (gpu->limits.max_tex_3d_dim && gpu->glsl.version > 100) + ADD(pre, "precision mediump sampler3D; \n"); + + // Integer math has a good chance of caring about precision + ADD(pre, "precision highp int; \n"); + } + + // textureLod() doesn't work on external/rect samplers, simply disable + // LOD sampling in this case. We don't currently support mipmaps anyway. 
+ for (int i = 0; i < sh->descs.num; i++) { + if (pass_params->descriptors[i].type != PL_DESC_SAMPLED_TEX) + continue; + pl_tex tex = sh->descs.elem[i].binding.object; + if (tex->sampler_type != PL_SAMPLER_NORMAL) { + ADD(pre, "#define textureLod(t, p, b) texture(t, p) \n" + "#define textureLodOffset(t, p, b, o) \\\n" + " textureOffset(t, p, o) \n"); + break; + } + } + + // Add all of the push constants as their own element + if (pass_params->push_constants_size) { + // We re-use add_buffer_vars to make sure variables are sorted, this + // is important because the push constants can be out-of-order in + // `pass->vars` + PL_ARRAY(struct pl_buffer_var) pc_bvars = {0}; + for (int i = 0; i < sh->vars.num; i++) { + if (pass->vars[i].type != PASS_VAR_PUSHC) + continue; + + PL_ARRAY_APPEND(tmp, pc_bvars, (struct pl_buffer_var) { + .var = sh->vars.elem[i].var, + .layout = pass->vars[i].layout, + }); + } + + ADD(pre, "layout(std430, push_constant) uniform PushC "); + add_buffer_vars(dp, tmp, pre, pc_bvars.elem, pc_bvars.num); + } + + // Add all of the specialization constants + for (int i = 0; i < sh->consts.num; i++) { + static const char *types[PL_VAR_TYPE_COUNT] = { + [PL_VAR_SINT] = "int", + [PL_VAR_UINT] = "uint", + [PL_VAR_FLOAT] = "float", + }; + + const struct pl_shader_const *sc = &sh->consts.elem[i]; + ADD(pre, "layout(constant_id=%"PRIu32") const %s "$" = 1; \n", + pass_params->constants[i].id, types[sc->type], + sh_ident_unpack(sc->name)); + } + + static const char sampler_prefixes[PL_FMT_TYPE_COUNT] = { + [PL_FMT_FLOAT] = ' ', + [PL_FMT_UNORM] = ' ', + [PL_FMT_SNORM] = ' ', + [PL_FMT_UINT] = 'u', + [PL_FMT_SINT] = 'i', + }; + + // Add all of the required descriptors + for (int i = 0; i < sh->descs.num; i++) { + const struct pl_shader_desc *sd = &sh->descs.elem[i]; + const struct pl_desc *desc = &pass_params->descriptors[i]; + + switch (desc->type) { + case PL_DESC_SAMPLED_TEX: { + static const char *types[][4] = { + [PL_SAMPLER_NORMAL][1] = "sampler1D", + [PL_SAMPLER_NORMAL][2] = "sampler2D", + [PL_SAMPLER_NORMAL][3] = "sampler3D", + [PL_SAMPLER_RECT][2] = "sampler2DRect", + [PL_SAMPLER_EXTERNAL][2] = "samplerExternalOES", + }; + + pl_tex tex = sd->binding.object; + int dims = pl_tex_params_dimension(tex->params); + const char *type = types[tex->sampler_type][dims]; + char prefix = sampler_prefixes[tex->params.format->type]; + ident_t id = sh_ident_unpack(desc->name); + pl_assert(type && prefix); + + // Vulkan requires explicit bindings; GL always sets the + // bindings manually to avoid relying on the user doing so + if (gpu->glsl.vulkan) { + ADD(pre, "layout(binding=%d) uniform %c%s "$";\n", + desc->binding, prefix, type, id); + } else if (gpu->glsl.gles && prefix != ' ') { + ADD(pre, "uniform highp %c%s "$";\n", prefix, type, id); + } else { + ADD(pre, "uniform %c%s "$";\n", prefix, type, id); + } + break; + } + + case PL_DESC_STORAGE_IMG: { + static const char *types[] = { + [1] = "image1D", + [2] = "image2D", + [3] = "image3D", + }; + + // For better compatibility, we have to explicitly label the + // type of data we will be reading/writing to this image. 
+ pl_tex tex = sd->binding.object; + const char *format = tex->params.format->glsl_format; + int dims = pl_tex_params_dimension(tex->params); + if (gpu->glsl.vulkan) { + if (format) { + ADD(pre, "layout(binding=%d, %s) ", desc->binding, format); + } else { + ADD(pre, "layout(binding=%d) ", desc->binding); + } + } else if (format) { + ADD(pre, "layout(%s) ", format); + } + + ADD_CONST(pre, pl_desc_access_glsl_name(desc->access)); + if (sd->memory & PL_MEMORY_COHERENT) + ADD(pre, " coherent"); + if (sd->memory & PL_MEMORY_VOLATILE) + ADD(pre, " volatile"); + ADD(pre, " restrict uniform %s "$";\n", + types[dims], sh_ident_unpack(desc->name)); + break; + } + + case PL_DESC_BUF_UNIFORM: + if (gpu->glsl.vulkan) { + ADD(pre, "layout(std140, binding=%d) ", desc->binding); + } else { + ADD(pre, "layout(std140) "); + } + ADD(pre, "uniform "$" ", sh_ident_unpack(desc->name)); + add_buffer_vars(dp, tmp, pre, sd->buffer_vars, sd->num_buffer_vars); + break; + + case PL_DESC_BUF_STORAGE: + if (gpu->glsl.version >= 140) + ADD(pre, "layout(std430, binding=%d) ", desc->binding); + ADD_CONST(pre, pl_desc_access_glsl_name(desc->access)); + if (sd->memory & PL_MEMORY_COHERENT) + ADD(pre, " coherent"); + if (sd->memory & PL_MEMORY_VOLATILE) + ADD(pre, " volatile"); + ADD(pre, " restrict buffer "$" ", sh_ident_unpack(desc->name)); + add_buffer_vars(dp, tmp, pre, sd->buffer_vars, sd->num_buffer_vars); + break; + + case PL_DESC_BUF_TEXEL_UNIFORM: { + pl_buf buf = sd->binding.object; + char prefix = sampler_prefixes[buf->params.format->type]; + if (gpu->glsl.vulkan) + ADD(pre, "layout(binding=%d) ", desc->binding); + ADD(pre, "uniform %csamplerBuffer "$";\n", prefix, + sh_ident_unpack(desc->name)); + break; + } + + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = sd->binding.object; + const char *format = buf->params.format->glsl_format; + char prefix = sampler_prefixes[buf->params.format->type]; + if (gpu->glsl.vulkan) { + if (format) { + ADD(pre, "layout(binding=%d, %s) ", desc->binding, format); + } else { + ADD(pre, "layout(binding=%d) ", desc->binding); + } + } else if (format) { + ADD(pre, "layout(%s) ", format); + } + + ADD_CONST(pre, pl_desc_access_glsl_name(desc->access)); + if (sd->memory & PL_MEMORY_COHERENT) + ADD(pre, " coherent"); + if (sd->memory & PL_MEMORY_VOLATILE) + ADD(pre, " volatile"); + ADD(pre, " restrict uniform %cimageBuffer "$";\n", + prefix, sh_ident_unpack(desc->name)); + break; + } + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + + // Add all of the remaining variables + for (int i = 0; i < sh->vars.num; i++) { + const struct pl_var *var = &sh->vars.elem[i].var; + const struct pass_var *pv = &pass->vars[i]; + if (pv->type != PASS_VAR_GLOBAL) + continue; + ADD(pre, "uniform "); + add_var(pre, var); + } + + pl_str_builder glsl = dp->tmp[TMP_MAIN]; + ADD_CAT(glsl, pre); + + switch(pass_params->type) { + case PL_PASS_RASTER: { + pl_assert(params->vert_idx >= 0); + pl_str_builder vert_head = dp->tmp[TMP_VERT_HEAD]; + pl_str_builder vert_body = dp->tmp[TMP_VERT_BODY]; + + // Older GLSL doesn't support the use of explicit locations + bool has_loc = gpu->glsl.version >= 430; + + // Set up a trivial vertex shader + ADD_CAT(vert_head, pre); + ADD(vert_body, "void main() {\n"); + for (int i = 0; i < sh->vas.num; i++) { + const struct pl_vertex_attrib *va = &pass_params->vertex_attribs[i]; + const struct pl_shader_va *sva = &sh->vas.elem[i]; + const char *type = va->fmt->glsl_type; + + // Use the pl_shader_va for the name in the fragment shader since + // the 
pl_vertex_attrib is already mangled for the vertex shader + ident_t id = sh_ident_unpack(sva->attr.name); + + if (has_loc) { + ADD(vert_head, "layout(location=%d) in %s "$";\n", + va->location, type, sh_ident_unpack(va->name)); + } else { + ADD(vert_head, "in %s "$";\n", type, sh_ident_unpack(va->name)); + } + + if (i == params->vert_idx) { + pl_assert(va->fmt->num_components == 2); + ADD(vert_body, "vec2 va_pos = "$"; \n", sh_ident_unpack(va->name)); + if (params->out_mat) + ADD(vert_body, "va_pos = "$" * va_pos; \n", params->out_mat); + if (params->out_off) + ADD(vert_body, "va_pos += "$"; \n", params->out_off); + ADD(vert_body, "gl_Position = vec4(va_pos, 0.0, 1.0); \n"); + } else { + // Everything else is just blindly passed through + if (has_loc) { + ADD(vert_head, "layout(location=%d) out %s "$";\n", + va->location, type, id); + ADD(glsl, "layout(location=%d) in %s "$";\n", + va->location, type, id); + } else { + ADD(vert_head, "out %s "$";\n", type, id); + ADD(glsl, "in %s "$";\n", type, id); + } + ADD(vert_body, $" = "$";\n", id, sh_ident_unpack(va->name)); + } + } + + ADD(vert_body, "}"); + ADD_CAT(vert_head, vert_body); + pl_hash_merge(&pass->signature, pl_str_builder_hash(vert_head)); + *out_vert_builder = vert_head; + + if (has_loc) { + ADD(glsl, "layout(location=0) out vec4 out_color;\n"); + } else { + ADD(glsl, "out vec4 out_color;\n"); + } + break; + } + case PL_PASS_COMPUTE: + ADD(glsl, "layout (local_size_x = %d, local_size_y = %d) in;\n", + sh->group_size[0], sh->group_size[1]); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + // Set up the main shader body + ADD_CAT(glsl, shader_body); + ADD(glsl, "void main() {\n"); + + pl_assert(sh->input == PL_SHADER_SIG_NONE); + switch (pass_params->type) { + case PL_PASS_RASTER: + pl_assert(sh->output == PL_SHADER_SIG_COLOR); + ADD(glsl, "out_color = "$"();\n", sh->name); + break; + case PL_PASS_COMPUTE: + ADD(glsl, $"();\n", sh->name); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + ADD(glsl, "}"); + + pl_hash_merge(&pass->signature, pl_str_builder_hash(glsl)); + *out_glsl_builder = glsl; +} + +#undef ADD +#undef ADD_CAT + +#define pass_age(pass) (dp->current_index - (pass)->last_index) + +static int cmp_pass_age(const void *ptra, const void *ptrb) +{ + const struct pass *a = *(const struct pass **) ptra; + const struct pass *b = *(const struct pass **) ptrb; + return b->last_index - a->last_index; +} + +static void garbage_collect_passes(pl_dispatch dp) +{ + if (dp->passes.num <= dp->max_passes) + return; + + // Garbage collect oldest passes, starting at the middle + qsort(dp->passes.elem, dp->passes.num, sizeof(struct pass *), cmp_pass_age); + int idx = dp->passes.num / 2; + while (idx < dp->passes.num && pass_age(dp->passes.elem[idx]) < MIN_AGE) + idx++; + + for (int i = idx; i < dp->passes.num; i++) + pass_destroy(dp, dp->passes.elem[i]); + + int num_evicted = dp->passes.num - idx; + dp->passes.num = idx; + + if (num_evicted) { + PL_DEBUG(dp, "Evicted %d passes from dispatch cache, consider " + "using more dynamic shaders", num_evicted); + } else { + dp->max_passes *= 2; + } +} + +static struct pass *finalize_pass(pl_dispatch dp, pl_shader sh, + pl_tex target, int vert_idx, + const struct pl_blend_params *blend, bool load, + const struct pl_dispatch_vertex_params *vparams, + const pl_transform2x2 *proj) +{ + struct pass *pass = pl_alloc_ptr(dp, pass); + *pass = (struct pass) { + .signature = 0x0, // updated incrementally below + .last_index = 
dp->current_index, + .ubo_desc = { + .desc = { + .name = sh_ident_pack(sh_fresh(sh, "UBO")), + .type = PL_DESC_BUF_UNIFORM, + }, + }, + }; + + // For identifiers tied to the lifetime of this shader + void *tmp = sh->tmp; + + struct pl_pass_params params = { + .type = pl_shader_is_compute(sh) ? PL_PASS_COMPUTE : PL_PASS_RASTER, + .num_descriptors = sh->descs.num, + .vertex_type = vparams ? vparams->vertex_type : PL_PRIM_TRIANGLE_STRIP, + .vertex_stride = vparams ? vparams->vertex_stride : 0, + .blend_params = blend, + }; + + struct generate_params gen_params = { + .tmp = tmp, + .pass = pass, + .pass_params = ¶ms, + .sh = sh, + .vert_idx = vert_idx, + }; + + if (params.type == PL_PASS_RASTER) { + assert(target); + params.target_format = target->params.format; + params.load_target = load; + + // Fill in the vertex attributes array + params.num_vertex_attribs = sh->vas.num; + params.vertex_attribs = pl_calloc_ptr(tmp, sh->vas.num, params.vertex_attribs); + + int va_loc = 0; + for (int i = 0; i < sh->vas.num; i++) { + struct pl_vertex_attrib *va = ¶ms.vertex_attribs[i]; + *va = sh->vas.elem[i].attr; + + // Mangle the name to make sure it doesn't conflict with the + // fragment shader input, this will be converted back to a legal + // string by the shader compilation code + va->name = sh_ident_pack(sh_fresh(sh, "va")); + + // Place the vertex attribute + va->location = va_loc; + if (!vparams) { + va->offset = params.vertex_stride; + params.vertex_stride += va->fmt->texel_size; + } + + // The number of vertex attribute locations consumed by a vertex + // attribute is the number of vec4s it consumes, rounded up + const size_t va_loc_size = sizeof(float[4]); + va_loc += PL_DIV_UP(va->fmt->texel_size, va_loc_size); + } + + // Hash in the raster state configuration + pl_hash_merge(&pass->signature, (uint64_t) params.vertex_type); + pl_hash_merge(&pass->signature, (uint64_t) params.vertex_stride); + pl_hash_merge(&pass->signature, (uint64_t) params.load_target); + pl_hash_merge(&pass->signature, target->params.format->signature); + if (blend) { + pl_static_assert(sizeof(*blend) == sizeof(enum pl_blend_mode) * 4); + pl_hash_merge(&pass->signature, pl_var_hash(*blend)); + } + + // Load projection matrix if required + if (proj && memcmp(&proj->mat, &pl_matrix2x2_identity, sizeof(proj->mat)) != 0) { + gen_params.out_mat = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat2("proj"), + .data = PL_TRANSPOSE_2X2(proj->mat.m), + }); + } + + if (proj && (proj->c[0] || proj->c[1])) { + gen_params.out_off = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("offset"), + .data = proj->c, + }); + } + } + + // Place all of the compile-time constants + uint8_t *constant_data = NULL; + if (sh->consts.num) { + params.num_constants = sh->consts.num; + params.constants = pl_alloc(tmp, sh->consts.num * sizeof(struct pl_constant)); + + // Compute offsets + size_t total_size = 0; + uint32_t const_id = 0; + for (int i = 0; i < sh->consts.num; i++) { + params.constants[i] = (struct pl_constant) { + .type = sh->consts.elem[i].type, + .id = const_id++, + .offset = total_size, + }; + total_size += pl_var_type_size(sh->consts.elem[i].type); + } + + // Write values into the constants buffer + params.constant_data = constant_data = pl_alloc(pass, total_size); + for (int i = 0; i < sh->consts.num; i++) { + const struct pl_shader_const *sc = &sh->consts.elem[i]; + void *data = constant_data + params.constants[i].offset; + memcpy(data, sc->data, pl_var_type_size(sc->type)); + } + } + + // Place all the variables; these will 
dynamically end up in different + // locations based on what the underlying GPU supports (UBOs, pushc, etc.) + // + // We go through the list twice, once to place stuff that we definitely + // want inside PCs, and then a second time to opportunistically place the rest. + pass->vars = pl_calloc_ptr(pass, sh->vars.num, pass->vars); + for (int i = 0; i < sh->vars.num; i++) { + if (!add_pass_var(dp, tmp, pass, ¶ms, &sh->vars.elem[i], &pass->vars[i], false)) + goto error; + } + for (int i = 0; i < sh->vars.num; i++) { + if (!add_pass_var(dp, tmp, pass, ¶ms, &sh->vars.elem[i], &pass->vars[i], true)) + goto error; + } + + // Now that we know the variable placement, finalize pushc/UBO sizes + params.push_constants_size = PL_ALIGN2(params.push_constants_size, 4); + size_t ubo_size = sh_buf_desc_size(&pass->ubo_desc); + if (ubo_size) { + pass->ubo_index = sh->descs.num; + PL_ARRAY_APPEND(sh, sh->descs, pass->ubo_desc); // don't mangle names + }; + + // Place and fill in the descriptors + const int num_descs = sh->descs.num; + int binding[PL_DESC_TYPE_COUNT] = {0}; + params.num_descriptors = num_descs; + params.descriptors = pl_calloc_ptr(tmp, num_descs, params.descriptors); + for (int i = 0; i < num_descs; i++) { + struct pl_desc *desc = ¶ms.descriptors[i]; + *desc = sh->descs.elem[i].desc; + desc->binding = binding[pl_desc_namespace(dp->gpu, desc->type)]++; + } + + // Finalize the shader and look it up in the pass cache + pl_str_builder vert_builder = NULL, glsl_builder = NULL; + generate_shaders(dp, &gen_params, &vert_builder, &glsl_builder); + for (int i = 0; i < dp->passes.num; i++) { + struct pass *p = dp->passes.elem[i]; + if (p->signature != pass->signature) + continue; + + // Found existing shader, re-use directly + if (p->ubo) + sh->descs.elem[p->ubo_index].binding.object = p->ubo; + pl_free(p->run_params.constant_data); + p->run_params.constant_data = pl_steal(p, constant_data); + p->last_index = dp->current_index; + pl_free(pass); + return p; + } + + // Need to compile new shader, execute templates now + if (vert_builder) { + pl_str vert = pl_str_builder_exec(vert_builder); + params.vertex_shader = (char *) vert.buf; + } + pl_str glsl = pl_str_builder_exec(glsl_builder); + params.glsl_shader = (char *) glsl.buf; + + // Turn all shader identifiers into actual strings before passing it + // to the `pl_gpu` +#define FIX_IDENT(name) \ + name = sh_ident_tostr(sh_ident_unpack(name)) + for (int i = 0; i < params.num_variables; i++) + FIX_IDENT(params.variables[i].name); + for (int i = 0; i < params.num_descriptors; i++) + FIX_IDENT(params.descriptors[i].name); + for (int i = 0; i < params.num_vertex_attribs; i++) + FIX_IDENT(params.vertex_attribs[i].name); +#undef FIX_IDENT + + pass->pass = pl_pass_create(dp->gpu, ¶ms); + if (!pass->pass) { + PL_ERR(dp, "Failed creating render pass for dispatch"); + // Add it anyway + } + + struct pl_pass_run_params *rparams = &pass->run_params; + rparams->pass = pass->pass; + rparams->constant_data = constant_data; + rparams->push_constants = pl_zalloc(pass, params.push_constants_size); + rparams->desc_bindings = pl_calloc_ptr(pass, params.num_descriptors, + rparams->desc_bindings); + + if (ubo_size && pass->pass) { + // Create the UBO + pass->ubo = pl_buf_create(dp->gpu, pl_buf_params( + .size = ubo_size, + .uniform = true, + .host_writable = true, + )); + + if (!pass->ubo) { + PL_ERR(dp, "Failed creating uniform buffer for dispatch"); + goto error; + } + + sh->descs.elem[pass->ubo_index].binding.object = pass->ubo; + } + + if (params.type == PL_PASS_RASTER && 
!vparams) { + // Generate the vertex array placeholder + rparams->vertex_count = 4; // single quad + size_t vert_size = rparams->vertex_count * params.vertex_stride; + rparams->vertex_data = pl_zalloc(pass, vert_size); + } + + pass->timer = pl_timer_create(dp->gpu); + + PL_ARRAY_APPEND(dp, dp->passes, pass); + return pass; + +error: + pass_destroy(dp, pass); + return NULL; +} + +static void update_pass_var(pl_dispatch dp, struct pass *pass, + const struct pl_shader_var *sv, struct pass_var *pv) +{ + struct pl_var_layout host_layout = pl_var_host_layout(0, &sv->var); + pl_assert(host_layout.size); + + // Use the cache to skip updates if possible + if (pv->cached_data && !memcmp(sv->data, pv->cached_data, host_layout.size)) + return; + if (!pv->cached_data) + pv->cached_data = pl_alloc(pass, host_layout.size); + memcpy(pv->cached_data, sv->data, host_layout.size); + + struct pl_pass_run_params *rparams = &pass->run_params; + switch (pv->type) { + case PASS_VAR_NONE: + pl_unreachable(); + case PASS_VAR_GLOBAL: { + struct pl_var_update vu = { + .index = pv->index, + .data = sv->data, + }; + PL_ARRAY_APPEND_RAW(pass, rparams->var_updates, rparams->num_var_updates, vu); + break; + } + case PASS_VAR_UBO: { + pl_assert(pass->ubo); + const size_t offset = pv->layout.offset; + if (host_layout.stride == pv->layout.stride) { + pl_assert(host_layout.size == pv->layout.size); + pl_buf_write(dp->gpu, pass->ubo, offset, sv->data, host_layout.size); + } else { + // Coalesce strided UBO write into a single pl_buf_write to avoid + // unnecessary synchronization overhead by assembling the correctly + // strided upload in RAM + pl_grow(dp, &dp->ubo_tmp, pv->layout.size); + uint8_t * const tmp = dp->ubo_tmp; + const uint8_t *src = sv->data; + const uint8_t *end = src + host_layout.size; + uint8_t *dst = tmp; + while (src < end) { + memcpy(dst, src, host_layout.stride); + src += host_layout.stride; + dst += pv->layout.stride; + } + pl_buf_write(dp->gpu, pass->ubo, offset, tmp, pv->layout.size); + } + break; + } + case PASS_VAR_PUSHC: + pl_assert(rparams->push_constants); + memcpy_layout(rparams->push_constants, pv->layout, sv->data, host_layout); + break; + }; +} + +static void compute_vertex_attribs(pl_dispatch dp, pl_shader sh, + int width, int height, ident_t *out_scale) +{ + // Simulate vertex attributes using global definitions + *out_scale = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("out_scale"), + .data = &(float[2]){ 1.0 / width, 1.0 / height }, + .dynamic = true, + }); + + GLSLP("#define frag_pos(id) (vec2(id) + vec2(0.5)) \n" + "#define frag_map(id) ("$" * frag_pos(id)) \n" + "#define gl_FragCoord vec4(frag_pos(gl_GlobalInvocationID), 0.0, 1.0) \n", + *out_scale); + + for (int n = 0; n < sh->vas.num; n++) { + const struct pl_shader_va *sva = &sh->vas.elem[n]; + + ident_t points[4]; + for (int i = 0; i < PL_ARRAY_SIZE(points); i++) { + points[i] = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_from_fmt(sva->attr.fmt, "pt"), + .data = sva->data[i], + }); + } + + GLSLP("#define "$"_map(id) " + "(mix(mix("$", "$", frag_map(id).x), " + " mix("$", "$", frag_map(id).x), " + "frag_map(id).y)) \n" + "#define "$" ("$"_map(gl_GlobalInvocationID)) \n", + sh_ident_unpack(sva->attr.name), + points[0], points[1], points[2], points[3], + sh_ident_unpack(sva->attr.name), + sh_ident_unpack(sva->attr.name)); + } +} + +static void translate_compute_shader(pl_dispatch dp, pl_shader sh, + const pl_rect2d *rc, + const struct pl_dispatch_params *params) +{ + int width = abs(pl_rect_w(*rc)), height = 
abs(pl_rect_h(*rc)); + if (sh->transpose) + PL_SWAP(width, height); + ident_t out_scale; + compute_vertex_attribs(dp, sh, width, height, &out_scale); + + // Simulate a framebuffer using storage images + pl_assert(params->target->params.storable); + pl_assert(sh->output == PL_SHADER_SIG_COLOR); + ident_t fbo = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->target, + .desc = { + .name = "out_image", + .type = PL_DESC_STORAGE_IMG, + .access = params->blend_params ? PL_DESC_ACCESS_READWRITE + : PL_DESC_ACCESS_WRITEONLY, + }, + }); + + ident_t base = sh_var(sh, (struct pl_shader_var) { + .data = &(int[2]){ rc->x0, rc->y0 }, + .dynamic = true, + .var = { + .name = "base", + .type = PL_VAR_SINT, + .dim_v = 2, + .dim_m = 1, + .dim_a = 1, + }, + }); + + int dx = rc->x0 > rc->x1 ? -1 : 1, dy = rc->y0 > rc->y1 ? -1 : 1; + GLSL("ivec2 dir = ivec2(%d, %d);\n", dx, dy); // hard-code, not worth var + GLSL("ivec2 pos = "$" + dir * ivec2(gl_GlobalInvocationID).%c%c;\n", + base, sh->transpose ? 'y' : 'x', sh->transpose ? 'x' : 'y'); + GLSL("vec2 fpos = "$" * vec2(gl_GlobalInvocationID);\n", out_scale); + GLSL("if (fpos.x < 1.0 && fpos.y < 1.0) {\n"); + if (params->blend_params) { + GLSL("vec4 orig = imageLoad("$", pos);\n", fbo); + + static const char *modes[] = { + [PL_BLEND_ZERO] = "0.0", + [PL_BLEND_ONE] = "1.0", + [PL_BLEND_SRC_ALPHA] = "color.a", + [PL_BLEND_ONE_MINUS_SRC_ALPHA] = "(1.0 - color.a)", + }; + + GLSL("color = vec4(color.rgb * vec3(%s), color.a * %s) \n" + " + vec4(orig.rgb * vec3(%s), orig.a * %s);\n", + modes[params->blend_params->src_rgb], + modes[params->blend_params->src_alpha], + modes[params->blend_params->dst_rgb], + modes[params->blend_params->dst_alpha]); + } + GLSL("imageStore("$", pos, color);\n", fbo); + GLSL("}\n"); + sh->output = PL_SHADER_SIG_NONE; +} + +static void run_pass(pl_dispatch dp, pl_shader sh, struct pass *pass) +{ + pl_shader_info shader = &sh->info->info; + pl_pass_run(dp->gpu, &pass->run_params); + + for (uint64_t ts; (ts = pl_timer_query(dp->gpu, pass->timer));) { + PL_TRACE(dp, "Spent %.3f ms on shader: %s", ts / 1e6, shader->description); + + uint64_t old = pass->samples[pass->ts_idx]; + pass->samples[pass->ts_idx] = ts; + pass->ts_last = ts; + pass->ts_peak = PL_MAX(pass->ts_peak, ts); + pass->ts_sum += ts; + pass->ts_idx = (pass->ts_idx + 1) % PL_ARRAY_SIZE(pass->samples); + + if (old) { + pass->ts_sum -= old; + if (old == pass->ts_peak) { + uint64_t new_peak = 0; + for (int i = 0; i < PL_ARRAY_SIZE(pass->samples); i++) + new_peak = PL_MAX(new_peak, pass->samples[i]); + pass->ts_peak = new_peak; + } + } + } + + if (!dp->info_callback) + return; + + struct pl_dispatch_info info; + info.signature = pass->signature; + info.shader = shader; + + // Test to see if the ring buffer already wrapped around once + if (pass->samples[pass->ts_idx]) { + info.num_samples = PL_ARRAY_SIZE(pass->samples); + int num_wrapped = info.num_samples - pass->ts_idx; + memcpy(info.samples, &pass->samples[pass->ts_idx], + num_wrapped * sizeof(info.samples[0])); + memcpy(&info.samples[num_wrapped], pass->samples, + pass->ts_idx * sizeof(info.samples[0])); + } else { + info.num_samples = pass->ts_idx; + memcpy(info.samples, pass->samples, + pass->ts_idx * sizeof(info.samples[0])); + } + + info.last = pass->ts_last; + info.peak = pass->ts_peak; + info.average = pass->ts_sum / PL_MAX(info.num_samples, 1); + dp->info_callback(dp->info_priv, &info); +} + +bool pl_dispatch_finish(pl_dispatch dp, const struct pl_dispatch_params *params) +{ + pl_shader sh = *params->shader; 
+    bool ret = false;
+    pl_mutex_lock(&dp->lock);
+
+    if (sh->failed) {
+        PL_ERR(sh, "Trying to dispatch a failed shader.");
+        goto error;
+    }
+
+    if (!sh->mutable) {
+        PL_ERR(dp, "Trying to dispatch non-mutable shader?");
+        goto error;
+    }
+
+    if (sh->input != PL_SHADER_SIG_NONE || sh->output != PL_SHADER_SIG_COLOR) {
+        PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
+        goto error;
+    }
+
+    const struct pl_tex_params *tpars = &params->target->params;
+    if (pl_tex_params_dimension(*tpars) != 2 || !tpars->renderable) {
+        PL_ERR(dp, "Trying to dispatch a shader using an invalid target "
+               "texture. The target must be a renderable 2D texture.");
+        goto error;
+    }
+
+    const struct pl_gpu_limits *limits = &dp->gpu->limits;
+    bool can_compute = tpars->storable;
+    if (can_compute && params->blend_params)
+        can_compute = tpars->format->caps & PL_FMT_CAP_READWRITE;
+
+    if (pl_shader_is_compute(sh) && !can_compute) {
+        PL_ERR(dp, "Trying to dispatch using a compute shader with a "
+               "non-storable or incompatible target texture.");
+        goto error;
+    } else if (can_compute && limits->compute_queues > limits->fragment_queues) {
+        if (sh_try_compute(sh, 16, 16, true, 0))
+            PL_TRACE(dp, "Upgrading fragment shader to compute shader.");
+    }
+
+    pl_rect2d rc = params->rect;
+    if (!pl_rect_w(rc)) {
+        rc.x0 = 0;
+        rc.x1 = tpars->w;
+    }
+    if (!pl_rect_h(rc)) {
+        rc.y0 = 0;
+        rc.y1 = tpars->h;
+    }
+
+    int w, h, tw = abs(pl_rect_w(rc)), th = abs(pl_rect_h(rc));
+    if (pl_shader_output_size(sh, &w, &h) && (w != tw || h != th))
+    {
+        PL_ERR(dp, "Trying to dispatch a shader with explicit output size "
+               "requirements %dx%d%s using a target rect of size %dx%d.",
+               w, h, sh->transpose ? " (transposed)" : "", tw, th);
+        goto error;
+    }
+
+    int vert_idx = -1;
+    const pl_transform2x2 *proj = NULL;
+    if (pl_shader_is_compute(sh)) {
+        // Translate the compute shader to simulate vertices etc.
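// For reference (illustrative sketch only, not upstream code): the call below
// emits GLSL roughly of the following shape, where `base`, `out_scale` and
// `out_image` stand in for the generated variable/descriptor identifiers and
// blending is left out:
//
//     ivec2 dir = ivec2(1, 1);   // rect direction, hard-coded per pass
//     ivec2 pos = base + dir * ivec2(gl_GlobalInvocationID).xy;
//     vec2 fpos = out_scale * vec2(gl_GlobalInvocationID);
//     if (fpos.x < 1.0 && fpos.y < 1.0) {
//         imageStore(out_image, pos, color);
//     }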
+ translate_compute_shader(dp, sh, &rc, params); + } else { + // Add the vertex information encoding the position + pl_rect2df vert_rect = { + .x0 = 2.0 * rc.x0 / tpars->w - 1.0, + .y0 = 2.0 * rc.y0 / tpars->h - 1.0, + .x1 = 2.0 * rc.x1 / tpars->w - 1.0, + .y1 = 2.0 * rc.y1 / tpars->h - 1.0, + }; + + if (sh->transpose) { + static const pl_transform2x2 transpose_proj = {{{ + { 0, 1 }, + { 1, 0 }, + }}}; + proj = &transpose_proj; + PL_SWAP(vert_rect.x0, vert_rect.y0); + PL_SWAP(vert_rect.x1, vert_rect.y1); + } + + sh_attr_vec2(sh, "position", &vert_rect); + vert_idx = sh->vas.num - 1; + } + + // We need to set pl_pass_params.load_target when either blending is + // enabled or we're drawing to some scissored sub-rect of the texture + pl_rect2d full = { 0, 0, tpars->w, tpars->h }; + pl_rect2d rc_norm = rc; + pl_rect2d_normalize(&rc_norm); + rc_norm.x0 = PL_MAX(rc_norm.x0, 0); + rc_norm.y0 = PL_MAX(rc_norm.y0, 0); + rc_norm.x1 = PL_MIN(rc_norm.x1, tpars->w); + rc_norm.y1 = PL_MIN(rc_norm.y1, tpars->h); + bool load = params->blend_params || !pl_rect2d_eq(rc_norm, full); + + struct pass *pass = finalize_pass(dp, sh, params->target, vert_idx, + params->blend_params, load, NULL, proj); + + // Silently return on failed passes + if (!pass || !pass->pass) + goto error; + + struct pl_pass_run_params *rparams = &pass->run_params; + + // Update the descriptor bindings + for (int i = 0; i < sh->descs.num; i++) + rparams->desc_bindings[i] = sh->descs.elem[i].binding; + + // Update all of the variables (if needed) + rparams->num_var_updates = 0; + for (int i = 0; i < sh->vars.num; i++) + update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]); + + // Update the vertex data + if (rparams->vertex_data) { + uintptr_t vert_base = (uintptr_t) rparams->vertex_data; + size_t stride = rparams->pass->params.vertex_stride; + for (int i = 0; i < sh->vas.num; i++) { + const struct pl_shader_va *sva = &sh->vas.elem[i]; + struct pl_vertex_attrib *va = &rparams->pass->params.vertex_attribs[i]; + + size_t size = sva->attr.fmt->texel_size; + uintptr_t va_base = vert_base + va->offset; // use placed offset + for (int n = 0; n < 4; n++) + memcpy((void *) (va_base + n * stride), sva->data[n], size); + } + } + + // For compute shaders: also update the dispatch dimensions + if (pl_shader_is_compute(sh)) { + int width = abs(pl_rect_w(rc)), + height = abs(pl_rect_h(rc)); + if (sh->transpose) + PL_SWAP(width, height); + // Round up to make sure we don't leave off a part of the target + int block_w = sh->group_size[0], + block_h = sh->group_size[1], + num_x = PL_DIV_UP(width, block_w), + num_y = PL_DIV_UP(height, block_h); + + rparams->compute_groups[0] = num_x; + rparams->compute_groups[1] = num_y; + rparams->compute_groups[2] = 1; + } else { + // Update the scissors for performance + rparams->scissors = rc_norm; + } + + // Dispatch the actual shader + rparams->target = params->target; + rparams->timer = PL_DEF(params->timer, pass->timer); + run_pass(dp, sh, pass); + + ret = true; + // fall through + +error: + // Reset the temporary buffers which we use to build the shader + for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++) + pl_str_builder_reset(dp->tmp[i]); + + pl_mutex_unlock(&dp->lock); + pl_dispatch_abort(dp, params->shader); + return ret; +} + +bool pl_dispatch_compute(pl_dispatch dp, const struct pl_dispatch_compute_params *params) +{ + pl_shader sh = *params->shader; + bool ret = false; + pl_mutex_lock(&dp->lock); + + if (sh->failed) { + PL_ERR(sh, "Trying to dispatch a failed shader."); + goto error; + } + + if 
(!sh->mutable) {
+        PL_ERR(dp, "Trying to dispatch non-mutable shader?");
+        goto error;
+    }
+
+    if (sh->input != PL_SHADER_SIG_NONE) {
+        PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
+        goto error;
+    }
+
+    if (!pl_shader_is_compute(sh)) {
+        PL_ERR(dp, "Trying to dispatch a non-compute shader using "
+               "`pl_dispatch_compute`!");
+        goto error;
+    }
+
+    if (sh->vas.num) {
+        if (!params->width || !params->height) {
+            PL_ERR(dp, "Trying to dispatch a targetless compute shader that "
+                   "uses vertex attributes, this requires specifying the size "
+                   "of the effective rendering area!");
+            goto error;
+        }
+
+        compute_vertex_attribs(dp, sh, params->width, params->height,
+                               &(ident_t){0});
+    }
+
+    struct pass *pass = finalize_pass(dp, sh, NULL, -1, NULL, false, NULL, NULL);
+
+    // Silently return on failed passes
+    if (!pass || !pass->pass)
+        goto error;
+
+    struct pl_pass_run_params *rparams = &pass->run_params;
+
+    // Update the descriptor bindings
+    for (int i = 0; i < sh->descs.num; i++)
+        rparams->desc_bindings[i] = sh->descs.elem[i].binding;
+
+    // Update all of the variables (if needed)
+    rparams->num_var_updates = 0;
+    for (int i = 0; i < sh->vars.num; i++)
+        update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]);
+
+    // Update the dispatch size
+    int groups = 1;
+    for (int i = 0; i < 3; i++) {
+        groups *= params->dispatch_size[i];
+        rparams->compute_groups[i] = params->dispatch_size[i];
+    }
+
+    if (!groups) {
+        pl_assert(params->width && params->height);
+        int block_w = sh->group_size[0],
+            block_h = sh->group_size[1],
+            num_x   = PL_DIV_UP(params->width, block_w),
+            num_y   = PL_DIV_UP(params->height, block_h);
+
+        rparams->compute_groups[0] = num_x;
+        rparams->compute_groups[1] = num_y;
+        rparams->compute_groups[2] = 1;
+    }
+
+    // Dispatch the actual shader
+    rparams->timer = PL_DEF(params->timer, pass->timer);
+    run_pass(dp, sh, pass);
+
+    ret = true;
+    // fall through
+
+error:
+    // Reset the temporary buffers which we use to build the shader
+    for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++)
+        pl_str_builder_reset(dp->tmp[i]);
+
+    pl_mutex_unlock(&dp->lock);
+    pl_dispatch_abort(dp, params->shader);
+    return ret;
+}
+
+bool pl_dispatch_vertex(pl_dispatch dp, const struct pl_dispatch_vertex_params *params)
+{
+    pl_shader sh = *params->shader;
+    bool ret = false;
+    pl_mutex_lock(&dp->lock);
+
+    if (sh->failed) {
+        PL_ERR(sh, "Trying to dispatch a failed shader.");
+        goto error;
+    }
+
+    if (!sh->mutable) {
+        PL_ERR(dp, "Trying to dispatch non-mutable shader?");
+        goto error;
+    }
+
+    if (sh->input != PL_SHADER_SIG_NONE || sh->output != PL_SHADER_SIG_COLOR) {
+        PL_ERR(dp, "Trying to dispatch shader with incompatible signature!");
+        goto error;
+    }
+
+    const struct pl_tex_params *tpars = &params->target->params;
+    if (pl_tex_params_dimension(*tpars) != 2 || !tpars->renderable) {
+        PL_ERR(dp, "Trying to dispatch a shader using an invalid target "
+               "texture. The target must be a renderable 2D texture.");
+        goto error;
+    }
+
+    if (pl_shader_is_compute(sh)) {
+        PL_ERR(dp, "Trying to dispatch a compute shader using pl_dispatch_vertex.");
+        goto error;
+    }
+
+    if (sh->vas.num) {
+        PL_ERR(dp, "Trying to dispatch a custom vertex shader with already "
+               "attached vertex attributes.");
+        goto error;
+    }
+
+    if (sh->transpose) {
+        PL_ERR(dp, "Trying to dispatch a transposed shader using "
+               "pl_dispatch_vertex, unlikely to be correct. 
Erroring as a " + "safety precaution!"); + goto error; + } + + int pos_idx = params->vertex_position_idx; + if (pos_idx < 0 || pos_idx >= params->num_vertex_attribs) { + PL_ERR(dp, "Vertex position index out of range?"); + goto error; + } + + // Attach all of the vertex attributes to the shader manually + sh->vas.num = params->num_vertex_attribs; + PL_ARRAY_RESIZE(sh, sh->vas, sh->vas.num); + for (int i = 0; i < params->num_vertex_attribs; i++) { + ident_t id = sh_fresh(sh, params->vertex_attribs[i].name); + sh->vas.elem[i].attr = params->vertex_attribs[i]; + sh->vas.elem[i].attr.name = sh_ident_pack(id); + GLSLP("#define %s "$"\n", params->vertex_attribs[i].name, id); + } + + // Compute the coordinate projection matrix + pl_transform2x2 proj = pl_transform2x2_identity; + switch (params->vertex_coords) { + case PL_COORDS_ABSOLUTE: + proj.mat.m[0][0] /= tpars->w; + proj.mat.m[1][1] /= tpars->h; + // fall through + case PL_COORDS_RELATIVE: + proj.mat.m[0][0] *= 2.0; + proj.mat.m[1][1] *= 2.0; + proj.c[0] -= 1.0; + proj.c[1] -= 1.0; + // fall through + case PL_COORDS_NORMALIZED: + if (params->vertex_flipped) { + proj.mat.m[1][1] = -proj.mat.m[1][1]; + proj.c[1] += 2.0; + } + break; + } + + struct pass *pass = finalize_pass(dp, sh, params->target, pos_idx, + params->blend_params, true, params, &proj); + + // Silently return on failed passes + if (!pass || !pass->pass) + goto error; + + struct pl_pass_run_params *rparams = &pass->run_params; + + // Update the descriptor bindings + for (int i = 0; i < sh->descs.num; i++) + rparams->desc_bindings[i] = sh->descs.elem[i].binding; + + // Update all of the variables (if needed) + rparams->num_var_updates = 0; + for (int i = 0; i < sh->vars.num; i++) + update_pass_var(dp, pass, &sh->vars.elem[i], &pass->vars[i]); + + // Update the scissors + rparams->scissors = params->scissors; + if (params->vertex_flipped) { + rparams->scissors.y0 = tpars->h - rparams->scissors.y0; + rparams->scissors.y1 = tpars->h - rparams->scissors.y1; + } + pl_rect2d_normalize(&rparams->scissors); + + // Dispatch the actual shader + rparams->target = params->target; + rparams->vertex_count = params->vertex_count; + rparams->vertex_data = params->vertex_data; + rparams->vertex_buf = params->vertex_buf; + rparams->buf_offset = params->buf_offset; + rparams->index_data = params->index_data; + rparams->index_fmt = params->index_fmt; + rparams->index_buf = params->index_buf; + rparams->index_offset = params->index_offset; + rparams->timer = PL_DEF(params->timer, pass->timer); + run_pass(dp, sh, pass); + + ret = true; + // fall through + +error: + // Reset the temporary buffers which we use to build the shader + for (int i = 0; i < PL_ARRAY_SIZE(dp->tmp); i++) + pl_str_builder_reset(dp->tmp[i]); + + pl_mutex_unlock(&dp->lock); + pl_dispatch_abort(dp, params->shader); + return ret; +} + +void pl_dispatch_abort(pl_dispatch dp, pl_shader *psh) +{ + pl_shader sh = *psh; + if (!sh) + return; + + // Free unused memory as early as possible + sh_deref(sh); + + // Re-add the shader to the internal pool of shaders + pl_mutex_lock(&dp->lock); + PL_ARRAY_APPEND(dp, dp->shaders, sh); + pl_mutex_unlock(&dp->lock); + *psh = NULL; +} + +void pl_dispatch_reset_frame(pl_dispatch dp) +{ + pl_mutex_lock(&dp->lock); + + dp->current_ident = 0; + dp->current_index++; + garbage_collect_passes(dp); + + pl_mutex_unlock(&dp->lock); +} + +size_t pl_dispatch_save(pl_dispatch dp, uint8_t *out) +{ + return pl_cache_save(pl_gpu_cache(dp->gpu), out, out ? 
SIZE_MAX : 0); +} + +void pl_dispatch_load(pl_dispatch dp, const uint8_t *cache) +{ + pl_cache_load(pl_gpu_cache(dp->gpu), cache, SIZE_MAX); +} diff --git a/src/dispatch.h b/src/dispatch.h new file mode 100644 index 0000000..66c10f6 --- /dev/null +++ b/src/dispatch.h @@ -0,0 +1,31 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +// Like `pl_dispatch_begin`, but has an extra `unique` parameter. If this is +// true, the generated shader will be uniquely namespaced `unique` and may be +// freely merged with other shaders (`sh_subpass`). Otherwise, all shaders have +// the same namespace and merging them is an error. +pl_shader pl_dispatch_begin_ex(pl_dispatch dp, bool unique); + +// Set the `dynamic_constants` field for newly created `pl_shader` objects. +// +// This is a private API because it's sort of clunky/stateful. +void pl_dispatch_mark_dynamic(pl_dispatch dp, bool dynamic); diff --git a/src/dither.c b/src/dither.c new file mode 100644 index 0000000..13f68e4 --- /dev/null +++ b/src/dither.c @@ -0,0 +1,317 @@ +/* + * Generate a noise texture for dithering images. + * Copyright © 2013 Wessel Dankers <wsl@fruit.je> + * + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + * + * The original code is taken from mpv, under the same license. 
+ */ + + +#include <stdint.h> +#include <stdbool.h> +#include <stdlib.h> +#include <inttypes.h> +#include <string.h> +#include <assert.h> +#include <math.h> + +#include "common.h" + +#include <libplacebo/dither.h> + +void pl_generate_bayer_matrix(float *data, int size) +{ + pl_assert(size >= 0); + + // Start with a single entry of 0 + data[0] = 0; + + for (int sz = 1; sz < size; sz *= 2) { + // Make three copies of the current, appropriately shifted and scaled + for (int y = 0; y < sz; y ++) { + for (int x = 0; x < sz; x++) { + int offsets[] = {0, sz * size + sz, sz, sz * size}; + int pos = y * size + x; + + for (int i = 1; i < 4; i++) + data[pos + offsets[i]] = data[pos] + i / (4.0 * sz * sz); + } + } + } +} + +#define MAX_SIZEB 8 +#define MAX_SIZE (1 << MAX_SIZEB) +#define MAX_SIZE2 (MAX_SIZE * MAX_SIZE) + +typedef uint_fast32_t index_t; + +#define WRAP_SIZE2(k, x) ((index_t)((index_t)(x) & ((k)->size2 - 1))) +#define XY(k, x, y) ((index_t)(((x) | ((y) << (k)->sizeb)))) + +struct ctx { + unsigned int sizeb, size, size2; + unsigned int gauss_radius; + unsigned int gauss_middle; + uint64_t gauss[MAX_SIZE2]; + index_t randomat[MAX_SIZE2]; + bool calcmat[MAX_SIZE2]; + uint64_t gaussmat[MAX_SIZE2]; + index_t unimat[MAX_SIZE2]; +}; + +static void makegauss(struct ctx *k, unsigned int sizeb) +{ + pl_assert(sizeb >= 1 && sizeb <= MAX_SIZEB); + + k->sizeb = sizeb; + k->size = 1 << k->sizeb; + k->size2 = k->size * k->size; + + k->gauss_radius = k->size / 2 - 1; + k->gauss_middle = XY(k, k->gauss_radius, k->gauss_radius); + + unsigned int gauss_size = k->gauss_radius * 2 + 1; + unsigned int gauss_size2 = gauss_size * gauss_size; + + for (index_t c = 0; c < k->size2; c++) + k->gauss[c] = 0; + + double sigma = -log(1.5 / (double) UINT64_MAX * gauss_size2) / k->gauss_radius; + + for (index_t gy = 0; gy <= k->gauss_radius; gy++) { + for (index_t gx = 0; gx <= gy; gx++) { + int cx = (int)gx - k->gauss_radius; + int cy = (int)gy - k->gauss_radius; + int sq = cx * cx + cy * cy; + double e = exp(-sqrt(sq) * sigma); + uint64_t v = e / gauss_size2 * (double) UINT64_MAX; + k->gauss[XY(k, gx, gy)] = + k->gauss[XY(k, gy, gx)] = + k->gauss[XY(k, gx, gauss_size - 1 - gy)] = + k->gauss[XY(k, gy, gauss_size - 1 - gx)] = + k->gauss[XY(k, gauss_size - 1 - gx, gy)] = + k->gauss[XY(k, gauss_size - 1 - gy, gx)] = + k->gauss[XY(k, gauss_size - 1 - gx, gauss_size - 1 - gy)] = + k->gauss[XY(k, gauss_size - 1 - gy, gauss_size - 1 - gx)] = v; + } + } + +#ifndef NDEBUG + uint64_t total = 0; + for (index_t c = 0; c < k->size2; c++) { + uint64_t oldtotal = total; + total += k->gauss[c]; + assert(total >= oldtotal); + } +#endif +} + +static void setbit(struct ctx *k, index_t c) +{ + if (k->calcmat[c]) + return; + k->calcmat[c] = true; + uint64_t *m = k->gaussmat; + uint64_t *me = k->gaussmat + k->size2; + uint64_t *g = k->gauss + WRAP_SIZE2(k, k->gauss_middle + k->size2 - c); + uint64_t *ge = k->gauss + k->size2; + while (g < ge) + *m++ += *g++; + g = k->gauss; + while (m < me) + *m++ += *g++; +} + +static index_t getmin(struct ctx *k) +{ + uint64_t min = UINT64_MAX; + index_t resnum = 0; + unsigned int size2 = k->size2; + for (index_t c = 0; c < size2; c++) { + if (k->calcmat[c]) + continue; + uint64_t total = k->gaussmat[c]; + if (total <= min) { + if (total != min) { + min = total; + resnum = 0; + } + k->randomat[resnum++] = c; + } + } + assert(resnum > 0); + if (resnum == 1) + return k->randomat[0]; + if (resnum == size2) + return size2 / 2; + return k->randomat[rand() % resnum]; +} + +static void makeuniform(struct ctx *k) 
+{ + unsigned int size2 = k->size2; + for (index_t c = 0; c < size2; c++) { + index_t r = getmin(k); + setbit(k, r); + k->unimat[r] = c; + } +} + +void pl_generate_blue_noise(float *data, int size) +{ + pl_assert(size > 0); + int shift = PL_LOG2(size); + + pl_assert((1 << shift) == size); + struct ctx *k = pl_zalloc_ptr(NULL, k); + makegauss(k, shift); + makeuniform(k); + float invscale = k->size2; + for(index_t y = 0; y < k->size; y++) { + for(index_t x = 0; x < k->size; x++) + data[x + y * k->size] = k->unimat[XY(k, x, y)] / invscale; + } + pl_free(k); +} + +const struct pl_error_diffusion_kernel pl_error_diffusion_simple = { + .name = "simple", + .description = "Simple error diffusion", + .shift = 1, + .pattern = {{0, 0, 0, 1, 0}, + {0, 0, 1, 0, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 2, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_false_fs = { + .name = "false-fs", + .description = "False Floyd-Steinberg kernel", + .shift = 1, + .pattern = {{0, 0, 0, 3, 0}, + {0, 0, 3, 2, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 8, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_sierra_lite = { + .name = "sierra-lite", + .description = "Sierra Lite kernel", + .shift = 2, + .pattern = {{0, 0, 0, 2, 0}, + {0, 1, 1, 0, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 4, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_floyd_steinberg = { + .name = "floyd-steinberg", + .description = "Floyd Steinberg kernel", + .shift = 2, + .pattern = {{0, 0, 0, 7, 0}, + {0, 3, 5, 1, 0}, + {0, 0, 0, 0, 0}}, + .divisor = 16, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_atkinson = { + .name = "atkinson", + .description = "Atkinson kernel", + .shift = 2, + .pattern = {{0, 0, 0, 1, 1}, + {0, 1, 1, 1, 0}, + {0, 0, 1, 0, 0}}, + .divisor = 8, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_jarvis_judice_ninke = { + .name = "jarvis-judice-ninke", + .description = "Jarvis, Judice & Ninke kernel", + .shift = 3, + .pattern = {{0, 0, 0, 7, 5}, + {3, 5, 7, 5, 3}, + {1, 3, 5, 3, 1}}, + .divisor = 48, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_stucki = { + .name = "stucki", + .description = "Stucki kernel", + .shift = 3, + .pattern = {{0, 0, 0, 8, 4}, + {2, 4, 8, 4, 2}, + {1, 2, 4, 2, 1}}, + .divisor = 42, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_burkes = { + .name = "burkes", + .description = "Burkes kernel", + .shift = 3, + .pattern = {{0, 0, 0, 8, 4}, + {2, 4, 8, 4, 2}, + {0, 0, 0, 0, 0}}, + .divisor = 32, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_sierra2 = { + .name = "sierra-2", + .description = "Two-row Sierra", + .shift = 3, + .pattern = {{0, 0, 0, 4, 3}, + {1, 2, 3, 2, 1}, + {0, 0, 0, 0, 0}}, + .divisor = 16, +}; + +const struct pl_error_diffusion_kernel pl_error_diffusion_sierra3 = { + .name = "sierra-3", + .description = "Three-row Sierra", + .shift = 3, + .pattern = {{0, 0, 0, 5, 3}, + {2, 4, 5, 4, 2}, + {0, 2, 3, 2, 0}}, + .divisor = 32, +}; + +const struct pl_error_diffusion_kernel * const pl_error_diffusion_kernels[] = { + &pl_error_diffusion_simple, + &pl_error_diffusion_false_fs, + &pl_error_diffusion_sierra_lite, + &pl_error_diffusion_floyd_steinberg, + &pl_error_diffusion_atkinson, + &pl_error_diffusion_jarvis_judice_ninke, + &pl_error_diffusion_stucki, + &pl_error_diffusion_burkes, + &pl_error_diffusion_sierra2, + &pl_error_diffusion_sierra3, + NULL +}; + +const int pl_num_error_diffusion_kernels = PL_ARRAY_SIZE(pl_error_diffusion_kernels) - 1; + +// Find the error diffusion 
kernel with the given name, or NULL on failure. +const struct pl_error_diffusion_kernel *pl_find_error_diffusion_kernel(const char *name) +{ + for (int i = 0; i < pl_num_error_diffusion_kernels; i++) { + if (strcmp(name, pl_error_diffusion_kernels[i]->name) == 0) + return pl_error_diffusion_kernels[i]; + } + + return NULL; +} diff --git a/src/dummy.c b/src/dummy.c new file mode 100644 index 0000000..cd80080 --- /dev/null +++ b/src/dummy.c @@ -0,0 +1,348 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <limits.h> +#include <string.h> + +#include "gpu.h" + +#include <libplacebo/dummy.h> + +const struct pl_gpu_dummy_params pl_gpu_dummy_default_params = { PL_GPU_DUMMY_DEFAULTS }; +static const struct pl_gpu_fns pl_fns_dummy; + +struct priv { + struct pl_gpu_fns impl; + struct pl_gpu_dummy_params params; +}; + +pl_gpu pl_gpu_dummy_create(pl_log log, const struct pl_gpu_dummy_params *params) +{ + params = PL_DEF(params, &pl_gpu_dummy_default_params); + + struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct priv); + gpu->log = log; + gpu->glsl = params->glsl; + gpu->limits = params->limits; + + struct priv *p = PL_PRIV(gpu); + p->impl = pl_fns_dummy; + p->params = *params; + + // Forcibly override these, because we know for sure what the values are + gpu->limits.align_tex_xfer_pitch = 1; + gpu->limits.align_tex_xfer_offset = 1; + gpu->limits.align_vertex_stride = 1; + + // Set up the dummy formats, add one for each possible format type that we + // can represent on the host + PL_ARRAY(pl_fmt) formats = {0}; + for (enum pl_fmt_type type = 1; type < PL_FMT_TYPE_COUNT; type++) { + for (int comps = 1; comps <= 4; comps++) { + for (int depth = 8; depth < 128; depth *= 2) { + if (type == PL_FMT_FLOAT && depth < 16) + continue; + + static const char *cnames[] = { + [1] = "r", + [2] = "rg", + [3] = "rgb", + [4] = "rgba", + }; + + static const char *tnames[] = { + [PL_FMT_UNORM] = "", + [PL_FMT_SNORM] = "s", + [PL_FMT_UINT] = "u", + [PL_FMT_SINT] = "i", + [PL_FMT_FLOAT] = "f", + }; + + const char *tname = tnames[type]; + if (type == PL_FMT_FLOAT && depth == 16) + tname = "hf"; + + struct pl_fmt_t *fmt = pl_alloc_ptr(gpu, fmt); + *fmt = (struct pl_fmt_t) { + .name = pl_asprintf(fmt, "%s%d%s", cnames[comps], depth, tname), + .type = type, + .num_components = comps, + .opaque = false, + .gatherable = true, + .internal_size = comps * depth / 8, + .texel_size = comps * depth / 8, + .texel_align = 1, + .caps = PL_FMT_CAP_SAMPLEABLE | PL_FMT_CAP_LINEAR | + PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLENDABLE | + PL_FMT_CAP_VERTEX | PL_FMT_CAP_HOST_READABLE, + }; + + for (int i = 0; i < comps; i++) { + fmt->component_depth[i] = depth; + fmt->host_bits[i] = depth; + fmt->sample_order[i] = i; + } + + if (gpu->glsl.compute) + fmt->caps |= PL_FMT_CAP_STORABLE; + if (gpu->limits.max_buffer_texels && gpu->limits.max_ubo_size) + fmt->caps |= 
PL_FMT_CAP_TEXEL_UNIFORM; + if (gpu->limits.max_buffer_texels && gpu->limits.max_ssbo_size) + fmt->caps |= PL_FMT_CAP_TEXEL_STORAGE; + + fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, "")); + fmt->glsl_format = pl_fmt_glsl_format(fmt, comps); + fmt->fourcc = pl_fmt_fourcc(fmt); + if (!fmt->glsl_format) + fmt->caps &= ~(PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE); + PL_ARRAY_APPEND(gpu, formats, fmt); + } + } + } + + gpu->formats = formats.elem; + gpu->num_formats = formats.num; + return pl_gpu_finalize(gpu); +} + +static void dumb_destroy(pl_gpu gpu) +{ + pl_free((void *) gpu); +} + +void pl_gpu_dummy_destroy(pl_gpu *gpu) +{ + pl_gpu_destroy(*gpu); + *gpu = NULL; +} + +struct buf_priv { + uint8_t *data; +}; + +static pl_buf dumb_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct buf_priv); + buf->params = *params; + buf->params.initial_data = NULL; + + struct buf_priv *p = PL_PRIV(buf); + p->data = malloc(params->size); + if (!p->data) { + PL_ERR(gpu, "Failed allocating memory for dummy buffer!"); + pl_free(buf); + return NULL; + } + + if (params->initial_data) + memcpy(p->data, params->initial_data, params->size); + if (params->host_mapped) + buf->data = p->data; + + return buf; +} + +static void dumb_buf_destroy(pl_gpu gpu, pl_buf buf) +{ + struct buf_priv *p = PL_PRIV(buf); + free(p->data); + pl_free((void *) buf); +} + +uint8_t *pl_buf_dummy_data(pl_buf buf) +{ + struct buf_priv *p = PL_PRIV(buf); + return p->data; +} + +static void dumb_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset, + const void *data, size_t size) +{ + struct buf_priv *p = PL_PRIV(buf); + memcpy(p->data + buf_offset, data, size); +} + +static bool dumb_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset, + void *dest, size_t size) +{ + struct buf_priv *p = PL_PRIV(buf); + memcpy(dest, p->data + buf_offset, size); + return true; +} + +static void dumb_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size) +{ + struct buf_priv *dstp = PL_PRIV(dst); + struct buf_priv *srcp = PL_PRIV(src); + memcpy(dstp->data + dst_offset, srcp->data + src_offset, size); +} + +struct tex_priv { + void *data; +}; + +static size_t tex_size(pl_gpu gpu, pl_tex tex) +{ + size_t size = tex->params.format->texel_size * tex->params.w; + size *= PL_DEF(tex->params.h, 1); + size *= PL_DEF(tex->params.d, 1); + return size; +} + +static pl_tex dumb_tex_create(pl_gpu gpu, const struct pl_tex_params *params) +{ + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, void *); + tex->params = *params; + tex->params.initial_data = NULL; + + struct tex_priv *p = PL_PRIV(tex); + p->data = malloc(tex_size(gpu, tex)); + if (!p->data) { + PL_ERR(gpu, "Failed allocating memory for dummy texture!"); + pl_free(tex); + return NULL; + } + + if (params->initial_data) + memcpy(p->data, params->initial_data, tex_size(gpu, tex)); + + return tex; +} + +pl_tex pl_tex_dummy_create(pl_gpu gpu, const struct pl_tex_dummy_params *params) +{ + // Only do minimal sanity checking, since this is just a dummy texture + pl_assert(params->format && params->w >= 0 && params->h >= 0 && params->d >= 0); + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct tex_priv); + tex->sampler_type = params->sampler_type; + tex->params = (struct pl_tex_params) { + .w = params->w, + .h = params->h, + .d = params->d, + .format = params->format, + .sampleable = true, + .user_data = params->user_data, + }; + + return tex; +} + +static void dumb_tex_destroy(pl_gpu gpu, 
pl_tex tex) +{ + struct tex_priv *p = PL_PRIV(tex); + if (p->data) + free(p->data); + pl_free((void *) tex); +} + +uint8_t *pl_tex_dummy_data(pl_tex tex) +{ + struct tex_priv *p = PL_PRIV(tex); + return p->data; +} + +static bool dumb_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + struct tex_priv *p = PL_PRIV(tex); + pl_assert(p->data); + + const uint8_t *src = params->ptr; + uint8_t *dst = p->data; + if (params->buf) { + struct buf_priv *bufp = PL_PRIV(params->buf); + src = (uint8_t *) bufp->data + params->buf_offset; + } + + size_t texel_size = tex->params.format->texel_size; + size_t row_size = pl_rect_w(params->rc) * texel_size; + for (int z = params->rc.z0; z < params->rc.z1; z++) { + size_t src_plane = z * params->depth_pitch; + size_t dst_plane = z * tex->params.h * tex->params.w * texel_size; + for (int y = params->rc.y0; y < params->rc.y1; y++) { + size_t src_row = src_plane + y * params->row_pitch; + size_t dst_row = dst_plane + y * tex->params.w * texel_size; + size_t pos = params->rc.x0 * texel_size; + memcpy(&dst[dst_row + pos], &src[src_row + pos], row_size); + } + } + + return true; +} + +static bool dumb_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + struct tex_priv *p = PL_PRIV(tex); + pl_assert(p->data); + + const uint8_t *src = p->data; + uint8_t *dst = params->ptr; + if (params->buf) { + struct buf_priv *bufp = PL_PRIV(params->buf); + dst = (uint8_t *) bufp->data + params->buf_offset; + } + + size_t texel_size = tex->params.format->texel_size; + size_t row_size = pl_rect_w(params->rc) * texel_size; + for (int z = params->rc.z0; z < params->rc.z1; z++) { + size_t src_plane = z * tex->params.h * tex->params.w * texel_size; + size_t dst_plane = z * params->depth_pitch; + for (int y = params->rc.y0; y < params->rc.y1; y++) { + size_t src_row = src_plane + y * tex->params.w * texel_size; + size_t dst_row = dst_plane + y * params->row_pitch; + size_t pos = params->rc.x0 * texel_size; + memcpy(&dst[dst_row + pos], &src[src_row + pos], row_size); + } + } + + return true; +} + +static int dumb_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + return 0; // safest behavior: never alias bindings +} + +static pl_pass dumb_pass_create(pl_gpu gpu, const struct pl_pass_params *params) +{ + PL_ERR(gpu, "Creating render passes is not supported for dummy GPUs"); + return NULL; +} + +static void dumb_gpu_finish(pl_gpu gpu) +{ + // no-op +} + +static const struct pl_gpu_fns pl_fns_dummy = { + .destroy = dumb_destroy, + .buf_create = dumb_buf_create, + .buf_destroy = dumb_buf_destroy, + .buf_write = dumb_buf_write, + .buf_read = dumb_buf_read, + .buf_copy = dumb_buf_copy, + .tex_create = dumb_tex_create, + .tex_destroy = dumb_tex_destroy, + .tex_upload = dumb_tex_upload, + .tex_download = dumb_tex_download, + .desc_namespace = dumb_desc_namespace, + .pass_create = dumb_pass_create, + .gpu_finish = dumb_gpu_finish, +}; diff --git a/src/filters.c b/src/filters.c new file mode 100644 index 0000000..cc4871f --- /dev/null +++ b/src/filters.c @@ -0,0 +1,1015 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Some of the filter code originally derives (via mpv) from Glumpy: + * # Copyright (c) 2009-2016 Nicolas P. Rougier. All rights reserved. + * # Distributed under the (new) BSD License. + * (https://github.com/glumpy/glumpy/blob/master/glumpy/library/build-spatial-filters.py) + * + * The math underlying each filter function was written from scratch, with + * some algorithms coming from a number of different sources, including: + * - https://en.wikipedia.org/wiki/Window_function + * - https://en.wikipedia.org/wiki/Jinc + * - http://vector-agg.cvs.sourceforge.net/viewvc/vector-agg/agg-2.5/include/agg_image_filters.h + * - Vapoursynth plugin fmtconv (WTFPL Licensed), which is based on + * dither plugin for avisynth from the same author: + * https://github.com/vapoursynth/fmtconv/tree/master/src/fmtc + * - Paul Heckbert's "zoom" + * - XBMC: ConvolutionKernels.cpp etc. + * - https://github.com/AviSynth/jinc-resize (only used to verify the math) + */ + +#include <math.h> + +#include "common.h" +#include "filters.h" +#include "log.h" + +#ifdef PL_HAVE_WIN32 +#define j1 _j1 +#endif + +bool pl_filter_function_eq(const struct pl_filter_function *a, + const struct pl_filter_function *b) +{ + return (a ? a->weight : NULL) == (b ? b->weight : NULL); +} + +bool pl_filter_config_eq(const struct pl_filter_config *a, + const struct pl_filter_config *b) +{ + if (!a || !b) + return a == b; + + bool eq = pl_filter_function_eq(a->kernel, b->kernel) && + pl_filter_function_eq(a->window, b->window) && + a->radius == b->radius && + a->clamp == b->clamp && + a->blur == b->blur && + a->taper == b->taper && + a->polar == b->polar && + a->antiring == b->antiring; + + for (int i = 0; i < PL_FILTER_MAX_PARAMS; i++) { + if (a->kernel->tunable[i]) + eq &= a->params[i] == b->params[i]; + if (a->window && a->window->tunable[i]) + eq &= a->wparams[i] == b->wparams[i]; + } + + return eq; +} + +double pl_filter_sample(const struct pl_filter_config *c, double x) +{ + const float radius = pl_filter_radius_bound(c); + + // All filters are symmetric, and in particular only need to be defined + // for [0, radius]. + x = fabs(x); + + // Return early for values outside of the kernel radius, since the functions + // are not necessarily valid outside of this interval. No such check is + // needed for the window, because it's always stretched to fit. + if (x > radius) + return 0.0; + + // Apply the blur and taper coefficients as needed + double kx = x <= c->taper ? 0.0 : (x - c->taper) / (1.0 - c->taper / radius); + if (c->blur > 0.0) + kx /= c->blur; + + pl_assert(!c->kernel->opaque); + double k = c->kernel->weight(&(const struct pl_filter_ctx) { + .radius = radius, + .params = { + c->kernel->tunable[0] ? c->params[0] : c->kernel->params[0], + c->kernel->tunable[1] ? c->params[1] : c->kernel->params[1], + }, + }, kx); + + // Apply the optional windowing function + if (c->window) { + pl_assert(!c->window->opaque); + double wx = x / radius * c->window->radius; + k *= c->window->weight(&(struct pl_filter_ctx) { + .radius = c->window->radius, + .params = { + c->window->tunable[0] ? 
c->wparams[0] : c->window->params[0], + c->window->tunable[1] ? c->wparams[1] : c->window->params[1], + }, + }, wx); + } + + return k < 0 ? (1 - c->clamp) * k : k; +} + +static void filter_cutoffs(const struct pl_filter_config *c, float cutoff, + float *out_radius, float *out_radius_zero) +{ + const float bound = pl_filter_radius_bound(c); + float prev = 0.0, fprev = pl_filter_sample(c, prev); + bool found_root = false; + + const float step = 1e-2f; + for (float x = 0.0; x < bound + step; x += step) { + float fx = pl_filter_sample(c, x); + if ((fprev > cutoff && fx <= cutoff) || (fprev < -cutoff && fx >= -cutoff)) { + // Found zero crossing + float root = x - fx * (x - prev) / (fx - fprev); // secant method + root = fminf(root, bound); + *out_radius = root; + if (!found_root) // first root + *out_radius_zero = root; + found_root = true; + } + prev = x; + fprev = fx; + } + + if (!found_root) + *out_radius_zero = *out_radius = bound; +} + +// Compute a single row of weights for a given filter in one dimension, indexed +// by the indicated subpixel offset. Writes `f->row_size` values to `out`. +static void compute_row(struct pl_filter_t *f, double offset, float *out) +{ + double wsum = 0.0; + for (int i = 0; i < f->row_size; i++) { + // For the example of a filter with row size 4 and offset 0.3, we have: + // + // 0 1 * 2 3 + // + // * indicates the sampled position. What we want to compute is the + // distance from each index to that sampled position. + pl_assert(f->row_size % 2 == 0); + const int base = f->row_size / 2 - 1; // index to the left of the center + const double center = base + offset; // offset of center relative to idx 0 + double w = pl_filter_sample(&f->params.config, i - center); + out[i] = w; + wsum += w; + } + + // Readjust weights to preserve energy + pl_assert(wsum > 0); + for (int i = 0; i < f->row_size; i++) + out[i] /= wsum; +} + +// Needed for backwards compatibility with v1 configuration API +static struct pl_filter_function *dupfilter(void *alloc, + const struct pl_filter_function *f) +{ + return f ? 
pl_memdup(alloc, (void *)f, sizeof(*f)) : NULL; +} + +pl_filter pl_filter_generate(pl_log log, const struct pl_filter_params *params) +{ + pl_assert(params); + if (params->lut_entries <= 0 || !params->config.kernel) { + pl_fatal(log, "Invalid params: missing lut_entries or config.kernel"); + return NULL; + } + + if (params->config.kernel->opaque) { + pl_err(log, "Trying to use opaque kernel '%s' in non-opaque context!", + params->config.kernel->name); + return NULL; + } + + if (params->config.window && params->config.window->opaque) { + pl_err(log, "Trying to use opaque window '%s' in non-opaque context!", + params->config.window->name); + return NULL; + } + + struct pl_filter_t *f = pl_zalloc_ptr(NULL, f); + f->params = *params; + f->params.config.kernel = dupfilter(f, params->config.kernel); + f->params.config.window = dupfilter(f, params->config.window); + + // Compute main lobe and total filter size + filter_cutoffs(¶ms->config, params->cutoff, &f->radius, &f->radius_zero); + f->radius_cutoff = f->radius; // backwards compatibility + + float *weights; + if (params->config.polar) { + // Compute a 1D array indexed by radius + weights = pl_alloc(f, params->lut_entries * sizeof(float)); + for (int i = 0; i < params->lut_entries; i++) { + double x = f->radius * i / (params->lut_entries - 1); + weights[i] = pl_filter_sample(¶ms->config, x); + } + } else { + // Pick the most appropriate row size + f->row_size = ceilf(f->radius) * 2; + if (params->max_row_size && f->row_size > params->max_row_size) { + pl_info(log, "Required filter size %d exceeds the maximum allowed " + "size of %d. This may result in adverse effects (aliasing, " + "or moiré artifacts).", f->row_size, params->max_row_size); + f->row_size = params->max_row_size; + f->insufficient = true; + } + f->row_stride = PL_ALIGN(f->row_size, params->row_stride_align); + + // Compute a 2D array indexed by the subpixel position + weights = pl_calloc(f, params->lut_entries * f->row_stride, sizeof(float)); + for (int i = 0; i < params->lut_entries; i++) { + compute_row(f, i / (double)(params->lut_entries - 1), + weights + f->row_stride * i); + } + } + + f->weights = weights; + return f; +} + +void pl_filter_free(pl_filter *filter) +{ + pl_free_ptr((void **) filter); +} + +// Built-in filter functions + +static double box(const struct pl_filter_ctx *f, double x) +{ + return 1.0; +} + +const struct pl_filter_function pl_filter_function_box = { + .weight = box, + .name = "box", + .radius = 1.0, + .resizable = true, +}; + +static const struct pl_filter_function filter_function_dirichlet = { + .name = "dirichlet", // alias + .weight = box, + .radius = 1.0, + .resizable = true, +}; + +static double triangle(const struct pl_filter_ctx *f, double x) +{ + return 1.0 - x / f->radius; +} + +const struct pl_filter_function pl_filter_function_triangle = { + .name = "triangle", + .weight = triangle, + .radius = 1.0, + .resizable = true, +}; + +static double cosine(const struct pl_filter_ctx *f, double x) +{ + return cos(x); +} + +const struct pl_filter_function pl_filter_function_cosine = { + .name = "cosine", + .weight = cosine, + .radius = M_PI / 2.0, +}; + +static double hann(const struct pl_filter_ctx *f, double x) +{ + return 0.5 + 0.5 * cos(M_PI * x); +} + +const struct pl_filter_function pl_filter_function_hann = { + .name = "hann", + .weight = hann, + .radius = 1.0, +}; + +static const struct pl_filter_function filter_function_hanning = { + .name = "hanning", // alias + .weight = hann, + .radius = 1.0, +}; + +static double hamming(const struct 
pl_filter_ctx *f, double x) +{ + return 0.54 + 0.46 * cos(M_PI * x); +} + +const struct pl_filter_function pl_filter_function_hamming = { + .name = "hamming", + .weight = hamming, + .radius = 1.0, +}; + +static double welch(const struct pl_filter_ctx *f, double x) +{ + return 1.0 - x * x; +} + +const struct pl_filter_function pl_filter_function_welch = { + .name = "welch", + .weight = welch, + .radius = 1.0, +}; + +static double bessel_i0(double x) +{ + double s = 1.0; + double y = x * x / 4.0; + double t = y; + int i = 2; + while (t > 1e-12) { + s += t; + t *= y / (i * i); + i += 1; + } + return s; +} + +static double kaiser(const struct pl_filter_ctx *f, double x) +{ + double alpha = fmax(f->params[0], 0.0); + double scale = bessel_i0(alpha); + return bessel_i0(alpha * sqrt(1.0 - x * x)) / scale; +} + +const struct pl_filter_function pl_filter_function_kaiser = { + .name = "kaiser", + .weight = kaiser, + .radius = 1.0, + .params = {2.0}, + .tunable = {true}, +}; + +static double blackman(const struct pl_filter_ctx *f, double x) +{ + double a = f->params[0]; + double a0 = (1 - a) / 2.0, a1 = 1 / 2.0, a2 = a / 2.0; + x *= M_PI; + return a0 + a1 * cos(x) + a2 * cos(2 * x); +} + +const struct pl_filter_function pl_filter_function_blackman = { + .name = "blackman", + .weight = blackman, + .radius = 1.0, + .params = {0.16}, + .tunable = {true}, +}; + +static double bohman(const struct pl_filter_ctx *f, double x) +{ + double pix = M_PI * x; + return (1.0 - x) * cos(pix) + sin(pix) / M_PI; +} + +const struct pl_filter_function pl_filter_function_bohman = { + .name = "bohman", + .weight = bohman, + .radius = 1.0, +}; + +static double gaussian(const struct pl_filter_ctx *f, double x) +{ + return exp(-2.0 * x * x / f->params[0]); +} + +const struct pl_filter_function pl_filter_function_gaussian = { + .name = "gaussian", + .weight = gaussian, + .radius = 2.0, + .resizable = true, + .params = {1.0}, + .tunable = {true}, +}; + +static double quadratic(const struct pl_filter_ctx *f, double x) +{ + if (x < 0.5) { + return 1.0 - 4.0/3.0 * (x * x); + } else { + return 2.0 / 3.0 * (x - 1.5) * (x - 1.5); + } +} + +const struct pl_filter_function pl_filter_function_quadratic = { + .name = "quadratic", + .weight = quadratic, + .radius = 1.5, +}; + +static const struct pl_filter_function filter_function_quadric = { + .name = "quadric", // alias + .weight = quadratic, + .radius = 1.5, +}; + +static double sinc(const struct pl_filter_ctx *f, double x) +{ + if (x < 1e-8) + return 1.0; + x *= M_PI; + return sin(x) / x; +} + +const struct pl_filter_function pl_filter_function_sinc = { + .name = "sinc", + .weight = sinc, + .radius = 1.0, + .resizable = true, +}; + +static double jinc(const struct pl_filter_ctx *f, double x) +{ + if (x < 1e-8) + return 1.0; + x *= M_PI; + return 2.0 * j1(x) / x; +} + +const struct pl_filter_function pl_filter_function_jinc = { + .name = "jinc", + .weight = jinc, + .radius = 1.2196698912665045, // first zero + .resizable = true, +}; + +static double sphinx(const struct pl_filter_ctx *f, double x) +{ + if (x < 1e-8) + return 1.0; + x *= M_PI; + return 3.0 * (sin(x) - x * cos(x)) / (x * x * x); +} + +const struct pl_filter_function pl_filter_function_sphinx = { + .name = "sphinx", + .weight = sphinx, + .radius = 1.4302966531242027, // first zero + .resizable = true, +}; + +static double cubic(const struct pl_filter_ctx *f, double x) +{ + const double b = f->params[0], c = f->params[1]; + double p0 = 6.0 - 2.0 * b, + p2 = -18.0 + 12.0 * b + 6.0 * c, + p3 = 12.0 - 9.0 * b - 6.0 * c, + q0 
= 8.0 * b + 24.0 * c, + q1 = -12.0 * b - 48.0 * c, + q2 = 6.0 * b + 30.0 * c, + q3 = -b - 6.0 * c; + + if (x < 1.0) { + return (p0 + x * x * (p2 + x * p3)) / p0; + } else { + return (q0 + x * (q1 + x * (q2 + x * q3))) / p0; + } +} + +const struct pl_filter_function pl_filter_function_cubic = { + .name = "cubic", + .weight = cubic, + .radius = 2.0, + .params = {1.0, 0.0}, + .tunable = {true, true}, +}; + +static const struct pl_filter_function filter_function_bicubic = { + .name = "bicubic", // alias + .weight = cubic, + .radius = 2.0, + .params = {1.0, 0.0}, + .tunable = {true, true}, +}; + +static const struct pl_filter_function filter_function_bcspline = { + .name = "bcspline", // alias + .weight = cubic, + .radius = 2.0, + .params = {1.0, 0.0}, + .tunable = {true, true}, +}; + +const struct pl_filter_function pl_filter_function_hermite = { + .name = "hermite", + .weight = cubic, + .radius = 1.0, + .params = {0.0, 0.0}, +}; + +static double spline16(const struct pl_filter_ctx *f, double x) +{ + if (x < 1.0) { + return ((x - 9.0/5.0 ) * x - 1.0/5.0 ) * x + 1.0; + } else { + return ((-1.0/3.0 * (x-1) + 4.0/5.0) * (x-1) - 7.0/15.0 ) * (x-1); + } +} + +const struct pl_filter_function pl_filter_function_spline16 = { + .name = "spline16", + .weight = spline16, + .radius = 2.0, +}; + +static double spline36(const struct pl_filter_ctx *f, double x) +{ + if (x < 1.0) { + return ((13.0/11.0 * x - 453.0/209.0) * x - 3.0/209.0) * x + 1.0; + } else if (x < 2.0) { + return ((-6.0/11.0 * (x-1) + 270.0/209.0) * (x-1) - 156.0/ 209.0) * (x-1); + } else { + return ((1.0/11.0 * (x-2) - 45.0/209.0) * (x-2) + 26.0/209.0) * (x-2); + } +} + +const struct pl_filter_function pl_filter_function_spline36 = { + .name = "spline36", + .weight = spline36, + .radius = 3.0, +}; + +static double spline64(const struct pl_filter_ctx *f, double x) +{ + if (x < 1.0) { + return ((49.0/41.0 * x - 6387.0/2911.0) * x - 3.0/2911.0) * x + 1.0; + } else if (x < 2.0) { + return ((-24.0/41.0 * (x-1) + 4032.0/2911.0) * (x-1) - 2328.0/2911.0) * (x-1); + } else if (x < 3.0) { + return ((6.0/41.0 * (x-2) - 1008.0/2911.0) * (x-2) + 582.0/2911.0) * (x-2); + } else { + return ((-1.0/41.0 * (x-3) + 168.0/2911.0) * (x-3) - 97.0/2911.0) * (x-3); + } +} + +const struct pl_filter_function pl_filter_function_spline64 = { + .name = "spline64", + .weight = spline64, + .radius = 4.0, +}; + +static double oversample(const struct pl_filter_ctx *f, double x) +{ + return 0.0; +} + +const struct pl_filter_function pl_filter_function_oversample = { + .name = "oversample", + .weight = oversample, + .params = {0.0}, + .tunable = {true}, + .opaque = true, +}; + +const struct pl_filter_function * const pl_filter_functions[] = { + &pl_filter_function_box, + &filter_function_dirichlet, // alias + &pl_filter_function_triangle, + &pl_filter_function_cosine, + &pl_filter_function_hann, + &filter_function_hanning, // alias + &pl_filter_function_hamming, + &pl_filter_function_welch, + &pl_filter_function_kaiser, + &pl_filter_function_blackman, + &pl_filter_function_bohman, + &pl_filter_function_gaussian, + &pl_filter_function_quadratic, + &filter_function_quadric, // alias + &pl_filter_function_sinc, + &pl_filter_function_jinc, + &pl_filter_function_sphinx, + &pl_filter_function_cubic, + &filter_function_bicubic, // alias + &filter_function_bcspline, // alias + &pl_filter_function_hermite, + &pl_filter_function_spline16, + &pl_filter_function_spline36, + &pl_filter_function_spline64, + &pl_filter_function_oversample, + NULL, +}; + +const int pl_num_filter_functions = 
PL_ARRAY_SIZE(pl_filter_functions) - 1; + +const struct pl_filter_function *pl_find_filter_function(const char *name) +{ + if (!name) + return NULL; + + for (int i = 0; i < pl_num_filter_functions; i++) { + if (strcmp(name, pl_filter_functions[i]->name) == 0) + return pl_filter_functions[i]; + } + + return NULL; +} + +// Built-in filter function configs + +const struct pl_filter_config pl_filter_spline16 = { + .name = "spline16", + .description = "Spline (2 taps)", + .kernel = &pl_filter_function_spline16, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_spline36 = { + .name = "spline36", + .description = "Spline (3 taps)", + .kernel = &pl_filter_function_spline36, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_spline64 = { + .name = "spline64", + .description = "Spline (4 taps)", + .kernel = &pl_filter_function_spline64, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_nearest = { + .name = "nearest", + .description = "Nearest neighbor", + .kernel = &pl_filter_function_box, + .radius = 0.5, + .allowed = PL_FILTER_UPSCALING, + .recommended = PL_FILTER_UPSCALING, +}; + +const struct pl_filter_config pl_filter_box = { + .name = "box", + .description = "Box averaging", + .kernel = &pl_filter_function_box, + .radius = 0.5, + .allowed = PL_FILTER_SCALING, + .recommended = PL_FILTER_DOWNSCALING, +}; + +const struct pl_filter_config pl_filter_bilinear = { + .name = "bilinear", + .description = "Bilinear", + .kernel = &pl_filter_function_triangle, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_SCALING, +}; + +const struct pl_filter_config filter_linear = { + .name = "linear", + .description = "Linear mixing", + .kernel = &pl_filter_function_triangle, + .allowed = PL_FILTER_FRAME_MIXING, + .recommended = PL_FILTER_FRAME_MIXING, +}; + +static const struct pl_filter_config filter_triangle = { + .name = "triangle", + .kernel = &pl_filter_function_triangle, + .allowed = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_gaussian = { + .name = "gaussian", + .description = "Gaussian", + .kernel = &pl_filter_function_gaussian, + .params = {1.0}, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_sinc = { + .name = "sinc", + .description = "Sinc (unwindowed)", + .kernel = &pl_filter_function_sinc, + .radius = 3.0, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_lanczos = { + .name = "lanczos", + .description = "Lanczos", + .kernel = &pl_filter_function_sinc, + .window = &pl_filter_function_sinc, + .radius = 3.0, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_ginseng = { + .name = "ginseng", + .description = "Ginseng (Jinc-Sinc)", + .kernel = &pl_filter_function_sinc, + .window = &pl_filter_function_jinc, + .radius = 3.0, + .allowed = PL_FILTER_ALL, +}; + +#define JINC_ZERO3 3.2383154841662362076499 +#define JINC_ZERO4 4.2410628637960698819573 + +const struct pl_filter_config pl_filter_ewa_jinc = { + .name = "ewa_jinc", + .description = "EWA Jinc (unwindowed)", + .kernel = &pl_filter_function_jinc, + .radius = JINC_ZERO3, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_ewa_lanczos = { + .name = "ewa_lanczos", + .description = "Jinc (EWA Lanczos)", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_jinc, + .radius = JINC_ZERO3, + .polar = true, + .allowed = PL_FILTER_SCALING, + .recommended = 
PL_FILTER_UPSCALING, +}; + +const struct pl_filter_config pl_filter_ewa_lanczossharp = { + .name = "ewa_lanczossharp", + .description = "Sharpened Jinc", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_jinc, + .radius = JINC_ZERO3, + .blur = 0.98125058372237073562493, + .polar = true, + .allowed = PL_FILTER_SCALING, + .recommended = PL_FILTER_UPSCALING, +}; + +const struct pl_filter_config pl_filter_ewa_lanczos4sharpest = { + .name = "ewa_lanczos4sharpest", + .description = "Sharpened Jinc-AR, 4 taps", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_jinc, + .radius = JINC_ZERO4, + .blur = 0.88451209326050047745788, + .antiring = 0.8, + .polar = true, + .allowed = PL_FILTER_SCALING, + .recommended = PL_FILTER_UPSCALING, +}; + +const struct pl_filter_config pl_filter_ewa_ginseng = { + .name = "ewa_ginseng", + .description = "EWA Ginseng", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_sinc, + .radius = JINC_ZERO3, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_ewa_hann = { + .name = "ewa_hann", + .description = "EWA Hann", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_hann, + .radius = JINC_ZERO3, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +static const struct pl_filter_config filter_ewa_hanning = { + .name = "ewa_hanning", + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_hann, + .radius = JINC_ZERO3, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +// Spline family +const struct pl_filter_config pl_filter_bicubic = { + .name = "bicubic", + .description = "Bicubic", + .kernel = &pl_filter_function_cubic, + .params = {1.0, 0.0}, + .allowed = PL_FILTER_SCALING, + .recommended = PL_FILTER_SCALING, +}; + +static const struct pl_filter_config filter_cubic = { + .name = "cubic", + .description = "Cubic", + .kernel = &pl_filter_function_cubic, + .params = {1.0, 0.0}, + .allowed = PL_FILTER_FRAME_MIXING, +}; + +const struct pl_filter_config pl_filter_hermite = { + .name = "hermite", + .description = "Hermite", + .kernel = &pl_filter_function_hermite, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_DOWNSCALING | PL_FILTER_FRAME_MIXING, +}; + +const struct pl_filter_config pl_filter_catmull_rom = { + .name = "catmull_rom", + .description = "Catmull-Rom", + .kernel = &pl_filter_function_cubic, + .params = {0.0, 0.5}, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_mitchell = { + .name = "mitchell", + .description = "Mitchell-Netravali", + .kernel = &pl_filter_function_cubic, + .params = {1/3.0, 1/3.0}, + .allowed = PL_FILTER_ALL, + .recommended = PL_FILTER_DOWNSCALING, +}; + +const struct pl_filter_config pl_filter_mitchell_clamp = { + .name = "mitchell_clamp", + .description = "Mitchell (clamped)", + .kernel = &pl_filter_function_cubic, + .params = {1/3.0, 1/3.0}, + .clamp = 1.0, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_robidoux = { + .name = "robidoux", + .description = "Robidoux", + .kernel = &pl_filter_function_cubic, + .params = {12 / (19 + 9 * M_SQRT2), 113 / (58 + 216 * M_SQRT2)}, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_robidouxsharp = { + .name = "robidouxsharp", + .description = "RobidouxSharp", + .kernel = &pl_filter_function_cubic, + .params = {6 / (13 + 7 * M_SQRT2), 7 / (2 + 12 * M_SQRT2)}, + .allowed = PL_FILTER_ALL, +}; + +const struct pl_filter_config pl_filter_ewa_robidoux 
= { + .name = "ewa_robidoux", + .description = "EWA Robidoux", + .kernel = &pl_filter_function_cubic, + .params = {12 / (19 + 9 * M_SQRT2), 113 / (58 + 216 * M_SQRT2)}, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_ewa_robidouxsharp = { + .name = "ewa_robidouxsharp", + .description = "EWA RobidouxSharp", + .kernel = &pl_filter_function_cubic, + .params = {6 / (13 + 7 * M_SQRT2), 7 / (2 + 12 * M_SQRT2)}, + .polar = true, + .allowed = PL_FILTER_SCALING, +}; + +const struct pl_filter_config pl_filter_oversample = { + .name = "oversample", + .description = "Oversampling", + .kernel = &pl_filter_function_oversample, + .params = {0.0}, + .allowed = PL_FILTER_UPSCALING | PL_FILTER_FRAME_MIXING, + .recommended = PL_FILTER_UPSCALING | PL_FILTER_FRAME_MIXING, +}; + +const struct pl_filter_config * const pl_filter_configs[] = { + // Sorted roughly in terms of priority / relevance + &pl_filter_bilinear, + &filter_triangle, // alias + &filter_linear, // pseudo-alias (frame mixing only) + &pl_filter_nearest, + &pl_filter_spline16, + &pl_filter_spline36, + &pl_filter_spline64, + &pl_filter_lanczos, + &pl_filter_ewa_lanczos, + &pl_filter_ewa_lanczossharp, + &pl_filter_ewa_lanczos4sharpest, + &pl_filter_bicubic, + &filter_cubic, // pseudo-alias (frame mixing only) + &pl_filter_hermite, + &pl_filter_gaussian, + &pl_filter_oversample, + &pl_filter_mitchell, + &pl_filter_mitchell_clamp, + &pl_filter_sinc, + &pl_filter_ginseng, + &pl_filter_ewa_jinc, + &pl_filter_ewa_ginseng, + &pl_filter_ewa_hann, + &filter_ewa_hanning, // alias + &pl_filter_catmull_rom, + &pl_filter_robidoux, + &pl_filter_robidouxsharp, + &pl_filter_ewa_robidoux, + &pl_filter_ewa_robidouxsharp, + + NULL, +}; + +const int pl_num_filter_configs = PL_ARRAY_SIZE(pl_filter_configs) - 1; + +const struct pl_filter_config * +pl_find_filter_config(const char *name, enum pl_filter_usage usage) +{ + if (!name) + return NULL; + + for (int i = 0; i < pl_num_filter_configs; i++) { + if ((pl_filter_configs[i]->allowed & usage) != usage) + continue; + if (strcmp(name, pl_filter_configs[i]->name) == 0) + return pl_filter_configs[i]; + } + + return NULL; +} + +// Backwards compatibility with older API + +const struct pl_filter_function_preset pl_filter_function_presets[] = { + {"none", NULL}, + {"box", &pl_filter_function_box}, + {"dirichlet", &filter_function_dirichlet}, // alias + {"triangle", &pl_filter_function_triangle}, + {"cosine", &pl_filter_function_cosine}, + {"hann", &pl_filter_function_hann}, + {"hanning", &filter_function_hanning}, // alias + {"hamming", &pl_filter_function_hamming}, + {"welch", &pl_filter_function_welch}, + {"kaiser", &pl_filter_function_kaiser}, + {"blackman", &pl_filter_function_blackman}, + {"bohman", &pl_filter_function_bohman}, + {"gaussian", &pl_filter_function_gaussian}, + {"quadratic", &pl_filter_function_quadratic}, + {"quadric", &filter_function_quadric}, // alias + {"sinc", &pl_filter_function_sinc}, + {"jinc", &pl_filter_function_jinc}, + {"sphinx", &pl_filter_function_sphinx}, + {"cubic", &pl_filter_function_cubic}, + {"bicubic", &filter_function_bicubic}, // alias + {"bcspline", &filter_function_bcspline}, // alias + {"hermite", &pl_filter_function_hermite}, + {"spline16", &pl_filter_function_spline16}, + {"spline36", &pl_filter_function_spline36}, + {"spline64", &pl_filter_function_spline64}, + {0}, +}; + +const int pl_num_filter_function_presets = PL_ARRAY_SIZE(pl_filter_function_presets) - 1; + +const struct pl_filter_function_preset 
*pl_find_filter_function_preset(const char *name) +{ + if (!name) + return NULL; + + for (int i = 0; pl_filter_function_presets[i].name; i++) { + if (strcmp(pl_filter_function_presets[i].name, name) == 0) + return &pl_filter_function_presets[i]; + } + + return NULL; +} + +const struct pl_filter_preset *pl_find_filter_preset(const char *name) +{ + if (!name) + return NULL; + + for (int i = 0; pl_filter_presets[i].name; i++) { + if (strcmp(pl_filter_presets[i].name, name) == 0) + return &pl_filter_presets[i]; + } + + return NULL; +} + +const struct pl_filter_preset pl_filter_presets[] = { + {"none", NULL, "Built-in sampling"}, + COMMON_FILTER_PRESETS, + {0} +}; + +const int pl_num_filter_presets = PL_ARRAY_SIZE(pl_filter_presets) - 1; diff --git a/src/filters.h b/src/filters.h new file mode 100644 index 0000000..c3227db --- /dev/null +++ b/src/filters.h @@ -0,0 +1,58 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <libplacebo/filters.h> + +static inline float pl_filter_radius_bound(const struct pl_filter_config *c) +{ + const float r = c->radius && c->kernel->resizable ? c->radius : c->kernel->radius; + return c->blur > 0.0 ? 
r * c->blur : r; +} + +#define COMMON_FILTER_PRESETS \ + /* Highest priority / recommended filters */ \ + {"bilinear", &pl_filter_bilinear, "Bilinear"}, \ + {"nearest", &pl_filter_nearest, "Nearest neighbour"}, \ + {"bicubic", &pl_filter_bicubic, "Bicubic"}, \ + {"lanczos", &pl_filter_lanczos, "Lanczos"}, \ + {"ewa_lanczos", &pl_filter_ewa_lanczos, "Jinc (EWA Lanczos)"}, \ + {"ewa_lanczossharp", &pl_filter_ewa_lanczossharp, "Sharpened Jinc"}, \ + {"ewa_lanczos4sharpest",&pl_filter_ewa_lanczos4sharpest, "Sharpened Jinc-AR, 4 taps"},\ + {"gaussian", &pl_filter_gaussian, "Gaussian"}, \ + {"spline16", &pl_filter_spline16, "Spline (2 taps)"}, \ + {"spline36", &pl_filter_spline36, "Spline (3 taps)"}, \ + {"spline64", &pl_filter_spline64, "Spline (4 taps)"}, \ + {"mitchell", &pl_filter_mitchell, "Mitchell-Netravali"}, \ + \ + /* Remaining filters */ \ + {"sinc", &pl_filter_sinc, "Sinc (unwindowed)"}, \ + {"ginseng", &pl_filter_ginseng, "Ginseng (Jinc-Sinc)"}, \ + {"ewa_jinc", &pl_filter_ewa_jinc, "EWA Jinc (unwindowed)"}, \ + {"ewa_ginseng", &pl_filter_ewa_ginseng, "EWA Ginseng"}, \ + {"ewa_hann", &pl_filter_ewa_hann, "EWA Hann"}, \ + {"hermite", &pl_filter_hermite, "Hermite"}, \ + {"catmull_rom", &pl_filter_catmull_rom, "Catmull-Rom"}, \ + {"robidoux", &pl_filter_robidoux, "Robidoux"}, \ + {"robidouxsharp", &pl_filter_robidouxsharp, "RobidouxSharp"}, \ + {"ewa_robidoux", &pl_filter_ewa_robidoux, "EWA Robidoux"}, \ + {"ewa_robidouxsharp", &pl_filter_ewa_robidouxsharp, "EWA RobidouxSharp"}, \ + \ + /* Aliases */ \ + {"triangle", &pl_filter_bilinear}, \ + {"ewa_hanning", &pl_filter_ewa_hann} diff --git a/src/format.c b/src/format.c new file mode 100644 index 0000000..458d493 --- /dev/null +++ b/src/format.c @@ -0,0 +1,205 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" + +void pl_str_append_asprintf_c(void *alloc, pl_str *str, const char *fmt, ...) 
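This function (together with pl_str_append_memprintf_c below) handles only the conversion specifiers libplacebo itself emits (%%, %s, %.*s, %c, %d, %hx, %u, %llu, %lld, %zu and %f) and aborts on anything else. A minimal usage sketch (hypothetical caller; assumes a zero-initialized pl_str and a NULL parent allocation are acceptable here):

    pl_str str = {0};
    pl_str_append_asprintf_c(NULL, &str, "%s: %d frames in %f ms", "decode", 42, 1.75);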
+{ + va_list ap; + va_start(ap, fmt); + pl_str_append_vasprintf_c(alloc, str, fmt, ap); + va_end(ap); +} + +void pl_str_append_vasprintf_c(void *alloc, pl_str *str, const char *fmt, + va_list ap) +{ + for (const char *c; (c = strchr(fmt, '%')) != NULL; fmt = c + 1) { + // Append the preceding string literal + pl_str_append_raw(alloc, str, fmt, c - fmt); + c++; // skip '%' + + char buf[32]; + int len; + + // The format character follows the % sign + switch (c[0]) { + case '%': + pl_str_append_raw(alloc, str, c, 1); + continue; + case 's': { + const char *arg = va_arg(ap, const char *); + pl_str_append_raw(alloc, str, arg, strlen(arg)); + continue; + } + case '.': { // only used for %.*s + assert(c[1] == '*'); + assert(c[2] == 's'); + len = va_arg(ap, int); + pl_str_append_raw(alloc, str, va_arg(ap, char *), len); + c += 2; // skip '*s' + continue; + } + case 'c': + buf[0] = (char) va_arg(ap, int); + len = 1; + break; + case 'd': + len = pl_str_print_int(buf, sizeof(buf), va_arg(ap, int)); + break; + case 'h': ; // only used for %hx + assert(c[1] == 'x'); + len = pl_str_print_hex(buf, sizeof(buf), (unsigned short) va_arg(ap, unsigned int)); + c++; + break; + case 'u': + len = pl_str_print_uint(buf, sizeof(buf), va_arg(ap, unsigned int)); + break; + case 'l': + assert(c[1] == 'l'); + switch (c[2]) { + case 'u': + len = pl_str_print_uint64(buf, sizeof(buf), va_arg(ap, unsigned long long)); + break; + case 'd': + len = pl_str_print_int64(buf, sizeof(buf), va_arg(ap, long long)); + break; + default: pl_unreachable(); + } + c += 2; + break; + case 'z': + assert(c[1] == 'u'); + len = pl_str_print_uint64(buf, sizeof(buf), va_arg(ap, size_t)); + c++; + break; + case 'f': + len = pl_str_print_double(buf, sizeof(buf), va_arg(ap, double)); + break; + default: + fprintf(stderr, "Invalid conversion character: '%c'!\n", c[0]); + abort(); + } + + pl_str_append_raw(alloc, str, buf, len); + } + + // Append the remaining string literal + pl_str_append(alloc, str, pl_str0(fmt)); +} + +size_t pl_str_append_memprintf_c(void *alloc, pl_str *str, const char *fmt, + const void *args) +{ + const uint8_t *ptr = args; + + for (const char *c; (c = strchr(fmt, '%')) != NULL; fmt = c + 1) { + pl_str_append_raw(alloc, str, fmt, c - fmt); + c++; + + char buf[32]; + int len; + +#define LOAD(var) \ + do { \ + memcpy(&(var), ptr, sizeof(var)); \ + ptr += sizeof(var); \ + } while (0) + + switch (c[0]) { + case '%': + pl_str_append_raw(alloc, str, c, 1); + continue; + case 's': { + len = strlen((const char *) ptr); + pl_str_append_raw(alloc, str, ptr, len); + ptr += len + 1; // also skip \0 + continue; + } + case '.': { + assert(c[1] == '*'); + assert(c[2] == 's'); + LOAD(len); + pl_str_append_raw(alloc, str, ptr, len); + ptr += len; // no trailing \0 + c += 2; + continue; + } + case 'c': + LOAD(buf[0]); + len = 1; + break; + case 'd': ; + int d; + LOAD(d); + len = pl_str_print_int(buf, sizeof(buf), d); + break; + case 'h': ; + assert(c[1] == 'x'); + unsigned short hx; + LOAD(hx); + len = pl_str_print_hex(buf, sizeof(buf), hx); + c++; + break; + case 'u': ; + unsigned u; + LOAD(u); + len = pl_str_print_uint(buf, sizeof(buf), u); + break; + case 'l': + assert(c[1] == 'l'); + switch (c[2]) { + case 'u': ; + long long unsigned llu; + LOAD(llu); + len = pl_str_print_uint64(buf, sizeof(buf), llu); + break; + case 'd': ; + long long int lld; + LOAD(lld); + len = pl_str_print_int64(buf, sizeof(buf), lld); + break; + default: pl_unreachable(); + } + c += 2; + break; + case 'z': ; + assert(c[1] == 'u'); + size_t zu; + LOAD(zu); + len = 
pl_str_print_uint64(buf, sizeof(buf), zu); + c++; + break; + case 'f': ; + double f; + LOAD(f); + len = pl_str_print_double(buf, sizeof(buf), f); + break; + default: + fprintf(stderr, "Invalid conversion character: '%c'!\n", c[0]); + abort(); + } + + pl_str_append_raw(alloc, str, buf, len); + } +#undef LOAD + + pl_str_append(alloc, str, pl_str0(fmt)); + return (uintptr_t) ptr - (uintptr_t) args; +} diff --git a/src/gamut_mapping.c b/src/gamut_mapping.c new file mode 100644 index 0000000..e80d0a7 --- /dev/null +++ b/src/gamut_mapping.c @@ -0,0 +1,1008 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" +#include "pl_thread.h" + +#include <libplacebo/gamut_mapping.h> + +#define fclampf(x, lo, hi) fminf(fmaxf(x, lo), hi) +static void fix_constants(struct pl_gamut_map_constants *c) +{ + c->perceptual_deadzone = fclampf(c->perceptual_deadzone, 0.0f, 1.0f); + c->perceptual_strength = fclampf(c->perceptual_strength, 0.0f, 1.0f); + c->colorimetric_gamma = fclampf(c->colorimetric_gamma, 0.0f, 10.0f); + c->softclip_knee = fclampf(c->softclip_knee, 0.0f, 1.0f); + c->softclip_desat = fclampf(c->softclip_desat, 0.0f, 1.0f); +} + +static inline bool constants_equal(const struct pl_gamut_map_constants *a, + const struct pl_gamut_map_constants *b) +{ + pl_static_assert(sizeof(*a) % sizeof(float) == 0); + return !memcmp(a, b, sizeof(*a)); +} + +bool pl_gamut_map_params_equal(const struct pl_gamut_map_params *a, + const struct pl_gamut_map_params *b) +{ + return a->function == b->function && + a->min_luma == b->min_luma && + a->max_luma == b->max_luma && + a->lut_size_I == b->lut_size_I && + a->lut_size_C == b->lut_size_C && + a->lut_size_h == b->lut_size_h && + a->lut_stride == b->lut_stride && + constants_equal(&a->constants, &b->constants) && + pl_raw_primaries_equal(&a->input_gamut, &b->input_gamut) && + pl_raw_primaries_equal(&a->output_gamut, &b->output_gamut); +} + +#define FUN(params) (params->function ? 
*params->function : pl_gamut_map_clip) + +static void noop(float *lut, const struct pl_gamut_map_params *params); +bool pl_gamut_map_params_noop(const struct pl_gamut_map_params *params) +{ + if (FUN(params).map == &noop) + return true; + + struct pl_raw_primaries src = params->input_gamut, dst = params->output_gamut; + if (!pl_primaries_compatible(&dst, &src)) + return true; + + bool need_map = !pl_primaries_superset(&dst, &src); + need_map |= !pl_cie_xy_equal(&src.white, &dst.white); + if (FUN(params).bidirectional) + need_map |= !pl_raw_primaries_equal(&dst, &src); + + return !need_map; +} + +// For some minimal type safety, and code cleanliness +struct RGB { + float R, G, B; +}; + +struct IPT { + float I, P, T; +}; + +struct ICh { + float I, C, h; +}; + +static inline struct ICh ipt2ich(struct IPT c) +{ + return (struct ICh) { + .I = c.I, + .C = sqrtf(c.P * c.P + c.T * c.T), + .h = atan2f(c.T, c.P), + }; +} + +static inline struct IPT ich2ipt(struct ICh c) +{ + return (struct IPT) { + .I = c.I, + .P = c.C * cosf(c.h), + .T = c.C * sinf(c.h), + }; +} + +static const float PQ_M1 = 2610./4096 * 1./4, + PQ_M2 = 2523./4096 * 128, + PQ_C1 = 3424./4096, + PQ_C2 = 2413./4096 * 32, + PQ_C3 = 2392./4096 * 32; + +enum { PQ_LUT_SIZE = 1024 }; +static const float pq_eotf_lut[1024+1] = { + 0.0000000e+00f, 4.0422718e-09f, 1.3111372e-08f, 2.6236826e-08f, 4.3151495e-08f, 6.3746885e-08f, 8.7982383e-08f, 1.1585362e-07f, + 1.4737819e-07f, 1.8258818e-07f, 2.2152586e-07f, 2.6424098e-07f, 3.1078907e-07f, 3.6123021e-07f, 4.1562821e-07f, 4.7405001e-07f, + 5.3656521e-07f, 6.0324583e-07f, 6.7416568e-07f, 7.4940095e-07f, 8.2902897e-07f, 9.1312924e-07f, 1.0017822e-06f, 1.0950702e-06f, + 1.1930764e-06f, 1.2958861e-06f, 1.4035847e-06f, 1.5162600e-06f, 1.6340000e-06f, 1.7568948e-06f, 1.8850346e-06f, 2.0185119e-06f, + 2.1574192e-06f, 2.3018509e-06f, 2.4519029e-06f, 2.6076704e-06f, 2.7692516e-06f, 2.9367449e-06f, 3.1102509e-06f, 3.2898690e-06f, + 3.4757019e-06f, 3.6678526e-06f, 3.8664261e-06f, 4.0715262e-06f, 4.2832601e-06f, 4.5017354e-06f, 4.7270617e-06f, 4.9593473e-06f, + 5.1987040e-06f, 5.4452441e-06f, 5.6990819e-06f, 5.9603301e-06f, 6.2291055e-06f, 6.5055251e-06f, 6.7897080e-06f, 7.0817717e-06f, + 7.3818379e-06f, 7.6900283e-06f, 8.0064675e-06f, 8.3312774e-06f, 8.6645849e-06f, 9.0065169e-06f, 9.3572031e-06f, 9.7167704e-06f, + 1.0085351e-05f, 1.0463077e-05f, 1.0850082e-05f, 1.1246501e-05f, 1.1652473e-05f, 1.2068130e-05f, 1.2493614e-05f, 1.2929066e-05f, + 1.3374626e-05f, 1.3830439e-05f, 1.4296648e-05f, 1.4773401e-05f, 1.5260848e-05f, 1.5759132e-05f, 1.6268405e-05f, 1.6788821e-05f, + 1.7320534e-05f, 1.7863697e-05f, 1.8418467e-05f, 1.8985004e-05f, 1.9563470e-05f, 2.0154019e-05f, 2.0756818e-05f, 2.1372031e-05f, + 2.1999824e-05f, 2.2640365e-05f, 2.3293824e-05f, 2.3960372e-05f, 2.4640186e-05f, 2.5333431e-05f, 2.6040288e-05f, 2.6760935e-05f, + 2.7495552e-05f, 2.8244319e-05f, 2.9007421e-05f, 2.9785041e-05f, 3.0577373e-05f, 3.1384594e-05f, 3.2206899e-05f, 3.3044481e-05f, + 3.3897533e-05f, 3.4766253e-05f, 3.5650838e-05f, 3.6551487e-05f, 3.7468409e-05f, 3.8401794e-05f, 3.9351855e-05f, 4.0318799e-05f, + 4.1302836e-05f, 4.2304177e-05f, 4.3323036e-05f, 4.4359629e-05f, 4.5414181e-05f, 4.6486897e-05f, 4.7578006e-05f, 4.8687732e-05f, + 4.9816302e-05f, 5.0963944e-05f, 5.2130889e-05f, 5.3317369e-05f, 5.4523628e-05f, 5.5749886e-05f, 5.6996391e-05f, 5.8263384e-05f, + 5.9551111e-05f, 6.0859816e-05f, 6.2189750e-05f, 6.3541162e-05f, 6.4914307e-05f, 6.6309439e-05f, 6.7726819e-05f, 6.9166705e-05f, + 7.0629384e-05f, 7.2115077e-05f, 
7.3624074e-05f, 7.5156646e-05f, 7.6713065e-05f, 7.8293608e-05f, 7.9898553e-05f, 8.1528181e-05f, + 8.3182776e-05f, 8.4862623e-05f, 8.6568012e-05f, 8.8299235e-05f, 9.0056585e-05f, 9.1840360e-05f, 9.3650860e-05f, 9.5488388e-05f, + 9.7353277e-05f, 9.9245779e-05f, 1.0116623e-04f, 1.0311496e-04f, 1.0509226e-04f, 1.0709847e-04f, 1.0913391e-04f, 1.1119889e-04f, + 1.1329376e-04f, 1.1541885e-04f, 1.1757448e-04f, 1.1976100e-04f, 1.2197875e-04f, 1.2422807e-04f, 1.2650931e-04f, 1.2882282e-04f, + 1.3116900e-04f, 1.3354812e-04f, 1.3596059e-04f, 1.3840676e-04f, 1.4088701e-04f, 1.4340170e-04f, 1.4595121e-04f, 1.4853593e-04f, + 1.5115622e-04f, 1.5381247e-04f, 1.5650507e-04f, 1.5923442e-04f, 1.6200090e-04f, 1.6480492e-04f, 1.6764688e-04f, 1.7052718e-04f, + 1.7344629e-04f, 1.7640451e-04f, 1.7940233e-04f, 1.8244015e-04f, 1.8551840e-04f, 1.8863752e-04f, 1.9179792e-04f, 1.9500006e-04f, + 1.9824437e-04f, 2.0153130e-04f, 2.0486129e-04f, 2.0823479e-04f, 2.1165227e-04f, 2.1511419e-04f, 2.1862101e-04f, 2.2217319e-04f, + 2.2577128e-04f, 2.2941563e-04f, 2.3310679e-04f, 2.3684523e-04f, 2.4063146e-04f, 2.4446597e-04f, 2.4834925e-04f, 2.5228182e-04f, + 2.5626417e-04f, 2.6029683e-04f, 2.6438031e-04f, 2.6851514e-04f, 2.7270184e-04f, 2.7694094e-04f, 2.8123299e-04f, 2.8557852e-04f, + 2.8997815e-04f, 2.9443230e-04f, 2.9894159e-04f, 3.0350657e-04f, 3.0812783e-04f, 3.1280593e-04f, 3.1754144e-04f, 3.2233495e-04f, + 3.2718705e-04f, 3.3209833e-04f, 3.3706938e-04f, 3.4210082e-04f, 3.4719324e-04f, 3.5234727e-04f, 3.5756351e-04f, 3.6284261e-04f, + 3.6818526e-04f, 3.7359195e-04f, 3.7906340e-04f, 3.8460024e-04f, 3.9020315e-04f, 3.9587277e-04f, 4.0160977e-04f, 4.0741483e-04f, + 4.1328861e-04f, 4.1923181e-04f, 4.2524511e-04f, 4.3132921e-04f, 4.3748480e-04f, 4.4371260e-04f, 4.5001332e-04f, 4.5638768e-04f, + 4.6283650e-04f, 4.6936032e-04f, 4.7595999e-04f, 4.8263624e-04f, 4.8938982e-04f, 4.9622151e-04f, 5.0313205e-04f, 5.1012223e-04f, + 5.1719283e-04f, 5.2434463e-04f, 5.3157843e-04f, 5.3889502e-04f, 5.4629521e-04f, 5.5377982e-04f, 5.6134968e-04f, 5.6900560e-04f, + 5.7674843e-04f, 5.8457900e-04f, 5.9249818e-04f, 6.0050682e-04f, 6.0860578e-04f, 6.1679595e-04f, 6.2507819e-04f, 6.3345341e-04f, + 6.4192275e-04f, 6.5048661e-04f, 6.5914616e-04f, 6.6790231e-04f, 6.7675600e-04f, 6.8570816e-04f, 6.9475975e-04f, 7.0391171e-04f, + 7.1316500e-04f, 7.2252060e-04f, 7.3197948e-04f, 7.4154264e-04f, 7.5121107e-04f, 7.6098577e-04f, 7.7086777e-04f, 7.8085807e-04f, + 7.9095772e-04f, 8.0116775e-04f, 8.1148922e-04f, 8.2192318e-04f, 8.3247071e-04f, 8.4313287e-04f, 8.5391076e-04f, 8.6480548e-04f, + 8.7581812e-04f, 8.8694982e-04f, 8.9820168e-04f, 9.0957485e-04f, 9.2107048e-04f, 9.3268971e-04f, 9.4443372e-04f, 9.5630368e-04f, + 9.6830115e-04f, 9.8042658e-04f, 9.9268155e-04f, 1.0050673e-03f, 1.0175850e-03f, 1.0302359e-03f, 1.0430213e-03f, 1.0559425e-03f, + 1.0690006e-03f, 1.0821970e-03f, 1.0955331e-03f, 1.1090100e-03f, 1.1226290e-03f, 1.1363917e-03f, 1.1502992e-03f, 1.1643529e-03f, + 1.1785542e-03f, 1.1929044e-03f, 1.2074050e-03f, 1.2220573e-03f, 1.2368628e-03f, 1.2518229e-03f, 1.2669390e-03f, 1.2822125e-03f, + 1.2976449e-03f, 1.3132377e-03f, 1.3289925e-03f, 1.3449105e-03f, 1.3609935e-03f, 1.3772429e-03f, 1.3936602e-03f, 1.4102470e-03f, + 1.4270054e-03f, 1.4439360e-03f, 1.4610407e-03f, 1.4783214e-03f, 1.4957794e-03f, 1.5134166e-03f, 1.5312345e-03f, 1.5492348e-03f, + 1.5674192e-03f, 1.5857894e-03f, 1.6043471e-03f, 1.6230939e-03f, 1.6420317e-03f, 1.6611622e-03f, 1.6804871e-03f, 1.7000083e-03f, + 1.7197275e-03f, 1.7396465e-03f, 1.7597672e-03f, 1.7800914e-03f, 
1.8006210e-03f, 1.8213578e-03f, 1.8423038e-03f, 1.8634608e-03f, + 1.8848308e-03f, 1.9064157e-03f, 1.9282175e-03f, 1.9502381e-03f, 1.9724796e-03f, 1.9949439e-03f, 2.0176331e-03f, 2.0405492e-03f, + 2.0636950e-03f, 2.0870711e-03f, 2.1106805e-03f, 2.1345250e-03f, 2.1586071e-03f, 2.1829286e-03f, 2.2074919e-03f, 2.2322992e-03f, + 2.2573525e-03f, 2.2826542e-03f, 2.3082066e-03f, 2.3340118e-03f, 2.3600721e-03f, 2.3863900e-03f, 2.4129676e-03f, 2.4398074e-03f, + 2.4669117e-03f, 2.4942828e-03f, 2.5219233e-03f, 2.5498355e-03f, 2.5780219e-03f, 2.6064849e-03f, 2.6352271e-03f, 2.6642509e-03f, + 2.6935589e-03f, 2.7231536e-03f, 2.7530377e-03f, 2.7832137e-03f, 2.8136843e-03f, 2.8444520e-03f, 2.8755196e-03f, 2.9068898e-03f, + 2.9385662e-03f, 2.9705496e-03f, 3.0028439e-03f, 3.0354517e-03f, 3.0683758e-03f, 3.1016192e-03f, 3.1351846e-03f, 3.1690750e-03f, + 3.2032932e-03f, 3.2378422e-03f, 3.2727250e-03f, 3.3079445e-03f, 3.3435038e-03f, 3.3794058e-03f, 3.4156537e-03f, 3.4522505e-03f, + 3.4891993e-03f, 3.5265034e-03f, 3.5641658e-03f, 3.6021897e-03f, 3.6405785e-03f, 3.6793353e-03f, 3.7184634e-03f, 3.7579661e-03f, + 3.7978468e-03f, 3.8381088e-03f, 3.8787555e-03f, 3.9197904e-03f, 3.9612169e-03f, 4.0030385e-03f, 4.0452587e-03f, 4.0878810e-03f, + 4.1309104e-03f, 4.1743478e-03f, 4.2181981e-03f, 4.2624651e-03f, 4.3071525e-03f, 4.3522639e-03f, 4.3978031e-03f, 4.4437739e-03f, + 4.4901803e-03f, 4.5370259e-03f, 4.5843148e-03f, 4.6320508e-03f, 4.6802379e-03f, 4.7288801e-03f, 4.7779815e-03f, 4.8275461e-03f, + 4.8775780e-03f, 4.9280813e-03f, 4.9790603e-03f, 5.0305191e-03f, 5.0824620e-03f, 5.1348933e-03f, 5.1878172e-03f, 5.2412382e-03f, + 5.2951607e-03f, 5.3495890e-03f, 5.4045276e-03f, 5.4599811e-03f, 5.5159540e-03f, 5.5724510e-03f, 5.6294765e-03f, 5.6870353e-03f, + 5.7451339e-03f, 5.8037735e-03f, 5.8629606e-03f, 5.9227001e-03f, 5.9829968e-03f, 6.0438557e-03f, 6.1052818e-03f, 6.1672799e-03f, + 6.2298552e-03f, 6.2930128e-03f, 6.3567578e-03f, 6.4210953e-03f, 6.4860306e-03f, 6.5515690e-03f, 6.6177157e-03f, 6.6844762e-03f, + 6.7518558e-03f, 6.8198599e-03f, 6.8884942e-03f, 6.9577641e-03f, 7.0276752e-03f, 7.0982332e-03f, 7.1694438e-03f, 7.2413127e-03f, + 7.3138457e-03f, 7.3870486e-03f, 7.4609273e-03f, 7.5354878e-03f, 7.6107361e-03f, 7.6866782e-03f, 7.7633203e-03f, 7.8406684e-03f, + 7.9187312e-03f, 7.9975101e-03f, 8.0770139e-03f, 8.1572490e-03f, 8.2382216e-03f, 8.3199385e-03f, 8.4024059e-03f, 8.4856307e-03f, + 8.5696193e-03f, 8.6543786e-03f, 8.7399153e-03f, 8.8262362e-03f, 8.9133482e-03f, 9.0012582e-03f, 9.0899733e-03f, 9.1795005e-03f, + 9.2698470e-03f, 9.3610199e-03f, 9.4530265e-03f, 9.5458741e-03f, 9.6395701e-03f, 9.7341219e-03f, 9.8295370e-03f, 9.9258231e-03f, + 1.0022988e-02f, 1.0121039e-02f, 1.0219984e-02f, 1.0319830e-02f, 1.0420587e-02f, 1.0522261e-02f, 1.0624862e-02f, 1.0728396e-02f, + 1.0832872e-02f, 1.0938299e-02f, 1.1044684e-02f, 1.1152036e-02f, 1.1260365e-02f, 1.1369677e-02f, 1.1479982e-02f, 1.1591288e-02f, + 1.1703605e-02f, 1.1816941e-02f, 1.1931305e-02f, 1.2046706e-02f, 1.2163153e-02f, 1.2280656e-02f, 1.2399223e-02f, 1.2518864e-02f, + 1.2639596e-02f, 1.2761413e-02f, 1.2884333e-02f, 1.3008365e-02f, 1.3133519e-02f, 1.3259804e-02f, 1.3387231e-02f, 1.3515809e-02f, + 1.3645549e-02f, 1.3776461e-02f, 1.3908555e-02f, 1.4041841e-02f, 1.4176331e-02f, 1.4312034e-02f, 1.4448961e-02f, 1.4587123e-02f, + 1.4726530e-02f, 1.4867194e-02f, 1.5009126e-02f, 1.5152336e-02f, 1.5296837e-02f, 1.5442638e-02f, 1.5589753e-02f, 1.5738191e-02f, + 1.5887965e-02f, 1.6039087e-02f, 1.6191567e-02f, 1.6345419e-02f, 1.6500655e-02f, 1.6657285e-02f, 
1.6815323e-02f, 1.6974781e-02f, + 1.7135672e-02f, 1.7298007e-02f, 1.7461800e-02f, 1.7627063e-02f, 1.7793810e-02f, 1.7962053e-02f, 1.8131805e-02f, 1.8303080e-02f, + 1.8475891e-02f, 1.8650252e-02f, 1.8826176e-02f, 1.9003676e-02f, 1.9182767e-02f, 1.9363463e-02f, 1.9545777e-02f, 1.9729724e-02f, + 1.9915319e-02f, 2.0102575e-02f, 2.0291507e-02f, 2.0482131e-02f, 2.0674460e-02f, 2.0868510e-02f, 2.1064296e-02f, 2.1261833e-02f, + 2.1461136e-02f, 2.1662222e-02f, 2.1865105e-02f, 2.2069802e-02f, 2.2276328e-02f, 2.2484700e-02f, 2.2694934e-02f, 2.2907045e-02f, + 2.3121064e-02f, 2.3336982e-02f, 2.3554827e-02f, 2.3774618e-02f, 2.3996370e-02f, 2.4220102e-02f, 2.4445831e-02f, 2.4673574e-02f, + 2.4903349e-02f, 2.5135174e-02f, 2.5369067e-02f, 2.5605046e-02f, 2.5843129e-02f, 2.6083336e-02f, 2.6325684e-02f, 2.6570192e-02f, + 2.6816880e-02f, 2.7065767e-02f, 2.7316872e-02f, 2.7570215e-02f, 2.7825815e-02f, 2.8083692e-02f, 2.8343867e-02f, 2.8606359e-02f, + 2.8871189e-02f, 2.9138378e-02f, 2.9407946e-02f, 2.9679914e-02f, 2.9954304e-02f, 3.0231137e-02f, 3.0510434e-02f, 3.0792217e-02f, + 3.1076508e-02f, 3.1363330e-02f, 3.1652704e-02f, 3.1944653e-02f, 3.2239199e-02f, 3.2536367e-02f, 3.2836178e-02f, 3.3138657e-02f, + 3.3443826e-02f, 3.3751710e-02f, 3.4062333e-02f, 3.4375718e-02f, 3.4691890e-02f, 3.5010874e-02f, 3.5332694e-02f, 3.5657377e-02f, + 3.5984946e-02f, 3.6315428e-02f, 3.6648848e-02f, 3.6985233e-02f, 3.7324608e-02f, 3.7667000e-02f, 3.8012436e-02f, 3.8360942e-02f, + 3.8712547e-02f, 3.9067276e-02f, 3.9425159e-02f, 3.9786223e-02f, 4.0150496e-02f, 4.0518006e-02f, 4.0888783e-02f, 4.1262855e-02f, + 4.1640274e-02f, 4.2021025e-02f, 4.2405159e-02f, 4.2792707e-02f, 4.3183699e-02f, 4.3578166e-02f, 4.3976138e-02f, 4.4377647e-02f, + 4.4782724e-02f, 4.5191401e-02f, 4.5603709e-02f, 4.6019681e-02f, 4.6439350e-02f, 4.6862749e-02f, 4.7289910e-02f, 4.7720867e-02f, + 4.8155654e-02f, 4.8594305e-02f, 4.9036854e-02f, 4.9483336e-02f, 4.9933787e-02f, 5.0388240e-02f, 5.0846733e-02f, 5.1309301e-02f, + 5.1775981e-02f, 5.2246808e-02f, 5.2721821e-02f, 5.3201056e-02f, 5.3684551e-02f, 5.4172344e-02f, 5.4664473e-02f, 5.5160978e-02f, + 5.5661897e-02f, 5.6167269e-02f, 5.6677135e-02f, 5.7191535e-02f, 5.7710508e-02f, 5.8234097e-02f, 5.8762342e-02f, 5.9295285e-02f, + 5.9832968e-02f, 6.0375433e-02f, 6.0922723e-02f, 6.1474882e-02f, 6.2031952e-02f, 6.2593979e-02f, 6.3161006e-02f, 6.3733078e-02f, + 6.4310241e-02f, 6.4892540e-02f, 6.5480021e-02f, 6.6072730e-02f, 6.6670715e-02f, 6.7274023e-02f, 6.7882702e-02f, 6.8496800e-02f, + 6.9116365e-02f, 6.9741447e-02f, 7.0372096e-02f, 7.1008361e-02f, 7.1650293e-02f, 7.2297942e-02f, 7.2951361e-02f, 7.3610602e-02f, + 7.4275756e-02f, 7.4946797e-02f, 7.5623818e-02f, 7.6306873e-02f, 7.6996016e-02f, 7.7691302e-02f, 7.8392787e-02f, 7.9100526e-02f, + 7.9814576e-02f, 8.0534993e-02f, 8.1261837e-02f, 8.1995163e-02f, 8.2735032e-02f, 8.3481501e-02f, 8.4234632e-02f, 8.4994483e-02f, + 8.5761116e-02f, 8.6534592e-02f, 8.7314974e-02f, 8.8102323e-02f, 8.8896702e-02f, 8.9698176e-02f, 9.0506809e-02f, 9.1322665e-02f, + 9.2145810e-02f, 9.2976310e-02f, 9.3814232e-02f, 9.4659643e-02f, 9.5512612e-02f, 9.6373206e-02f, 9.7241496e-02f, 9.8117550e-02f, + 9.9001441e-02f, 9.9893238e-02f, 1.0079301e-01f, 1.0170084e-01f, 1.0261679e-01f, 1.0354094e-01f, 1.0447337e-01f, 1.0541414e-01f, + 1.0636334e-01f, 1.0732104e-01f, 1.0828731e-01f, 1.0926225e-01f, 1.1024592e-01f, 1.1123841e-01f, 1.1223979e-01f, 1.1325016e-01f, + 1.1426958e-01f, 1.1529814e-01f, 1.1633594e-01f, 1.1738304e-01f, 1.1843954e-01f, 1.1950552e-01f, 1.2058107e-01f, 1.2166627e-01f, + 
1.2276122e-01f, 1.2386601e-01f, 1.2498072e-01f, 1.2610544e-01f, 1.2724027e-01f, 1.2838531e-01f, 1.2954063e-01f, 1.3070635e-01f, + 1.3188262e-01f, 1.3306940e-01f, 1.3426686e-01f, 1.3547509e-01f, 1.3669420e-01f, 1.3792428e-01f, 1.3916544e-01f, 1.4041778e-01f, + 1.4168140e-01f, 1.4295640e-01f, 1.4424289e-01f, 1.4554098e-01f, 1.4685078e-01f, 1.4817238e-01f, 1.4950591e-01f, 1.5085147e-01f, + 1.5220916e-01f, 1.5357912e-01f, 1.5496144e-01f, 1.5635624e-01f, 1.5776364e-01f, 1.5918375e-01f, 1.6061670e-01f, 1.6206260e-01f, + 1.6352156e-01f, 1.6499372e-01f, 1.6647920e-01f, 1.6797811e-01f, 1.6949059e-01f, 1.7101676e-01f, 1.7255674e-01f, 1.7411067e-01f, + 1.7567867e-01f, 1.7726087e-01f, 1.7885742e-01f, 1.8046844e-01f, 1.8209406e-01f, 1.8373443e-01f, 1.8538967e-01f, 1.8705994e-01f, + 1.8874536e-01f, 1.9044608e-01f, 1.9216225e-01f, 1.9389401e-01f, 1.9564150e-01f, 1.9740486e-01f, 1.9918426e-01f, 2.0097984e-01f, + 2.0279175e-01f, 2.0462014e-01f, 2.0646517e-01f, 2.0832699e-01f, 2.1020577e-01f, 2.1210165e-01f, 2.1401481e-01f, 2.1594540e-01f, + 2.1789359e-01f, 2.1985954e-01f, 2.2184342e-01f, 2.2384540e-01f, 2.2586565e-01f, 2.2790434e-01f, 2.2996165e-01f, 2.3203774e-01f, + 2.3413293e-01f, 2.3624714e-01f, 2.3838068e-01f, 2.4053372e-01f, 2.4270646e-01f, 2.4489908e-01f, 2.4711177e-01f, 2.4934471e-01f, + 2.5159811e-01f, 2.5387214e-01f, 2.5616702e-01f, 2.5848293e-01f, 2.6082007e-01f, 2.6317866e-01f, 2.6555888e-01f, 2.6796095e-01f, + 2.7038507e-01f, 2.7283145e-01f, 2.7530031e-01f, 2.7779186e-01f, 2.8030631e-01f, 2.8284388e-01f, 2.8540479e-01f, 2.8798927e-01f, + 2.9059754e-01f, 2.9322983e-01f, 2.9588635e-01f, 2.9856736e-01f, 3.0127308e-01f, 3.0400374e-01f, 3.0675959e-01f, 3.0954086e-01f, + 3.1234780e-01f, 3.1518066e-01f, 3.1803969e-01f, 3.2092512e-01f, 3.2383723e-01f, 3.2677625e-01f, 3.2974246e-01f, 3.3273611e-01f, + 3.3575747e-01f, 3.3880680e-01f, 3.4188437e-01f, 3.4499045e-01f, 3.4812533e-01f, 3.5128926e-01f, 3.5448255e-01f, 3.5770546e-01f, + 3.6095828e-01f, 3.6424131e-01f, 3.6755483e-01f, 3.7089914e-01f, 3.7427454e-01f, 3.7768132e-01f, 3.8111979e-01f, 3.8459027e-01f, + 3.8809304e-01f, 3.9162844e-01f, 3.9519678e-01f, 3.9879837e-01f, 4.0243354e-01f, 4.0610261e-01f, 4.0980592e-01f, 4.1354380e-01f, + 4.1731681e-01f, 4.2112483e-01f, 4.2496844e-01f, 4.2884798e-01f, 4.3276381e-01f, 4.3671627e-01f, 4.4070572e-01f, 4.4473253e-01f, + 4.4879706e-01f, 4.5289968e-01f, 4.5704076e-01f, 4.6122068e-01f, 4.6543981e-01f, 4.6969854e-01f, 4.7399727e-01f, 4.7833637e-01f, + 4.8271625e-01f, 4.8713731e-01f, 4.9159995e-01f, 4.9610458e-01f, 5.0065162e-01f, 5.0524147e-01f, 5.0987457e-01f, 5.1455133e-01f, + 5.1927219e-01f, 5.2403759e-01f, 5.2884795e-01f, 5.3370373e-01f, 5.3860537e-01f, 5.4355333e-01f, 5.4854807e-01f, 5.5359004e-01f, + 5.5867972e-01f, 5.6381757e-01f, 5.6900408e-01f, 5.7423972e-01f, 5.7952499e-01f, 5.8486037e-01f, 5.9024637e-01f, 5.9568349e-01f, + 6.0117223e-01f, 6.0671311e-01f, 6.1230664e-01f, 6.1795336e-01f, 6.2365379e-01f, 6.2940847e-01f, 6.3521793e-01f, 6.4108273e-01f, + 6.4700342e-01f, 6.5298056e-01f, 6.5901471e-01f, 6.6510643e-01f, 6.7125632e-01f, 6.7746495e-01f, 6.8373290e-01f, 6.9006078e-01f, + 6.9644918e-01f, 7.0289872e-01f, 7.0941001e-01f, 7.1598366e-01f, 7.2262031e-01f, 7.2932059e-01f, 7.3608513e-01f, 7.4291460e-01f, + 7.4981006e-01f, 7.5677134e-01f, 7.6379952e-01f, 7.7089527e-01f, 7.7805929e-01f, 7.8529226e-01f, 7.9259489e-01f, 7.9996786e-01f, + 8.0741191e-01f, 8.1492774e-01f, 8.2251609e-01f, 8.3017769e-01f, 8.3791329e-01f, 8.4572364e-01f, 8.5360950e-01f, 8.6157163e-01f, + 8.6961082e-01f, 8.7772786e-01f, 
8.8592352e-01f, 8.9419862e-01f, 9.0255397e-01f, 9.1099038e-01f, 9.1950869e-01f, 9.2810973e-01f, + 9.3679435e-01f, 9.4556340e-01f, 9.5441776e-01f, 9.6335829e-01f, 9.7238588e-01f, 9.8150143e-01f, 9.9070583e-01f, 1.0000000e+00f, + 1.0f, // extra padding to avoid out of bounds access +}; + +static inline float pq_eotf(float x) +{ + float idxf = fminf(fmaxf(x, 0.0f), 1.0f) * (PQ_LUT_SIZE - 1); + int ipart = floorf(idxf); + float fpart = idxf - ipart; + return PL_MIX(pq_eotf_lut[ipart], pq_eotf_lut[ipart + 1], fpart); +} + +static inline float pq_oetf(float x) +{ + x = powf(fmaxf(x, 0.0f), PQ_M1); + x = (PQ_C1 + PQ_C2 * x) / (1.0f + PQ_C3 * x); + return powf(x, PQ_M2); +} + +// Helper struct containing pre-computed cached values describing a gamut +struct gamut { + pl_matrix3x3 lms2rgb; + pl_matrix3x3 rgb2lms; + float min_luma, max_luma; // pq + float min_rgb, max_rgb; // 10k normalized + struct ICh *peak_cache; // 1-item cache for computed peaks (per hue) +}; + +struct cache { + struct ICh src_cache; + struct ICh dst_cache; +}; + +static void get_gamuts(struct gamut *dst, struct gamut *src, struct cache *cache, + const struct pl_gamut_map_params *params) +{ + const float epsilon = 1e-6; + memset(cache, 0, sizeof(*cache)); + struct gamut base = { + .min_luma = params->min_luma, + .max_luma = params->max_luma, + .min_rgb = pq_eotf(params->min_luma) - epsilon, + .max_rgb = pq_eotf(params->max_luma) + epsilon, + }; + + if (dst) { + *dst = base; + dst->lms2rgb = dst->rgb2lms = pl_ipt_rgb2lms(¶ms->output_gamut); + dst->peak_cache = &cache->dst_cache; + pl_matrix3x3_invert(&dst->lms2rgb); + } + + if (src) { + *src = base; + src->lms2rgb = src->rgb2lms = pl_ipt_rgb2lms(¶ms->input_gamut); + src->peak_cache = &cache->src_cache; + pl_matrix3x3_invert(&src->lms2rgb); + } +} + +static inline struct IPT rgb2ipt(struct RGB c, struct gamut gamut) +{ + const float L = gamut.rgb2lms.m[0][0] * c.R + + gamut.rgb2lms.m[0][1] * c.G + + gamut.rgb2lms.m[0][2] * c.B; + const float M = gamut.rgb2lms.m[1][0] * c.R + + gamut.rgb2lms.m[1][1] * c.G + + gamut.rgb2lms.m[1][2] * c.B; + const float S = gamut.rgb2lms.m[2][0] * c.R + + gamut.rgb2lms.m[2][1] * c.G + + gamut.rgb2lms.m[2][2] * c.B; + const float Lp = pq_oetf(L); + const float Mp = pq_oetf(M); + const float Sp = pq_oetf(S); + return (struct IPT) { + .I = 0.4000f * Lp + 0.4000f * Mp + 0.2000f * Sp, + .P = 4.4550f * Lp - 4.8510f * Mp + 0.3960f * Sp, + .T = 0.8056f * Lp + 0.3572f * Mp - 1.1628f * Sp, + }; +} + +static inline struct RGB ipt2rgb(struct IPT c, struct gamut gamut) +{ + const float Lp = c.I + 0.0975689f * c.P + 0.205226f * c.T; + const float Mp = c.I - 0.1138760f * c.P + 0.133217f * c.T; + const float Sp = c.I + 0.0326151f * c.P - 0.676887f * c.T; + const float L = pq_eotf(Lp); + const float M = pq_eotf(Mp); + const float S = pq_eotf(Sp); + return (struct RGB) { + .R = gamut.lms2rgb.m[0][0] * L + + gamut.lms2rgb.m[0][1] * M + + gamut.lms2rgb.m[0][2] * S, + .G = gamut.lms2rgb.m[1][0] * L + + gamut.lms2rgb.m[1][1] * M + + gamut.lms2rgb.m[1][2] * S, + .B = gamut.lms2rgb.m[2][0] * L + + gamut.lms2rgb.m[2][1] * M + + gamut.lms2rgb.m[2][2] * S, + }; +} + +static inline bool ingamut(struct IPT c, struct gamut gamut) +{ + const float Lp = c.I + 0.0975689f * c.P + 0.205226f * c.T; + const float Mp = c.I - 0.1138760f * c.P + 0.133217f * c.T; + const float Sp = c.I + 0.0326151f * c.P - 0.676887f * c.T; + if (Lp < gamut.min_luma || Lp > gamut.max_luma || + Mp < gamut.min_luma || Mp > gamut.max_luma || + Sp < gamut.min_luma || Sp > gamut.max_luma) + { + // Early exit for 
values outside legal LMS range + return false; + } + + const float L = pq_eotf(Lp); + const float M = pq_eotf(Mp); + const float S = pq_eotf(Sp); + struct RGB rgb = { + .R = gamut.lms2rgb.m[0][0] * L + + gamut.lms2rgb.m[0][1] * M + + gamut.lms2rgb.m[0][2] * S, + .G = gamut.lms2rgb.m[1][0] * L + + gamut.lms2rgb.m[1][1] * M + + gamut.lms2rgb.m[1][2] * S, + .B = gamut.lms2rgb.m[2][0] * L + + gamut.lms2rgb.m[2][1] * M + + gamut.lms2rgb.m[2][2] * S, + }; + return rgb.R >= gamut.min_rgb && rgb.R <= gamut.max_rgb && + rgb.G >= gamut.min_rgb && rgb.G <= gamut.max_rgb && + rgb.B >= gamut.min_rgb && rgb.B <= gamut.max_rgb; +} + +struct generate_args { + const struct pl_gamut_map_params *params; + float *out; + int start; + int count; +}; + +static PL_THREAD_VOID generate(void *priv) +{ + const struct generate_args *args = priv; + const struct pl_gamut_map_params *params = args->params; + + float *in = args->out; + const int end = args->start + args->count; + for (int h = args->start; h < end; h++) { + for (int C = 0; C < params->lut_size_C; C++) { + for (int I = 0; I < params->lut_size_I; I++) { + float Ix = (float) I / (params->lut_size_I - 1); + float Cx = (float) C / (params->lut_size_C - 1); + float hx = (float) h / (params->lut_size_h - 1); + struct IPT ipt = ich2ipt((struct ICh) { + .I = PL_MIX(params->min_luma, params->max_luma, Ix), + .C = PL_MIX(0.0f, 0.5f, Cx), + .h = PL_MIX(-M_PI, M_PI, hx), + }); + in[0] = ipt.I; + in[1] = ipt.P; + in[2] = ipt.T; + in += params->lut_stride; + } + } + } + + struct pl_gamut_map_params fixed = *params; + fix_constants(&fixed.constants); + fixed.lut_size_h = args->count; + FUN(params).map(args->out, &fixed); + PL_THREAD_RETURN(); +} + +void pl_gamut_map_generate(float *out, const struct pl_gamut_map_params *params) +{ + enum { MAX_WORKERS = 32 }; + struct generate_args args[MAX_WORKERS]; + + const int num_per_worker = PL_DIV_UP(params->lut_size_h, MAX_WORKERS); + const int num_workers = PL_DIV_UP(params->lut_size_h, num_per_worker); + for (int i = 0; i < num_workers; i++) { + const int start = i * num_per_worker; + const int count = PL_MIN(num_per_worker, params->lut_size_h - start); + args[i] = (struct generate_args) { + .params = params, + .out = out, + .start = start, + .count = count, + }; + out += count * params->lut_size_C * params->lut_size_I * params->lut_stride; + } + + pl_thread workers[MAX_WORKERS] = {0}; + for (int i = 0; i < num_workers; i++) { + if (pl_thread_create(&workers[i], generate, &args[i]) != 0) + generate(&args[i]); // fallback + } + + for (int i = 0; i < num_workers; i++) { + if (!workers[i]) + continue; + if (pl_thread_join(workers[i]) != 0) + generate(&args[i]); // fallback + } +} + +void pl_gamut_map_sample(float x[3], const struct pl_gamut_map_params *params) +{ + struct pl_gamut_map_params fixed = *params; + fix_constants(&fixed.constants); + fixed.lut_size_I = fixed.lut_size_C = fixed.lut_size_h = 1; + fixed.lut_stride = 3; + + FUN(params).map(x, &fixed); +} + +#define LUT_SIZE(p) (p->lut_size_I * p->lut_size_C * p->lut_size_h * p->lut_stride) +#define FOREACH_LUT(lut, C) \ + for (struct IPT *_i = (struct IPT *) lut, \ + *_end = (struct IPT *) (lut + LUT_SIZE(params)), \ + C; \ + _i < _end && ( C = *_i, 1 ); \ + *_i = C, _i = (struct IPT *) ((float *) _i + params->lut_stride)) + +// Something like PL_MIX(base, c, x) but follows an exponential curve, note +// that this can be used to extend 'c' outwards for x > 1 +static inline struct ICh mix_exp(struct ICh c, float x, float gamma, float base) +{ + return (struct ICh) { + .I = 
base + (c.I - base) * powf(x, gamma), + .C = c.C * x, + .h = c.h, + }; +} + +// Drop gamma for colors approaching black and achromatic to avoid numerical +// instabilities, and excessive brightness boosting of grain, while also +// strongly boosting gamma for values exceeding the target peak +static inline float scale_gamma(float gamma, struct ICh ich, struct ICh peak, + struct gamut gamut) +{ + const float Imin = gamut.min_luma; + const float Irel = fmaxf((ich.I - Imin) / (peak.I - Imin), 0.0f); + return gamma * powf(Irel, 3) * fminf(ich.C / peak.C, 1.0f); +} + +static const float maxDelta = 5e-5f; + +// Find gamut intersection using specified bounds +static inline struct ICh +desat_bounded(float I, float h, float Cmin, float Cmax, struct gamut gamut) +{ + if (I <= gamut.min_luma) + return (struct ICh) { .I = gamut.min_luma, .C = 0, .h = h }; + if (I >= gamut.max_luma) + return (struct ICh) { .I = gamut.max_luma, .C = 0, .h = h }; + + const float maxDI = I * maxDelta; + struct ICh res = { .I = I, .C = (Cmin + Cmax) / 2, .h = h }; + do { + if (ingamut(ich2ipt(res), gamut)) { + Cmin = res.C; + } else { + Cmax = res.C; + } + res.C = (Cmin + Cmax) / 2; + } while (Cmax - Cmin > maxDI); + + return res; +} + +// Finds maximally saturated in-gamut color (for given hue) +static inline struct ICh saturate(float hue, struct gamut gamut) +{ + if (gamut.peak_cache->I && fabsf(gamut.peak_cache->h - hue) < 1e-3) + return *gamut.peak_cache; + + static const float invphi = 0.6180339887498948f; + static const float invphi2 = 0.38196601125010515f; + + struct ICh lo = { .I = gamut.min_luma, .h = hue }; + struct ICh hi = { .I = gamut.max_luma, .h = hue }; + float de = hi.I - lo.I; + struct ICh a = { .I = lo.I + invphi2 * de }; + struct ICh b = { .I = lo.I + invphi * de }; + a = desat_bounded(a.I, hue, 0.0f, 0.5f, gamut); + b = desat_bounded(b.I, hue, 0.0f, 0.5f, gamut); + + while (de > maxDelta) { + de *= invphi; + if (a.C > b.C) { + hi = b; + b = a; + a.I = lo.I + invphi2 * de; + a = desat_bounded(a.I, hue, lo.C - maxDelta, 0.5f, gamut); + } else { + lo = a; + a = b; + b.I = lo.I + invphi * de; + b = desat_bounded(b.I, hue, hi.C - maxDelta, 0.5f, gamut); + } + } + + struct ICh peak = a.C > b.C ? 
a : b; + *gamut.peak_cache = peak; + return peak; +} + +// Clip a color along the exponential curve given by `gamma` +static inline struct IPT +clip_gamma(struct IPT ipt, float gamma, struct gamut gamut) +{ + if (ipt.I <= gamut.min_luma) + return (struct IPT) { .I = gamut.min_luma }; + if (ingamut(ipt, gamut)) + return ipt; + + struct ICh ich = ipt2ich(ipt); + if (!gamma) + return ich2ipt(desat_bounded(ich.I, ich.h, 0.0f, ich.C, gamut)); + + const float maxDI = fmaxf(ich.I * maxDelta, 1e-7f); + struct ICh peak = saturate(ich.h, gamut); + gamma = scale_gamma(gamma, ich, peak, gamut); + float lo = 0.0f, hi = 1.0f, x = 0.5f; + do { + struct ICh test = mix_exp(ich, x, gamma, peak.I); + if (ingamut(ich2ipt(test), gamut)) { + lo = x; + } else { + hi = x; + } + x = (lo + hi) / 2.0f; + } while (hi - lo > maxDI); + + return ich2ipt(mix_exp(ich, x, gamma, peak.I)); +} + +static float softclip(float value, float source, float target, + const struct pl_gamut_map_constants *c) +{ + if (!target) + return 0.0f; + const float peak = source / target; + const float x = fminf(value / target, peak); + const float j = c->softclip_knee; + if (x <= j || peak <= 1.0) + return value; + // Apply simple mobius function + const float a = -j*j * (peak - 1.0f) / (j*j - 2.0f * j + peak); + const float b = (j*j - 2.0f * j * peak + peak) / + fmaxf(1e-6f, peak - 1.0f); + const float scale = (b*b + 2.0f * b*j + j*j) / (b - a); + return scale * (x + a) / (x + b) * target; +} + +static int cmp_float(const void *a, const void *b) +{ + float fa = *(const float*) a; + float fb = *(const float*) b; + return PL_CMP(fa, fb); +} + +static float wrap(float h) +{ + if (h > M_PI) { + return h - 2 * M_PI; + } else if (h < -M_PI) { + return h + 2 * M_PI; + } else { + return h; + } +} + +enum { + S = 12, // number of hue shift vertices + N = S + 2, // +2 for the endpoints +}; + +// Hue-shift helper struct +struct hueshift { + float dh[N]; + float dddh[N]; + float K[N]; + float prev_hue; + float prev_shift; + struct { float hue, delta; } hueshift[N]; +}; + +static void hueshift_prepare(struct hueshift *s, struct gamut src, struct gamut dst) +{ + const float O = pq_eotf(src.min_luma), X = pq_eotf(src.max_luma); + const float M = (O + X) / 2.0f; + const struct RGB refpoints[S] = { + {X, O, O}, {O, X, O}, {O, O, X}, + {O, X, X}, {X, O, X}, {X, X, O}, + {O, X, M}, {X, O, M}, {X, M, O}, + {O, M, X}, {M, O, X}, {M, X, O}, + }; + + memset(s, 0, sizeof(*s)); + for (int i = 0; i < S; i++) { + struct ICh ich_src = ipt2ich(rgb2ipt(refpoints[i], src)); + struct ICh ich_dst = ipt2ich(rgb2ipt(refpoints[i], dst)); + const float delta = wrap(ich_dst.h - ich_src.h); + s->hueshift[i+1].hue = ich_src.h; + s->hueshift[i+1].delta = delta; + } + + // Sort and wrap endpoints + qsort(s->hueshift + 1, S, sizeof(*s->hueshift), cmp_float); + s->hueshift[0] = s->hueshift[S]; + s->hueshift[S+1] = s->hueshift[1]; + s->hueshift[0].hue -= 2 * M_PI; + s->hueshift[S+1].hue += 2 * M_PI; + + // Construction of cubic spline coefficients + float tmp[N][N] = {0}; + for (int i = N - 1; i > 0; i--) { + s->dh[i-1] = s->hueshift[i].hue - s->hueshift[i-1].hue; + s->dddh[i] = (s->hueshift[i].delta - s->hueshift[i-1].delta) / s->dh[i-1]; + } + for (int i = 1; i < N - 1; i++) { + tmp[i][i] = 2 * (s->dh[i-1] + s->dh[i]); + if (i != 1) + tmp[i][i-1] = tmp[i-1][i] = s->dh[i-1]; + tmp[i][N-1] = 6 * (s->dddh[i+1] - s->dddh[i]); + } + for (int i = 1; i < N - 2; i++) { + const float q = (tmp[i+1][i] / tmp[i][i]); + for (int j = 1; j <= N - 1; j++) + tmp[i+1][j] -= q * tmp[i][j]; + } + for 
(int i = N - 2; i > 0; i--) { + float sum = 0.0f; + for (int j = i; j <= N - 2; j++) + sum += tmp[i][j] * s->K[j]; + s->K[i] = (tmp[i][N-1] - sum) / tmp[i][i]; + } + + s->prev_hue = -10.0f; +} + +static struct ICh hueshift_apply(struct hueshift *s, struct ICh ich) +{ + if (fabsf(ich.h - s->prev_hue) < 1e-6f) + goto done; + + // Determine perceptual hue shift delta by interpolation of refpoints + for (int i = 0; i < N - 1; i++) { + if (s->hueshift[i+1].hue > ich.h) { + pl_assert(s->hueshift[i].hue <= ich.h); + float a = (s->K[i+1] - s->K[i]) / (6 * s->dh[i]); + float b = s->K[i] / 2; + float c = s->dddh[i+1] - (2 * s->dh[i] * s->K[i] + s->K[i+1] * s->dh[i]) / 6; + float d = s->hueshift[i].delta; + float x = ich.h - s->hueshift[i].hue; + float delta = ((a * x + b) * x + c) * x + d; + s->prev_shift = ich.h + delta; + s->prev_hue = ich.h; + break; + } + } + +done: + return (struct ICh) { + .I = ich.I, + .C = ich.C, + .h = s->prev_shift, + }; +} + +static void perceptual(float *lut, const struct pl_gamut_map_params *params) +{ + const struct pl_gamut_map_constants *c = ¶ms->constants; + struct cache cache; + struct gamut dst, src; + get_gamuts(&dst, &src, &cache, params); + + FOREACH_LUT(lut, ipt) { + struct ICh ich = ipt2ich(ipt); + struct ICh src_peak = saturate(ich.h, src); + struct ICh dst_peak = saturate(ich.h, dst); + struct IPT mapped = rgb2ipt(ipt2rgb(ipt, src), dst); + + // Protect in gamut region + const float maxC = fmaxf(src_peak.C, dst_peak.C); + float k = pl_smoothstep(c->perceptual_deadzone, 1.0f, ich.C / maxC); + k *= c->perceptual_strength; + ipt.I = PL_MIX(ipt.I, mapped.I, k); + ipt.P = PL_MIX(ipt.P, mapped.P, k); + ipt.T = PL_MIX(ipt.T, mapped.T, k); + + struct RGB rgb = ipt2rgb(ipt, dst); + const float maxRGB = fmaxf(rgb.R, fmaxf(rgb.G, rgb.B)); + rgb.R = fmaxf(softclip(rgb.R, maxRGB, dst.max_rgb, c), dst.min_rgb); + rgb.G = fmaxf(softclip(rgb.G, maxRGB, dst.max_rgb, c), dst.min_rgb); + rgb.B = fmaxf(softclip(rgb.B, maxRGB, dst.max_rgb, c), dst.min_rgb); + ipt = rgb2ipt(rgb, dst); + } +} + +const struct pl_gamut_map_function pl_gamut_map_perceptual = { + .name = "perceptual", + .description = "Perceptual mapping", + .bidirectional = true, + .map = perceptual, +}; + +static void softclip_map(float *lut, const struct pl_gamut_map_params *params) +{ + const struct pl_gamut_map_constants *c = ¶ms->constants; + + // Separate cache after hueshift, because this invalidates previous cache + struct cache cache_pre, cache_post; + struct gamut dst_pre, src_pre, src_post, dst_post; + struct hueshift hueshift; + get_gamuts(&dst_pre, &src_pre, &cache_pre, params); + get_gamuts(&dst_post, &src_post, &cache_post, params); + hueshift_prepare(&hueshift, src_pre, dst_pre); + + FOREACH_LUT(lut, ipt) { + struct gamut src = src_pre; + struct gamut dst = dst_pre; + + if (ipt.I <= dst.min_luma) { + ipt.P = ipt.T = 0.0f; + continue; + } + + struct ICh ich = ipt2ich(ipt); + if (ich.C <= 1e-2f) + continue; // Fast path for achromatic colors + + float margin = 1.0f; + struct ICh shifted = hueshift_apply(&hueshift, ich); + if (fabsf(shifted.h - ich.h) >= 1e-3f) { + struct ICh src_border = desat_bounded(ich.I, ich.h, 0.0f, 0.5f, src); + struct ICh dst_border = desat_bounded(ich.I, ich.h, 0.0f, 0.5f, dst); + const float k = pl_smoothstep(dst_border.C * c->softclip_knee, + src_border.C, ich.C); + ich.h = PL_MIX(ich.h, shifted.h, k); + src = src_post; + dst = dst_post; + + // Expand/contract chromaticity margin to correspond to the altered + // size of the hue leaf after applying the hue delta + struct ICh 
shift_border = desat_bounded(ich.I, ich.h, 0.0f, 0.5f, src); + margin *= fmaxf(1.0f, src_border.C / shift_border.C); + } + + // Determine intersections with source and target gamuts, and + // apply softclip to the chromaticity + struct ICh source = saturate(ich.h, src); + struct ICh target = saturate(ich.h, dst); + struct ICh border = desat_bounded(ich.I, ich.h, 0.0f, target.C, dst); + const float chromaticity = PL_MIX(target.C, border.C, c->softclip_desat); + ich.C = softclip(ich.C, margin * source.C, chromaticity, c); + + // Soft-clip the resulting RGB color. This will generally distort + // hues slightly, but hopefully in an aesthetically pleasing way. + struct ICh saturated = { ich.I, chromaticity, ich.h }; + struct RGB peak = ipt2rgb(ich2ipt(saturated), dst); + struct RGB rgb = ipt2rgb(ich2ipt(ich), dst); + rgb.R = fmaxf(softclip(rgb.R, peak.R, dst.max_rgb, c), dst.min_rgb); + rgb.G = fmaxf(softclip(rgb.G, peak.G, dst.max_rgb, c), dst.min_rgb); + rgb.B = fmaxf(softclip(rgb.B, peak.B, dst.max_rgb, c), dst.min_rgb); + ipt = rgb2ipt(rgb, dst); + } +} + +const struct pl_gamut_map_function pl_gamut_map_softclip = { + .name = "softclip", + .description = "Soft clipping", + .map = softclip_map, +}; + +static void relative(float *lut, const struct pl_gamut_map_params *params) +{ + const struct pl_gamut_map_constants *c = ¶ms->constants; + struct cache cache; + struct gamut dst; + get_gamuts(&dst, NULL, &cache, params); + + FOREACH_LUT(lut, ipt) + ipt = clip_gamma(ipt, c->colorimetric_gamma, dst); +} + +const struct pl_gamut_map_function pl_gamut_map_relative = { + .name = "relative", + .description = "Colorimetric clip", + .map = relative, +}; + +static void desaturate(float *lut, const struct pl_gamut_map_params *params) +{ + struct cache cache; + struct gamut dst; + get_gamuts(&dst, NULL, &cache, params); + + FOREACH_LUT(lut, ipt) + ipt = clip_gamma(ipt, 0.0f, dst); +} + +const struct pl_gamut_map_function pl_gamut_map_desaturate = { + .name = "desaturate", + .description = "Desaturating clip", + .map = desaturate, +}; + +static void saturation(float *lut, const struct pl_gamut_map_params *params) +{ + struct cache cache; + struct gamut dst, src; + get_gamuts(&dst, &src, &cache, params); + + FOREACH_LUT(lut, ipt) + ipt = rgb2ipt(ipt2rgb(ipt, src), dst); +} + +const struct pl_gamut_map_function pl_gamut_map_saturation = { + .name = "saturation", + .description = "Saturation mapping", + .bidirectional = true, + .map = saturation, +}; + +static void absolute(float *lut, const struct pl_gamut_map_params *params) +{ + const struct pl_gamut_map_constants *c = ¶ms->constants; + struct cache cache; + struct gamut dst; + get_gamuts(&dst, NULL, &cache, params); + pl_matrix3x3 m = pl_get_adaptation_matrix(params->output_gamut.white, + params->input_gamut.white); + + FOREACH_LUT(lut, ipt) { + struct RGB rgb = ipt2rgb(ipt, dst); + pl_matrix3x3_apply(&m, (float *) &rgb); + ipt = rgb2ipt(rgb, dst); + ipt = clip_gamma(ipt, c->colorimetric_gamma, dst); + } +} + +const struct pl_gamut_map_function pl_gamut_map_absolute = { + .name = "absolute", + .description = "Absolute colorimetric clip", + .map = absolute, +}; + +static void highlight(float *lut, const struct pl_gamut_map_params *params) +{ + struct cache cache; + struct gamut dst; + get_gamuts(&dst, NULL, &cache, params); + + FOREACH_LUT(lut, ipt) { + if (!ingamut(ipt, dst)) { + ipt.I = fminf(ipt.I + 0.1f, 1.0f); + ipt.P = fclampf(-1.2f * ipt.P, -0.5f, 0.5f); + ipt.T = fclampf(-1.2f * ipt.T, -0.5f, 0.5f); + } + } +} + +const struct pl_gamut_map_function 
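For orientation, a sketch of running a single color through one of the strategies above via pl_gamut_map_sample() (illustrative values only; the IPT sample and luma bounds are invented, and the constants are left zeroed because the desaturate strategy does not read them):

    struct pl_gamut_map_params p = {
        .function     = &pl_gamut_map_desaturate,
        .input_gamut  = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020),
        .output_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709),
        .min_luma     = 0.0f,   // PQ-encoded black
        .max_luma     = 0.58f,  // PQ-encoded, roughly 203 nits
    };
    float color[3] = { 0.5f, 0.25f, -0.10f };  // I, P, T
    pl_gamut_map_sample(color, &p);            // clipped into the BT.709 gamut in place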
pl_gamut_map_highlight = { + .name = "highlight", + .description = "Highlight out-of-gamut pixels", + .map = highlight, +}; + +static void linear(float *lut, const struct pl_gamut_map_params *params) +{ + struct cache cache; + struct gamut dst, src; + get_gamuts(&dst, &src, &cache, params); + + float gain = 1.0f; + for (float hue = -M_PI; hue < M_PI; hue += 0.1f) + gain = fminf(gain, saturate(hue, dst).C / saturate(hue, src).C); + + FOREACH_LUT(lut, ipt) { + struct ICh ich = ipt2ich(ipt); + ich.C *= gain; + ipt = ich2ipt(ich); + } +} + +const struct pl_gamut_map_function pl_gamut_map_linear = { + .name = "linear", + .description = "Linear desaturate", + .map = linear, +}; + +static void darken(float *lut, const struct pl_gamut_map_params *params) +{ + const struct pl_gamut_map_constants *c = ¶ms->constants; + struct cache cache; + struct gamut dst, src; + get_gamuts(&dst, &src, &cache, params); + + static const struct RGB points[6] = { + {1, 0, 0}, {0, 1, 0}, {0, 0, 1}, + {0, 1, 1}, {1, 0, 1}, {1, 1, 0}, + }; + + float gain = 1.0f; + for (int i = 0; i < PL_ARRAY_SIZE(points); i++) { + const struct RGB p = ipt2rgb(rgb2ipt(points[i], src), dst); + const float maxRGB = PL_MAX3(p.R, p.G, p.B); + gain = fminf(gain, 1.0 / maxRGB); + } + + FOREACH_LUT(lut, ipt) { + struct RGB rgb = ipt2rgb(ipt, dst); + rgb.R *= gain; + rgb.G *= gain; + rgb.B *= gain; + ipt = rgb2ipt(rgb, dst); + ipt = clip_gamma(ipt, c->colorimetric_gamma, dst); + } +} + +const struct pl_gamut_map_function pl_gamut_map_darken = { + .name = "darken", + .description = "Darken and clip", + .map = darken, +}; + +static void noop(float *lut, const struct pl_gamut_map_params *params) +{ + return; +} + +const struct pl_gamut_map_function pl_gamut_map_clip = { + .name = "clip", + .description = "No gamut mapping (hard clip)", + .map = noop, +}; + +const struct pl_gamut_map_function * const pl_gamut_map_functions[] = { + &pl_gamut_map_clip, + &pl_gamut_map_perceptual, + &pl_gamut_map_softclip, + &pl_gamut_map_relative, + &pl_gamut_map_saturation, + &pl_gamut_map_absolute, + &pl_gamut_map_desaturate, + &pl_gamut_map_darken, + &pl_gamut_map_highlight, + &pl_gamut_map_linear, + NULL +}; + +const int pl_num_gamut_map_functions = PL_ARRAY_SIZE(pl_gamut_map_functions) - 1; + +const struct pl_gamut_map_function *pl_find_gamut_map_function(const char *name) +{ + for (int i = 0; i < pl_num_gamut_map_functions; i++) { + if (strcmp(name, pl_gamut_map_functions[i]->name) == 0) + return pl_gamut_map_functions[i]; + } + + return NULL; +} diff --git a/src/glsl/glslang.cc b/src/glsl/glslang.cc new file mode 100644 index 0000000..2bc923c --- /dev/null +++ b/src/glsl/glslang.cc @@ -0,0 +1,121 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "config_internal.h" + +#include <assert.h> + +extern "C" { +#include "pl_alloc.h" +#include "pl_thread.h" +} + +#include <glslang/Public/ShaderLang.h> +#include <glslang/SPIRV/GlslangToSpv.h> +#include <glslang/build_info.h> + +#include "glslang.h" + +#if (GLSLANG_VERSION_MAJOR * 1000 + GLSLANG_VERSION_MINOR) >= 11013 +#include <glslang/Public/ResourceLimits.h> +#define DefaultTBuiltInResource *GetDefaultResources() +#endif + +using namespace glslang; + +static pl_static_mutex pl_glslang_mutex = PL_STATIC_MUTEX_INITIALIZER; +static int pl_glslang_refcount; + +bool pl_glslang_init(void) +{ + bool ret = true; + + pl_static_mutex_lock(&pl_glslang_mutex); + if (pl_glslang_refcount++ == 0) + ret = InitializeProcess(); + pl_static_mutex_unlock(&pl_glslang_mutex); + + return ret; +} + +void pl_glslang_uninit(void) +{ + pl_static_mutex_lock(&pl_glslang_mutex); + if (--pl_glslang_refcount == 0) + FinalizeProcess(); + pl_static_mutex_unlock(&pl_glslang_mutex); +} + +struct pl_glslang_res *pl_glslang_compile(struct pl_glsl_version glsl_ver, + struct pl_spirv_version spirv_ver, + enum glsl_shader_stage stage, + const char *text) +{ + assert(pl_glslang_refcount); + struct pl_glslang_res *res = pl_zalloc_ptr(NULL, res); + + EShLanguage lang; + switch (stage) { + case GLSL_SHADER_VERTEX: lang = EShLangVertex; break; + case GLSL_SHADER_FRAGMENT: lang = EShLangFragment; break; + case GLSL_SHADER_COMPUTE: lang = EShLangCompute; break; + default: abort(); + } + + TShader *shader = new TShader(lang); + + shader->setEnvClient(EShClientVulkan, (EShTargetClientVersion) spirv_ver.env_version); + shader->setEnvTarget(EShTargetSpv, (EShTargetLanguageVersion) spirv_ver.spv_version); + shader->setStrings(&text, 1); + + TBuiltInResource limits = DefaultTBuiltInResource; + limits.maxComputeWorkGroupSizeX = glsl_ver.max_group_size[0]; + limits.maxComputeWorkGroupSizeY = glsl_ver.max_group_size[1]; + limits.maxComputeWorkGroupSizeZ = glsl_ver.max_group_size[2]; + limits.minProgramTexelOffset = glsl_ver.min_gather_offset; + limits.maxProgramTexelOffset = glsl_ver.max_gather_offset; + + if (!shader->parse(&limits, 0, true, EShMsgDefault)) { + res->error_msg = pl_str0dup0(res, shader->getInfoLog()); + delete shader; + return res; + } + + TProgram *prog = new TProgram(); + prog->addShader(shader); + if (!prog->link(EShMsgDefault)) { + res->error_msg = pl_str0dup0(res, prog->getInfoLog()); + delete shader; + delete prog; + return res; + } + + SpvOptions options; + options.disableOptimizer = false; + options.stripDebugInfo = true; + options.optimizeSize = true; + options.validate = true; + std::vector<unsigned int> spirv; + GlslangToSpv(*prog->getIntermediate(lang), spirv, &options); + + res->success = true; + res->size = spirv.size() * sizeof(unsigned int); + res->data = pl_memdup(res, spirv.data(), res->size), + delete shader; + delete prog; + return res; +} diff --git a/src/glsl/glslang.h b/src/glsl/glslang.h new file mode 100644 index 0000000..a5965a5 --- /dev/null +++ b/src/glsl/glslang.h @@ -0,0 +1,57 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdlib.h> +#include <stdbool.h> + +typedef struct TLimits TLimits; +typedef struct TBuiltInResource TBuiltInResource; +#include <glslang/Include/ResourceLimits.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#include "utils.h" + +bool pl_glslang_init(void); +void pl_glslang_uninit(void); + +struct pl_glslang_res { + // Compilation status + bool success; + const char *error_msg; + + // Compiled shader memory, or NULL + void *data; + size_t size; +}; + +// Compile GLSL into a SPIRV stream, if possible. The resulting +// pl_glslang_res can simply be freed with pl_free() when done. +struct pl_glslang_res *pl_glslang_compile(struct pl_glsl_version glsl_ver, + struct pl_spirv_version spirv_ver, + enum glsl_shader_stage stage, + const char *shader); + +extern const TBuiltInResource DefaultTBuiltInResource; + +#ifdef __cplusplus +} +#endif diff --git a/src/glsl/glslang_resources.c b/src/glsl/glslang_resources.c new file mode 100644 index 0000000..a111c15 --- /dev/null +++ b/src/glsl/glslang_resources.c @@ -0,0 +1,132 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "glslang.h" + +// Taken from glslang's examples, which apparently generally bases the choices +// on OpenGL specification limits +// +// Note: This lives in a separate file so we can compile this struct using C99 +// designated initializers instead of using C++ struct initializers, because +// the latter will break on every upstream struct extension. 
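To make the point above concrete, a tiny sketch with a made-up struct (not glslang's): designated initializers pin each value to a field name, so a member added or reordered upstream cannot silently shift later values the way positional aggregate initialization can.

    struct limits { int maxLights; /* upstream may insert fields here */ int maxSamples; };
    struct limits by_name = { .maxLights = 32, .maxSamples = 4 };  // stays correct
    struct limits by_pos  = { 32, 4 };  // silently wrong if a field is inserted in between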
+const TBuiltInResource DefaultTBuiltInResource = { + .maxLights = 32, + .maxClipPlanes = 6, + .maxTextureUnits = 32, + .maxTextureCoords = 32, + .maxVertexAttribs = 64, + .maxVertexUniformComponents = 4096, + .maxVaryingFloats = 64, + .maxVertexTextureImageUnits = 32, + .maxCombinedTextureImageUnits = 80, + .maxTextureImageUnits = 32, + .maxFragmentUniformComponents = 4096, + .maxDrawBuffers = 32, + .maxVertexUniformVectors = 128, + .maxVaryingVectors = 8, + .maxFragmentUniformVectors = 16, + .maxVertexOutputVectors = 16, + .maxFragmentInputVectors = 15, + .minProgramTexelOffset = -8, + .maxProgramTexelOffset = 7, + .maxClipDistances = 8, + .maxComputeWorkGroupCountX = 65535, + .maxComputeWorkGroupCountY = 65535, + .maxComputeWorkGroupCountZ = 65535, + .maxComputeWorkGroupSizeX = 1024, + .maxComputeWorkGroupSizeY = 1024, + .maxComputeWorkGroupSizeZ = 64, + .maxComputeUniformComponents = 1024, + .maxComputeTextureImageUnits = 16, + .maxComputeImageUniforms = 8, + .maxComputeAtomicCounters = 8, + .maxComputeAtomicCounterBuffers = 1, + .maxVaryingComponents = 60, + .maxVertexOutputComponents = 64, + .maxGeometryInputComponents = 64, + .maxGeometryOutputComponents = 128, + .maxFragmentInputComponents = 128, + .maxImageUnits = 8, + .maxCombinedImageUnitsAndFragmentOutputs = 8, + .maxCombinedShaderOutputResources = 8, + .maxImageSamples = 0, + .maxVertexImageUniforms = 0, + .maxTessControlImageUniforms = 0, + .maxTessEvaluationImageUniforms = 0, + .maxGeometryImageUniforms = 0, + .maxFragmentImageUniforms = 8, + .maxCombinedImageUniforms = 8, + .maxGeometryTextureImageUnits = 16, + .maxGeometryOutputVertices = 256, + .maxGeometryTotalOutputComponents = 1024, + .maxGeometryUniformComponents = 1024, + .maxGeometryVaryingComponents = 64, + .maxTessControlInputComponents = 128, + .maxTessControlOutputComponents = 128, + .maxTessControlTextureImageUnits = 16, + .maxTessControlUniformComponents = 1024, + .maxTessControlTotalOutputComponents = 4096, + .maxTessEvaluationInputComponents = 128, + .maxTessEvaluationOutputComponents = 128, + .maxTessEvaluationTextureImageUnits = 16, + .maxTessEvaluationUniformComponents = 1024, + .maxTessPatchComponents = 120, + .maxPatchVertices = 32, + .maxTessGenLevel = 64, + .maxViewports = 16, + .maxVertexAtomicCounters = 0, + .maxTessControlAtomicCounters = 0, + .maxTessEvaluationAtomicCounters = 0, + .maxGeometryAtomicCounters = 0, + .maxFragmentAtomicCounters = 8, + .maxCombinedAtomicCounters = 8, + .maxAtomicCounterBindings = 1, + .maxVertexAtomicCounterBuffers = 0, + .maxTessControlAtomicCounterBuffers = 0, + .maxTessEvaluationAtomicCounterBuffers = 0, + .maxGeometryAtomicCounterBuffers = 0, + .maxFragmentAtomicCounterBuffers = 1, + .maxCombinedAtomicCounterBuffers = 1, + .maxAtomicCounterBufferSize = 16384, + .maxTransformFeedbackBuffers = 4, + .maxTransformFeedbackInterleavedComponents = 64, + .maxCullDistances = 8, + .maxCombinedClipAndCullDistances = 8, + .maxSamples = 4, + .maxMeshOutputVerticesNV = 256, + .maxMeshOutputPrimitivesNV = 512, + .maxMeshWorkGroupSizeX_NV = 32, + .maxMeshWorkGroupSizeY_NV = 1, + .maxMeshWorkGroupSizeZ_NV = 1, + .maxTaskWorkGroupSizeX_NV = 32, + .maxTaskWorkGroupSizeY_NV = 1, + .maxTaskWorkGroupSizeZ_NV = 1, + .maxMeshViewCountNV = 4, + .maxDualSourceDrawBuffersEXT = 1, + + .limits = { + .nonInductiveForLoops = 1, + .whileLoops = 1, + .doWhileLoops = 1, + .generalUniformIndexing = 1, + .generalAttributeMatrixVectorIndexing = 1, + .generalVaryingIndexing = 1, + .generalSamplerIndexing = 1, + .generalVariableIndexing = 1, + 
.generalConstantMatrixVectorIndexing = 1, + }, +}; diff --git a/src/glsl/meson.build b/src/glsl/meson.build new file mode 100644 index 0000000..5cebfb8 --- /dev/null +++ b/src/glsl/meson.build @@ -0,0 +1,73 @@ +# shaderc +shaderc = dependency('shaderc', version: '>=2019.1', required: get_option('shaderc')) +components.set('shaderc', shaderc.found()) +if shaderc.found() + build_deps += shaderc + sources += 'glsl/spirv_shaderc.c' +endif + +# glslang +glslang = disabler() +glslang_req = get_option('glslang') +if glslang_req.auto() and shaderc.found() + + # we only need one or the other, and shaderc is preferred + message('Skipping `glslang` because `shaderc` is available') + +elif not glslang_req.disabled() + + glslang_deps = [ + cxx.find_library('glslang-default-resource-limits', required: false) + ] + + # meson doesn't respect generator expressions in INTERFACE_LINK_LIBRARIES + # https://github.com/mesonbuild/meson/issues/8232 + # TODO: Use the following once it's fixed + # glslang = dependency('glslang', method: 'cmake', modules: ['glslang::SPIRV']) + + prefer_static = get_option('prefer_static') + found_lib = false + foreach arg : [[prefer_static, false], [not prefer_static, glslang_req]] + static = arg[0] + required = arg[1] + + spirv = cxx.find_library('SPIRV', required: required, static: static) + + if not spirv.found() + continue + endif + + glslang_deps += spirv + + if static + glslang_deps += [ + # Always required for static linking + cxx.find_library('MachineIndependent', required: true, static: true), + cxx.find_library('OSDependent', required: true, static: true), + cxx.find_library('OGLCompiler', required: true, static: true), + cxx.find_library('GenericCodeGen', required: true, static: true), + # SPIRV-Tools are required only if optimizer is enabled in glslang build + cxx.find_library('SPIRV-Tools', required: false, static: true), + cxx.find_library('SPIRV-Tools-opt', required: false, static: true), + ] + endif + + found_lib = true + break + endforeach + + if found_lib and cc.has_header('glslang/build_info.h') + glslang = declare_dependency(dependencies: glslang_deps) + endif + +endif + +components.set('glslang', glslang.found()) +if glslang.found() + build_deps += glslang + sources += [ + 'glsl/glslang.cc', + 'glsl/glslang_resources.c', + 'glsl/spirv_glslang.c', + ] +endif diff --git a/src/glsl/spirv.c b/src/glsl/spirv.c new file mode 100644 index 0000000..8317ed7 --- /dev/null +++ b/src/glsl/spirv.c @@ -0,0 +1,64 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "spirv.h" + +extern const struct spirv_compiler pl_spirv_shaderc; +extern const struct spirv_compiler pl_spirv_glslang; + +static const struct spirv_compiler *compilers[] = { +#ifdef PL_HAVE_SHADERC + &pl_spirv_shaderc, +#endif +#ifdef PL_HAVE_GLSLANG + &pl_spirv_glslang, +#endif +}; + +pl_spirv pl_spirv_create(pl_log log, struct pl_spirv_version spirv_ver) +{ + for (int i = 0; i < PL_ARRAY_SIZE(compilers); i++) { + pl_spirv spirv = compilers[i]->create(log, spirv_ver); + if (!spirv) + continue; + + pl_info(log, "Initialized SPIR-V compiler '%s'", compilers[i]->name); + return spirv; + } + + pl_fatal(log, "Failed initializing any SPIR-V compiler! Maybe libplacebo " + "was built without support for either libshaderc or glslang?"); + return NULL; +} + +void pl_spirv_destroy(pl_spirv *pspirv) +{ + pl_spirv spirv = *pspirv; + if (!spirv) + return; + + spirv->impl->destroy(spirv); + *pspirv = NULL; +} + +pl_str pl_spirv_compile_glsl(pl_spirv spirv, void *alloc, + struct pl_glsl_version glsl, + enum glsl_shader_stage stage, + const char *shader) +{ + return spirv->impl->compile(spirv, alloc, glsl, stage, shader); +} diff --git a/src/glsl/spirv.h b/src/glsl/spirv.h new file mode 100644 index 0000000..fa4494a --- /dev/null +++ b/src/glsl/spirv.h @@ -0,0 +1,50 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "log.h" +#include "utils.h" + +typedef const struct pl_spirv_t { + const struct spirv_compiler *impl; + pl_log log; + + // SPIR-V version specified at creation time. + struct pl_spirv_version version; + + // For cache invalidation, should uniquely identify everything about this + // spirv compiler and its configuration. + uint64_t signature; +} *pl_spirv; + +// Initialize a SPIR-V compiler instance, or returns NULL on failure. +pl_spirv pl_spirv_create(pl_log log, struct pl_spirv_version spirv_ver); +void pl_spirv_destroy(pl_spirv *spirv); + +// Compile GLSL to SPIR-V. Returns {0} on failure. +pl_str pl_spirv_compile_glsl(pl_spirv spirv, void *alloc, + struct pl_glsl_version glsl_ver, + enum glsl_shader_stage stage, + const char *shader); + +struct spirv_compiler { + const char *name; + void (*destroy)(pl_spirv spirv); + __typeof__(pl_spirv_create) *create; + __typeof__(pl_spirv_compile_glsl) *compile; +}; diff --git a/src/glsl/spirv_glslang.c b/src/glsl/spirv_glslang.c new file mode 100644 index 0000000..ffb8f55 --- /dev/null +++ b/src/glsl/spirv_glslang.c @@ -0,0 +1,112 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "hash.h" +#include "spirv.h" +#include "utils.h" +#include "glsl/glslang.h" + +// This header contains only preprocessor definitions +#include <glslang/build_info.h> + +// This is awkward, but we cannot use upstream macro, it was fixed in 11.11.0 +#define PL_GLSLANG_VERSION_GREATER_THAN(major, minor, patch) \ + ((GLSLANG_VERSION_MAJOR) > (major) || ((major) == GLSLANG_VERSION_MAJOR && \ + ((GLSLANG_VERSION_MINOR) > (minor) || ((minor) == GLSLANG_VERSION_MINOR && \ + (GLSLANG_VERSION_PATCH) > (patch))))) + +#if PL_GLSLANG_VERSION_GREATER_THAN(11, 8, 0) +#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 6) +#elif PL_GLSLANG_VERSION_GREATER_THAN(7, 13, 3496) +#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 5) +#elif PL_GLSLANG_VERSION_GREATER_THAN(6, 2, 2596) +#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 3) +#else +#define GLSLANG_SPV_MAX PL_SPV_VERSION(1, 0) +#endif + +const struct spirv_compiler pl_spirv_glslang; + +static void glslang_destroy(pl_spirv spirv) +{ + pl_glslang_uninit(); + pl_free((void *) spirv); +} + +static pl_spirv glslang_create(pl_log log, struct pl_spirv_version spirv_ver) +{ + if (!pl_glslang_init()) { + pl_fatal(log, "Failed initializing glslang SPIR-V compiler!"); + return NULL; + } + + struct pl_spirv_t *spirv = pl_alloc_ptr(NULL, spirv); + *spirv = (struct pl_spirv_t) { + .signature = pl_str0_hash(pl_spirv_glslang.name), + .impl = &pl_spirv_glslang, + .version = spirv_ver, + .log = log, + }; + + PL_INFO(spirv, "glslang version: %d.%d.%d", + GLSLANG_VERSION_MAJOR, + GLSLANG_VERSION_MINOR, + GLSLANG_VERSION_PATCH); + + // Clamp to supported version by glslang + if (GLSLANG_SPV_MAX < spirv->version.spv_version) { + spirv->version.spv_version = GLSLANG_SPV_MAX; + spirv->version.env_version = pl_spirv_version_to_vulkan(GLSLANG_SPV_MAX); + } + + pl_hash_merge(&spirv->signature, (uint64_t) spirv->version.spv_version << 32 | + spirv->version.env_version); + pl_hash_merge(&spirv->signature, (GLSLANG_VERSION_MAJOR & 0xFF) << 24 | + (GLSLANG_VERSION_MINOR & 0xFF) << 16 | + (GLSLANG_VERSION_PATCH & 0xFFFF)); + return spirv; +} + +static pl_str glslang_compile(pl_spirv spirv, void *alloc, + struct pl_glsl_version glsl_ver, + enum glsl_shader_stage stage, + const char *shader) +{ + struct pl_glslang_res *res; + + res = pl_glslang_compile(glsl_ver, spirv->version, stage, shader); + if (!res || !res->success) { + PL_ERR(spirv, "glslang failed: %s", res ? res->error_msg : "(null)"); + pl_free(res); + return (struct pl_str) {0}; + } + + struct pl_str ret = { + .buf = pl_steal(alloc, res->data), + .len = res->size, + }; + + pl_free(res); + return ret; +} + +const struct spirv_compiler pl_spirv_glslang = { + .name = "glslang", + .destroy = glslang_destroy, + .create = glslang_create, + .compile = glslang_compile, +}; diff --git a/src/glsl/spirv_shaderc.c b/src/glsl/spirv_shaderc.c new file mode 100644 index 0000000..e384382 --- /dev/null +++ b/src/glsl/spirv_shaderc.c @@ -0,0 +1,174 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdlib.h> +#include <shaderc/shaderc.h> + +#include "hash.h" +#include "spirv.h" +#include "utils.h" + +const struct spirv_compiler pl_spirv_shaderc; + +struct priv { + shaderc_compiler_t compiler; +}; + +static void shaderc_destroy(pl_spirv spirv) +{ + struct priv *p = PL_PRIV(spirv); + shaderc_compiler_release(p->compiler); + pl_free((void *) spirv); +} + +static pl_spirv shaderc_create(pl_log log, struct pl_spirv_version spirv_ver) +{ + struct pl_spirv_t *spirv = pl_alloc_obj(NULL, spirv, struct priv); + *spirv = (struct pl_spirv_t) { + .signature = pl_str0_hash(pl_spirv_shaderc.name), + .impl = &pl_spirv_shaderc, + .version = spirv_ver, + .log = log, + }; + + struct priv *p = PL_PRIV(spirv); + p->compiler = shaderc_compiler_initialize(); + if (!p->compiler) + goto error; + + unsigned int ver = 0, rev = 0; + shaderc_get_spv_version(&ver, &rev); + PL_INFO(spirv, "shaderc SPIR-V version %u.%u rev %u", + ver >> 16, (ver >> 8) & 0xff, rev); + + // Clamp to supported version by shaderc + if (ver < spirv->version.spv_version) { + spirv->version.spv_version = ver; + spirv->version.env_version = pl_spirv_version_to_vulkan(ver); + } + + pl_hash_merge(&spirv->signature, (uint64_t) spirv->version.spv_version << 32 | + spirv->version.env_version); + pl_hash_merge(&spirv->signature, (uint64_t) ver << 32 | rev); + return spirv; + +error: + shaderc_destroy(spirv); + return NULL; +} + +static pl_str shaderc_compile(pl_spirv spirv, void *alloc, + struct pl_glsl_version glsl_ver, + enum glsl_shader_stage stage, + const char *shader) +{ + struct priv *p = PL_PRIV(spirv); + const size_t len = strlen(shader); + + shaderc_compile_options_t opts = shaderc_compile_options_initialize(); + if (!opts) + return (pl_str) {0}; + + shaderc_compile_options_set_optimization_level(opts, + shaderc_optimization_level_performance); + shaderc_compile_options_set_target_spirv(opts, spirv->version.spv_version); + shaderc_compile_options_set_target_env(opts, shaderc_target_env_vulkan, + spirv->version.env_version); + + for (int i = 0; i < 3; i++) { + shaderc_compile_options_set_limit(opts, + shaderc_limit_max_compute_work_group_size_x + i, + glsl_ver.max_group_size[i]); + } + + shaderc_compile_options_set_limit(opts, + shaderc_limit_min_program_texel_offset, + glsl_ver.min_gather_offset); + shaderc_compile_options_set_limit(opts, + shaderc_limit_max_program_texel_offset, + glsl_ver.max_gather_offset); + + static const shaderc_shader_kind kinds[] = { + [GLSL_SHADER_VERTEX] = shaderc_glsl_vertex_shader, + [GLSL_SHADER_FRAGMENT] = shaderc_glsl_fragment_shader, + [GLSL_SHADER_COMPUTE] = shaderc_glsl_compute_shader, + }; + + static const char * const file_name = "input"; + static const char * const entry_point = "main"; + + shaderc_compilation_result_t res; + res = shaderc_compile_into_spv(p->compiler, shader, len, kinds[stage], + file_name, 
entry_point, opts); + + int errs = shaderc_result_get_num_errors(res), + warn = shaderc_result_get_num_warnings(res); + + enum pl_log_level lev = errs ? PL_LOG_ERR : warn ? PL_LOG_INFO : PL_LOG_DEBUG; + + int s = shaderc_result_get_compilation_status(res); + bool success = s == shaderc_compilation_status_success; + if (!success) + lev = PL_LOG_ERR; + + const char *msg = shaderc_result_get_error_message(res); + if (msg[0]) + PL_MSG(spirv, lev, "shaderc output:\n%s", msg); + + static const char *results[] = { + [shaderc_compilation_status_success] = "success", + [shaderc_compilation_status_invalid_stage] = "invalid stage", + [shaderc_compilation_status_compilation_error] = "error", + [shaderc_compilation_status_internal_error] = "internal error", + [shaderc_compilation_status_null_result_object] = "no result", + [shaderc_compilation_status_invalid_assembly] = "invalid assembly", + }; + + const char *status = s < PL_ARRAY_SIZE(results) ? results[s] : "unknown"; + PL_MSG(spirv, lev, "shaderc compile status '%s' (%d errors, %d warnings)", + status, errs, warn); + + pl_str ret = {0}; + if (success) { + void *bytes = (void *) shaderc_result_get_bytes(res); + pl_assert(bytes); + ret.len = shaderc_result_get_length(res); + ret.buf = pl_memdup(alloc, bytes, ret.len); + + if (pl_msg_test(spirv->log, PL_LOG_TRACE)) { + shaderc_compilation_result_t dis; + dis = shaderc_compile_into_spv_assembly(p->compiler, shader, len, + kinds[stage], file_name, + entry_point, opts); + PL_TRACE(spirv, "Generated SPIR-V:\n%.*s", + (int) shaderc_result_get_length(dis), + shaderc_result_get_bytes(dis)); + shaderc_result_release(dis); + } + } + + shaderc_result_release(res); + shaderc_compile_options_release(opts); + return ret; +} + +const struct spirv_compiler pl_spirv_shaderc = { + .name = "shaderc", + .destroy = shaderc_destroy, + .create = shaderc_create, + .compile = shaderc_compile, +}; diff --git a/src/glsl/utils.h b/src/glsl/utils.h new file mode 100644 index 0000000..965ea9e --- /dev/null +++ b/src/glsl/utils.h @@ -0,0 +1,52 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include <stdbool.h> +#include <stdint.h> + +#include <libplacebo/gpu.h> + +#define PL_SPV_VERSION(major, minor) ((major) << 16 | (minor) << 8) +#define PL_VLK_VERSION(major, minor) ((major) << 22 | (minor) << 12) + +// Max version that can be used +#define PL_MAX_SPIRV_VER PL_SPV_VERSION(1, 6) + +struct pl_spirv_version { + uint32_t env_version; + uint32_t spv_version; +}; + +// Returns minimum Vulkan version for given SPIR-V version +static inline uint32_t pl_spirv_version_to_vulkan(uint32_t spirv_ver) +{ + if (spirv_ver >= PL_SPV_VERSION(1, 6)) + return PL_VLK_VERSION(1, 3); + if (spirv_ver >= PL_SPV_VERSION(1, 4)) + return PL_VLK_VERSION(1, 2); + if (spirv_ver >= PL_SPV_VERSION(1, 1)) + return PL_VLK_VERSION(1, 1); + return PL_VLK_VERSION(1, 0); +} + +enum glsl_shader_stage { + GLSL_SHADER_VERTEX = 0, + GLSL_SHADER_FRAGMENT, + GLSL_SHADER_COMPUTE, +}; diff --git a/src/gpu.c b/src/gpu.c new file mode 100644 index 0000000..b639ec2 --- /dev/null +++ b/src/gpu.c @@ -0,0 +1,1338 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "common.h" +#include "gpu.h" + +#define require(expr) pl_require(gpu, expr) + +void pl_gpu_destroy(pl_gpu gpu) +{ + if (!gpu) + return; + + struct pl_gpu_fns *impl = PL_PRIV(gpu); + pl_dispatch_destroy(&impl->dp); + impl->destroy(gpu); +} + +pl_dispatch pl_gpu_dispatch(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->dp; +} + +pl_cache pl_gpu_cache(pl_gpu gpu) +{ + if (!gpu) + return NULL; + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return atomic_load(&impl->cache); +} + +void pl_gpu_set_cache(pl_gpu gpu, pl_cache cache) +{ + struct pl_gpu_fns *impl = PL_PRIV(gpu); + atomic_store(&impl->cache, cache); +} + +bool pl_fmt_is_ordered(pl_fmt fmt) +{ + bool ret = !fmt->opaque; + for (int i = 0; i < fmt->num_components; i++) + ret &= fmt->sample_order[i] == i; + return ret; +} + +bool pl_fmt_is_float(pl_fmt fmt) +{ + switch (fmt->type) { + case PL_FMT_UNKNOWN: // more likely than not + case PL_FMT_FLOAT: + case PL_FMT_UNORM: + case PL_FMT_SNORM: + return true; + + case PL_FMT_UINT: + case PL_FMT_SINT: + return false; + + case PL_FMT_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +bool pl_fmt_has_modifier(pl_fmt fmt, uint64_t modifier) +{ + if (!fmt) + return false; + + for (int i = 0; i < fmt->num_modifiers; i++) { + if (fmt->modifiers[i] == modifier) + return true; + } + + return false; +} + +pl_fmt pl_find_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components, + int min_depth, int host_bits, enum pl_fmt_caps caps) +{ + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + if (fmt->type != type || fmt->num_components != num_components) + continue; + if ((fmt->caps & caps) != caps) + continue; + + // When specifying some particular host representation, ensure the + // format is non-opaque, ordered and 
unpadded + if (host_bits && fmt->opaque) + continue; + if (host_bits && fmt->texel_size * 8 != host_bits * num_components) + continue; + if (host_bits && !pl_fmt_is_ordered(fmt)) + continue; + + for (int i = 0; i < fmt->num_components; i++) { + if (fmt->component_depth[i] < min_depth) + goto next_fmt; + if (host_bits && fmt->host_bits[i] != host_bits) + goto next_fmt; + } + + return fmt; + +next_fmt: ; // equivalent to `continue` + } + + // ran out of formats + PL_TRACE(gpu, "No matching format found"); + return NULL; +} + +pl_fmt pl_find_vertex_fmt(pl_gpu gpu, enum pl_fmt_type type, int comps) +{ + static const size_t sizes[] = { + [PL_FMT_FLOAT] = sizeof(float), + [PL_FMT_UNORM] = sizeof(unsigned), + [PL_FMT_UINT] = sizeof(unsigned), + [PL_FMT_SNORM] = sizeof(int), + [PL_FMT_SINT] = sizeof(int), + }; + + return pl_find_fmt(gpu, type, comps, 0, 8 * sizes[type], PL_FMT_CAP_VERTEX); +} + +pl_fmt pl_find_named_fmt(pl_gpu gpu, const char *name) +{ + if (!name) + return NULL; + + for (int i = 0; i < gpu->num_formats; i++) { + pl_fmt fmt = gpu->formats[i]; + if (strcmp(name, fmt->name) == 0) + return fmt; + } + + // ran out of formats + return NULL; +} + +pl_fmt pl_find_fourcc(pl_gpu gpu, uint32_t fourcc) +{ + if (!fourcc) + return NULL; + + for (int i = 0; i < gpu->num_formats; i++) { + pl_fmt fmt = gpu->formats[i]; + if (fourcc == fmt->fourcc) + return fmt; + } + + // ran out of formats + return NULL; +} + +static inline bool check_mod(pl_gpu gpu, pl_fmt fmt, uint64_t mod) +{ + for (int i = 0; i < fmt->num_modifiers; i++) { + if (fmt->modifiers[i] == mod) + return true; + } + + + PL_ERR(gpu, "DRM modifier %s not available for format %s. Available modifiers:", + PRINT_DRM_MOD(mod), fmt->name); + for (int i = 0; i < fmt->num_modifiers; i++) + PL_ERR(gpu, " %s", PRINT_DRM_MOD(fmt->modifiers[i])); + + return false; +} + +pl_tex pl_tex_create(pl_gpu gpu, const struct pl_tex_params *params) +{ + require(params->format); + require(!params->import_handle || !params->export_handle); + require(!params->import_handle || !params->initial_data); + if (params->export_handle) { + require(params->export_handle & gpu->export_caps.tex); + require(PL_ISPOT(params->export_handle)); + } + if (params->import_handle) { + require(params->import_handle & gpu->import_caps.tex); + require(PL_ISPOT(params->import_handle)); + if (params->import_handle == PL_HANDLE_DMA_BUF) { + if (!check_mod(gpu, params->format, params->shared_mem.drm_format_mod)) + goto error; + if (params->shared_mem.stride_w) + require(params->w && params->shared_mem.stride_w >= params->w); + if (params->shared_mem.stride_h) + require(params->h && params->shared_mem.stride_h >= params->h); + } else if (params->import_handle == PL_HANDLE_MTL_TEX) { + require(params->shared_mem.plane <= 2); + } + } + + switch (pl_tex_params_dimension(*params)) { + case 1: + require(params->w > 0); + require(params->w <= gpu->limits.max_tex_1d_dim); + require(!params->renderable); + require(!params->blit_src || gpu->limits.blittable_1d_3d); + require(!params->blit_dst || gpu->limits.blittable_1d_3d); + require(!params->format->num_planes); + break; + case 2: + require(params->w > 0 && params->h > 0); + require(params->w <= gpu->limits.max_tex_2d_dim); + require(params->h <= gpu->limits.max_tex_2d_dim); + break; + case 3: + require(params->w > 0 && params->h > 0 && params->d > 0); + require(params->w <= gpu->limits.max_tex_3d_dim); + require(params->h <= gpu->limits.max_tex_3d_dim); + require(params->d <= gpu->limits.max_tex_3d_dim); + require(!params->renderable); + 
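        // As with 1D textures: 3D textures cannot be render targets, and may
        // only be blit sources/destinations if the GPU reports blittable_1d_3d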
require(!params->blit_src || gpu->limits.blittable_1d_3d); + require(!params->blit_dst || gpu->limits.blittable_1d_3d); + require(!params->format->num_planes); + break; + } + + enum pl_fmt_caps fmt_caps = params->format->caps; + bool fmt_opaque = params->format->opaque; + for (int i = 0; i < params->format->num_planes; i++) { + pl_fmt pfmt = params->format->planes[i].format; + fmt_caps |= pfmt->caps; + fmt_opaque &= pfmt->opaque; + } + + require(!params->host_readable || fmt_caps & PL_FMT_CAP_HOST_READABLE); + require(!params->host_writable || !fmt_opaque); + require(!params->sampleable || fmt_caps & PL_FMT_CAP_SAMPLEABLE); + require(!params->renderable || fmt_caps & PL_FMT_CAP_RENDERABLE); + require(!params->storable || fmt_caps & PL_FMT_CAP_STORABLE); + require(!params->blit_src || fmt_caps & PL_FMT_CAP_BLITTABLE); + require(!params->blit_dst || fmt_caps & PL_FMT_CAP_BLITTABLE); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_create(gpu, params); + +error: + if (params->debug_tag) + PL_ERR(gpu, " for texture: %s", params->debug_tag); + return NULL; +} + +void pl_tex_destroy(pl_gpu gpu, pl_tex *tex) +{ + if (!*tex) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->tex_destroy(gpu, *tex); + *tex = NULL; +} + +static bool pl_tex_params_superset(struct pl_tex_params a, struct pl_tex_params b) +{ + return a.w == b.w && a.h == b.h && a.d == b.d && + a.format == b.format && + (a.sampleable || !b.sampleable) && + (a.renderable || !b.renderable) && + (a.storable || !b.storable) && + (a.blit_src || !b.blit_src) && + (a.blit_dst || !b.blit_dst) && + (a.host_writable || !b.host_writable) && + (a.host_readable || !b.host_readable); +} + +bool pl_tex_recreate(pl_gpu gpu, pl_tex *tex, const struct pl_tex_params *params) +{ + if (params->initial_data) { + PL_ERR(gpu, "pl_tex_recreate may not be used with `initial_data`!"); + return false; + } + + if (params->import_handle) { + PL_ERR(gpu, "pl_tex_recreate may not be used with `import_handle`!"); + return false; + } + + if (*tex && pl_tex_params_superset((*tex)->params, *params)) { + pl_tex_invalidate(gpu, *tex); + return true; + } + + PL_DEBUG(gpu, "(Re)creating %dx%dx%d texture with format %s: %s", + params->w, params->h, params->d, params->format->name, + PL_DEF(params->debug_tag, "unknown")); + + pl_tex_destroy(gpu, tex); + *tex = pl_tex_create(gpu, params); + + return !!*tex; +} + +void pl_tex_clear_ex(pl_gpu gpu, pl_tex dst, const union pl_clear_color color) +{ + require(dst->params.blit_dst); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->tex_invalidate) + impl->tex_invalidate(gpu, dst); + impl->tex_clear_ex(gpu, dst, color); + return; + +error: + if (dst->params.debug_tag) + PL_ERR(gpu, " for texture: %s", dst->params.debug_tag); +} + +void pl_tex_clear(pl_gpu gpu, pl_tex dst, const float color[4]) +{ + if (!pl_fmt_is_float(dst->params.format)) { + PL_ERR(gpu, "Cannot call `pl_tex_clear` on integer textures, please " + "use `pl_tex_clear_ex` instead."); + return; + } + + const union pl_clear_color col = { + .f = { color[0], color[1], color[2], color[3] }, + }; + + pl_tex_clear_ex(gpu, dst, col); +} + +void pl_tex_invalidate(pl_gpu gpu, pl_tex tex) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->tex_invalidate) + impl->tex_invalidate(gpu, tex); +} + +static void strip_coords(pl_tex tex, pl_rect3d *rc) +{ + if (!tex->params.d) { + rc->z0 = 0; + rc->z1 = 1; + } + + if (!tex->params.h) { + rc->y0 = 0; + rc->y1 = 1; + } +} + +static void infer_rc(pl_tex tex, pl_rect3d *rc) +{ + if 
(!rc->x0 && !rc->x1) + rc->x1 = tex->params.w; + if (!rc->y0 && !rc->y1) + rc->y1 = tex->params.h; + if (!rc->z0 && !rc->z1) + rc->z1 = tex->params.d; +} + +void pl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + pl_tex src = params->src, dst = params->dst; + require(src && dst); + pl_fmt src_fmt = src->params.format; + pl_fmt dst_fmt = dst->params.format; + require(src_fmt->internal_size == dst_fmt->internal_size); + require((src_fmt->type == PL_FMT_UINT) == (dst_fmt->type == PL_FMT_UINT)); + require((src_fmt->type == PL_FMT_SINT) == (dst_fmt->type == PL_FMT_SINT)); + require(src->params.blit_src); + require(dst->params.blit_dst); + require(params->sample_mode != PL_TEX_SAMPLE_LINEAR || (src_fmt->caps & PL_FMT_CAP_LINEAR)); + + struct pl_tex_blit_params fixed = *params; + infer_rc(src, &fixed.src_rc); + infer_rc(dst, &fixed.dst_rc); + strip_coords(src, &fixed.src_rc); + strip_coords(dst, &fixed.dst_rc); + + require(fixed.src_rc.x0 >= 0 && fixed.src_rc.x0 < src->params.w); + require(fixed.src_rc.x1 > 0 && fixed.src_rc.x1 <= src->params.w); + require(fixed.dst_rc.x0 >= 0 && fixed.dst_rc.x0 < dst->params.w); + require(fixed.dst_rc.x1 > 0 && fixed.dst_rc.x1 <= dst->params.w); + + if (src->params.h) { + require(fixed.src_rc.y0 >= 0 && fixed.src_rc.y0 < src->params.h); + require(fixed.src_rc.y1 > 0 && fixed.src_rc.y1 <= src->params.h); + } + + if (dst->params.h) { + require(fixed.dst_rc.y0 >= 0 && fixed.dst_rc.y0 < dst->params.h); + require(fixed.dst_rc.y1 > 0 && fixed.dst_rc.y1 <= dst->params.h); + } + + if (src->params.d) { + require(fixed.src_rc.z0 >= 0 && fixed.src_rc.z0 < src->params.d); + require(fixed.src_rc.z1 > 0 && fixed.src_rc.z1 <= src->params.d); + } + + if (dst->params.d) { + require(fixed.dst_rc.z0 >= 0 && fixed.dst_rc.z0 < dst->params.d); + require(fixed.dst_rc.z1 > 0 && fixed.dst_rc.z1 <= dst->params.d); + } + + pl_rect3d full = {0, 0, 0, dst->params.w, dst->params.h, dst->params.d}; + strip_coords(dst, &full); + + pl_rect3d rcnorm = fixed.dst_rc; + pl_rect3d_normalize(&rcnorm); + if (pl_rect3d_eq(rcnorm, full)) + pl_tex_invalidate(gpu, dst); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->tex_blit(gpu, &fixed); + return; + +error: + if (src->params.debug_tag || dst->params.debug_tag) { + PL_ERR(gpu, " for textures: src %s, dst %s", + PL_DEF(src->params.debug_tag, "(unknown)"), + PL_DEF(dst->params.debug_tag, "(unknown)")); + } +} + +static bool fix_tex_transfer(pl_gpu gpu, struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_rect3d rc = params->rc; + + // Infer the default values + infer_rc(tex, &rc); + strip_coords(tex, &rc); + + if (!params->row_pitch || !tex->params.w) + params->row_pitch = pl_rect_w(rc) * fmt->texel_size; + if (!params->depth_pitch || !tex->params.d) + params->depth_pitch = pl_rect_h(rc) * params->row_pitch; + + require(params->row_pitch); + require(params->depth_pitch); + params->rc = rc; + + // Check the parameters for sanity + switch (pl_tex_params_dimension(tex->params)) + { + case 3: + require(rc.z1 > rc.z0); + require(rc.z0 >= 0 && rc.z0 < tex->params.d); + require(rc.z1 > 0 && rc.z1 <= tex->params.d); + require(params->depth_pitch >= pl_rect_h(rc) * params->row_pitch); + require(params->depth_pitch % params->row_pitch == 0); + // fall through + case 2: + require(rc.y1 > rc.y0); + require(rc.y0 >= 0 && rc.y0 < tex->params.h); + require(rc.y1 > 0 && rc.y1 <= tex->params.h); + require(params->row_pitch >= pl_rect_w(rc) * fmt->texel_size); + require(params->row_pitch % 
fmt->texel_align == 0); + // fall through + case 1: + require(rc.x1 > rc.x0); + require(rc.x0 >= 0 && rc.x0 < tex->params.w); + require(rc.x1 > 0 && rc.x1 <= tex->params.w); + break; + } + + require(!params->buf ^ !params->ptr); // exactly one + if (params->buf) { + pl_buf buf = params->buf; + size_t size = pl_tex_transfer_size(params); + require(params->buf_offset + size >= params->buf_offset); // overflow check + require(params->buf_offset + size <= buf->params.size); + require(gpu->limits.buf_transfer); + } + + require(!params->callback || gpu->limits.callbacks); + return true; + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +bool pl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + require(tex->params.host_writable); + + struct pl_tex_transfer_params fixed = *params; + if (!fix_tex_transfer(gpu, &fixed)) + goto error; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_upload(gpu, &fixed); + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +bool pl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + pl_tex tex = params->tex; + require(tex->params.host_readable); + + struct pl_tex_transfer_params fixed = *params; + if (!fix_tex_transfer(gpu, &fixed)) + goto error; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_download(gpu, &fixed); + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +bool pl_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t t) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_poll ? impl->tex_poll(gpu, tex, t) : false; +} + +pl_buf pl_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + struct pl_buf_params params_rounded; + + require(!params->import_handle || !params->export_handle); + if (params->export_handle) { + require(PL_ISPOT(params->export_handle)); + require(params->export_handle & gpu->export_caps.buf); + } + if (params->import_handle) { + require(PL_ISPOT(params->import_handle)); + require(params->import_handle & gpu->import_caps.buf); + const struct pl_shared_mem *shmem = ¶ms->shared_mem; + require(shmem->offset + params->size <= shmem->size); + require(params->import_handle != PL_HANDLE_DMA_BUF || !shmem->drm_format_mod); + + // Fix misalignment on host pointer imports + if (params->import_handle == PL_HANDLE_HOST_PTR) { + uintptr_t page_mask = ~(gpu->limits.align_host_ptr - 1); + uintptr_t ptr_base = (uintptr_t) shmem->handle.ptr & page_mask; + size_t ptr_offset = (uintptr_t) shmem->handle.ptr - ptr_base; + size_t buf_offset = ptr_offset + shmem->offset; + size_t ptr_size = PL_ALIGN2(ptr_offset + shmem->size, + gpu->limits.align_host_ptr); + + if (ptr_base != (uintptr_t) shmem->handle.ptr || ptr_size > shmem->size) { + static bool warned_rounding = false; + if (!warned_rounding) { + warned_rounding = true; + PL_WARN(gpu, "Imported host pointer is not page-aligned. 
" + "This should normally be fine on most platforms, " + "but may cause issues in some rare circumstances."); + } + + PL_TRACE(gpu, "Rounding imported host pointer %p + %zu -> %zu to " + "nearest page boundaries: %p + %zu -> %zu", + shmem->handle.ptr, shmem->offset, shmem->size, + (void *) ptr_base, buf_offset, ptr_size); + } + + params_rounded = *params; + params_rounded.shared_mem.handle.ptr = (void *) ptr_base; + params_rounded.shared_mem.offset = buf_offset; + params_rounded.shared_mem.size = ptr_size; + params = ¶ms_rounded; + } + } + + require(params->size > 0 && params->size <= gpu->limits.max_buf_size); + require(!params->uniform || params->size <= gpu->limits.max_ubo_size); + require(!params->storable || params->size <= gpu->limits.max_ssbo_size); + require(!params->drawable || params->size <= gpu->limits.max_vbo_size); + require(!params->host_mapped || params->size <= gpu->limits.max_mapped_size); + + if (params->format) { + pl_fmt fmt = params->format; + require(params->size <= gpu->limits.max_buffer_texels * fmt->texel_size); + require(!params->uniform || (fmt->caps & PL_FMT_CAP_TEXEL_UNIFORM)); + require(!params->storable || (fmt->caps & PL_FMT_CAP_TEXEL_STORAGE)); + } + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + pl_buf buf = impl->buf_create(gpu, params); + if (buf) + require(!params->host_mapped || buf->data); + + return buf; + +error: + if (params->debug_tag) + PL_ERR(gpu, " for buffer: %s", params->debug_tag); + return NULL; +} + +void pl_buf_destroy(pl_gpu gpu, pl_buf *buf) +{ + if (!*buf) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->buf_destroy(gpu, *buf); + *buf = NULL; +} + +static bool pl_buf_params_superset(struct pl_buf_params a, struct pl_buf_params b) +{ + return a.size >= b.size && + a.memory_type == b.memory_type && + a.format == b.format && + (a.host_writable || !b.host_writable) && + (a.host_readable || !b.host_readable) && + (a.host_mapped || !b.host_mapped) && + (a.uniform || !b.uniform) && + (a.storable || !b.storable) && + (a.drawable || !b.drawable); +} + +bool pl_buf_recreate(pl_gpu gpu, pl_buf *buf, const struct pl_buf_params *params) +{ + + if (params->initial_data) { + PL_ERR(gpu, "pl_buf_recreate may not be used with `initial_data`!"); + return false; + } + + if (*buf && pl_buf_params_superset((*buf)->params, *params)) + return true; + + PL_INFO(gpu, "(Re)creating %zu buffer", params->size); + pl_buf_destroy(gpu, buf); + *buf = pl_buf_create(gpu, params); + + return !!*buf; +} + +void pl_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset, + const void *data, size_t size) +{ + require(buf->params.host_writable); + require(buf_offset + size <= buf->params.size); + require(buf_offset == PL_ALIGN2(buf_offset, 4)); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->buf_write(gpu, buf, buf_offset, data, size); + return; + +error: + if (buf->params.debug_tag) + PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag); +} + +bool pl_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset, + void *dest, size_t size) +{ + require(buf->params.host_readable); + require(buf_offset + size <= buf->params.size); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->buf_read(gpu, buf, buf_offset, dest, size); + +error: + if (buf->params.debug_tag) + PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag); + return false; +} + +void pl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size) +{ + require(src_offset + size <= src->params.size); + require(dst_offset + size <= 
dst->params.size); + require(src != dst); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->buf_copy(gpu, dst, dst_offset, src, src_offset, size); + return; + +error: + if (src->params.debug_tag || dst->params.debug_tag) { + PL_ERR(gpu, " for buffers: src %s, dst %s", + src->params.debug_tag, dst->params.debug_tag); + } +} + +bool pl_buf_export(pl_gpu gpu, pl_buf buf) +{ + require(buf->params.export_handle || buf->params.import_handle); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->buf_export(gpu, buf); + +error: + if (buf->params.debug_tag) + PL_ERR(gpu, " for buffer: %s", buf->params.debug_tag); + return false; +} + +bool pl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t t) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->buf_poll ? impl->buf_poll(gpu, buf, t) : false; +} + +size_t pl_var_type_size(enum pl_var_type type) +{ + switch (type) { + case PL_VAR_SINT: return sizeof(int); + case PL_VAR_UINT: return sizeof(unsigned int); + case PL_VAR_FLOAT: return sizeof(float); + case PL_VAR_INVALID: // fall through + case PL_VAR_TYPE_COUNT: break; + } + + pl_unreachable(); +} + +#define PL_VAR(TYPE, NAME, M, V) \ + struct pl_var pl_var_##NAME(const char *name) { \ + return (struct pl_var) { \ + .name = name, \ + .type = PL_VAR_##TYPE, \ + .dim_m = M, \ + .dim_v = V, \ + .dim_a = 1, \ + }; \ + } + +PL_VAR(FLOAT, float, 1, 1) +PL_VAR(FLOAT, vec2, 1, 2) +PL_VAR(FLOAT, vec3, 1, 3) +PL_VAR(FLOAT, vec4, 1, 4) +PL_VAR(FLOAT, mat2, 2, 2) +PL_VAR(FLOAT, mat2x3, 2, 3) +PL_VAR(FLOAT, mat2x4, 2, 4) +PL_VAR(FLOAT, mat3, 3, 3) +PL_VAR(FLOAT, mat3x4, 3, 4) +PL_VAR(FLOAT, mat4x2, 4, 2) +PL_VAR(FLOAT, mat4x3, 4, 3) +PL_VAR(FLOAT, mat4, 4, 4) +PL_VAR(SINT, int, 1, 1) +PL_VAR(SINT, ivec2, 1, 2) +PL_VAR(SINT, ivec3, 1, 3) +PL_VAR(SINT, ivec4, 1, 4) +PL_VAR(UINT, uint, 1, 1) +PL_VAR(UINT, uvec2, 1, 2) +PL_VAR(UINT, uvec3, 1, 3) +PL_VAR(UINT, uvec4, 1, 4) + +#undef PL_VAR + +const struct pl_named_var pl_var_glsl_types[] = { + // float vectors + { "float", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }}, + { "vec2", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }}, + { "vec3", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }}, + { "vec4", { .type = PL_VAR_FLOAT, .dim_m = 1, .dim_v = 4, .dim_a = 1, }}, + // float matrices + { "mat2", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 2, .dim_a = 1, }}, + { "mat2x3", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 3, .dim_a = 1, }}, + { "mat2x4", { .type = PL_VAR_FLOAT, .dim_m = 2, .dim_v = 4, .dim_a = 1, }}, + { "mat3", { .type = PL_VAR_FLOAT, .dim_m = 3, .dim_v = 3, .dim_a = 1, }}, + { "mat3x4", { .type = PL_VAR_FLOAT, .dim_m = 3, .dim_v = 4, .dim_a = 1, }}, + { "mat4x2", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 2, .dim_a = 1, }}, + { "mat4x3", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 3, .dim_a = 1, }}, + { "mat4", { .type = PL_VAR_FLOAT, .dim_m = 4, .dim_v = 4, .dim_a = 1, }}, + // integer vectors + { "int", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }}, + { "ivec2", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }}, + { "ivec3", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }}, + { "ivec4", { .type = PL_VAR_SINT, .dim_m = 1, .dim_v = 4, .dim_a = 1, }}, + // unsigned integer vectors + { "uint", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 1, .dim_a = 1, }}, + { "uvec2", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 2, .dim_a = 1, }}, + { "uvec3", { .type = PL_VAR_UINT, .dim_m = 1, .dim_v = 3, .dim_a = 1, }}, + { "uvec4", { .type = PL_VAR_UINT, 
.dim_m = 1, .dim_v = 4, .dim_a = 1, }}, + + {0}, +}; + +#define MAX_DIM 4 + +const char *pl_var_glsl_type_name(struct pl_var var) +{ + static const char *types[PL_VAR_TYPE_COUNT][MAX_DIM+1][MAX_DIM+1] = { + // float vectors + [PL_VAR_FLOAT][1][1] = "float", + [PL_VAR_FLOAT][1][2] = "vec2", + [PL_VAR_FLOAT][1][3] = "vec3", + [PL_VAR_FLOAT][1][4] = "vec4", + // float matrices + [PL_VAR_FLOAT][2][2] = "mat2", + [PL_VAR_FLOAT][2][3] = "mat2x3", + [PL_VAR_FLOAT][2][4] = "mat2x4", + [PL_VAR_FLOAT][3][2] = "mat3x2", + [PL_VAR_FLOAT][3][3] = "mat3", + [PL_VAR_FLOAT][3][4] = "mat3x4", + [PL_VAR_FLOAT][4][2] = "mat4x2", + [PL_VAR_FLOAT][4][3] = "mat4x3", + [PL_VAR_FLOAT][4][4] = "mat4", + // integer vectors + [PL_VAR_SINT][1][1] = "int", + [PL_VAR_SINT][1][2] = "ivec2", + [PL_VAR_SINT][1][3] = "ivec3", + [PL_VAR_SINT][1][4] = "ivec4", + // unsigned integer vectors + [PL_VAR_UINT][1][1] = "uint", + [PL_VAR_UINT][1][2] = "uvec2", + [PL_VAR_UINT][1][3] = "uvec3", + [PL_VAR_UINT][1][4] = "uvec4", + }; + + if (var.dim_v > MAX_DIM || var.dim_m > MAX_DIM) + return NULL; + + return types[var.type][var.dim_m][var.dim_v]; +} + +struct pl_var pl_var_from_fmt(pl_fmt fmt, const char *name) +{ + static const enum pl_var_type vartypes[] = { + [PL_FMT_FLOAT] = PL_VAR_FLOAT, + [PL_FMT_UNORM] = PL_VAR_FLOAT, + [PL_FMT_SNORM] = PL_VAR_FLOAT, + [PL_FMT_UINT] = PL_VAR_UINT, + [PL_FMT_SINT] = PL_VAR_SINT, + }; + + pl_assert(fmt->type < PL_ARRAY_SIZE(vartypes)); + return (struct pl_var) { + .type = vartypes[fmt->type], + .name = name, + .dim_v = fmt->num_components, + .dim_m = 1, + .dim_a = 1, + }; +} + +struct pl_var_layout pl_var_host_layout(size_t offset, const struct pl_var *var) +{ + size_t col_size = pl_var_type_size(var->type) * var->dim_v; + return (struct pl_var_layout) { + .offset = offset, + .stride = col_size, + .size = col_size * var->dim_m * var->dim_a, + }; +} + +struct pl_var_layout pl_std140_layout(size_t offset, const struct pl_var *var) +{ + size_t el_size = pl_var_type_size(var->type); + + // std140 packing rules: + // 1. The size of generic values is their size in bytes + // 2. The size of vectors is the vector length * the base count + // 3. Matrices are treated like arrays of column vectors + // 4. The size of array rows is that of the element size rounded up to + // the nearest multiple of vec4 + // 5. 
All values are aligned to a multiple of their size (stride for arrays), + // with the exception of vec3 which is aligned like vec4 + size_t stride = el_size * var->dim_v; + size_t align = stride; + if (var->dim_v == 3) + align += el_size; + if (var->dim_m * var->dim_a > 1) + stride = align = PL_ALIGN2(align, sizeof(float[4])); + + return (struct pl_var_layout) { + .offset = PL_ALIGN2(offset, align), + .stride = stride, + .size = stride * var->dim_m * var->dim_a, + }; +} + +struct pl_var_layout pl_std430_layout(size_t offset, const struct pl_var *var) +{ + size_t el_size = pl_var_type_size(var->type); + + // std430 packing rules: like std140, except arrays/matrices are always + // "tightly" packed, even arrays/matrices of vec3s + size_t stride = el_size * var->dim_v; + size_t align = stride; + if (var->dim_v == 3) + align += el_size; + if (var->dim_m * var->dim_a > 1) + stride = align; + + return (struct pl_var_layout) { + .offset = PL_ALIGN2(offset, align), + .stride = stride, + .size = stride * var->dim_m * var->dim_a, + }; +} + +void memcpy_layout(void *dst_p, struct pl_var_layout dst_layout, + const void *src_p, struct pl_var_layout src_layout) +{ + uintptr_t src = (uintptr_t) src_p + src_layout.offset; + uintptr_t dst = (uintptr_t) dst_p + dst_layout.offset; + + if (src_layout.stride == dst_layout.stride) { + pl_assert(dst_layout.size == src_layout.size); + memcpy((void *) dst, (const void *) src, src_layout.size); + return; + } + + size_t stride = PL_MIN(src_layout.stride, dst_layout.stride); + uintptr_t end = src + src_layout.size; + while (src < end) { + pl_assert(dst < dst + dst_layout.size); + memcpy((void *) dst, (const void *) src, stride); + src += src_layout.stride; + dst += dst_layout.stride; + } +} + +int pl_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + int ret = impl->desc_namespace(gpu, type); + pl_assert(ret >= 0 && ret < PL_DESC_TYPE_COUNT); + return ret; +} + +const char *pl_desc_access_glsl_name(enum pl_desc_access mode) +{ + switch (mode) { + case PL_DESC_ACCESS_READWRITE: return ""; + case PL_DESC_ACCESS_READONLY: return "readonly"; + case PL_DESC_ACCESS_WRITEONLY: return "writeonly"; + case PL_DESC_ACCESS_COUNT: break; + } + + pl_unreachable(); +} + +const struct pl_blend_params pl_alpha_overlay = { + .src_rgb = PL_BLEND_SRC_ALPHA, + .dst_rgb = PL_BLEND_ONE_MINUS_SRC_ALPHA, + .src_alpha = PL_BLEND_ONE, + .dst_alpha = PL_BLEND_ONE_MINUS_SRC_ALPHA, +}; + +static inline void log_shader_sources(pl_log log, enum pl_log_level level, + const struct pl_pass_params *params) +{ + if (!pl_msg_test(log, level) || !params->glsl_shader) + return; + + switch (params->type) { + case PL_PASS_RASTER: + if (!params->vertex_shader) + return; + pl_msg(log, level, "vertex shader source:"); + pl_msg_source(log, level, params->vertex_shader); + pl_msg(log, level, "fragment shader source:"); + pl_msg_source(log, level, params->glsl_shader); + return; + + case PL_PASS_COMPUTE: + pl_msg(log, level, "compute shader source:"); + pl_msg_source(log, level, params->glsl_shader); + return; + + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void log_spec_constants(pl_log log, enum pl_log_level lev, + const struct pl_pass_params *params, + const void *constant_data) +{ + if (!constant_data || !params->num_constants || !pl_msg_test(log, lev)) + return; + + pl_msg(log, lev, "Specialization constant values:"); + + uintptr_t data_base = (uintptr_t) constant_data; + for (int i = 0; i < 
params->num_constants; i++) { + union { + int i; + unsigned u; + float f; + } *data = (void *) (data_base + params->constants[i].offset); + int id = params->constants[i].id; + + switch (params->constants[i].type) { + case PL_VAR_SINT: pl_msg(log, lev, " constant_id=%d: %d", id, data->i); break; + case PL_VAR_UINT: pl_msg(log, lev, " constant_id=%d: %u", id, data->u); break; + case PL_VAR_FLOAT: pl_msg(log, lev, " constant_id=%d: %f", id, data->f); break; + default: pl_unreachable(); + } + } +} + +pl_pass pl_pass_create(pl_gpu gpu, const struct pl_pass_params *params) +{ + require(params->glsl_shader); + switch(params->type) { + case PL_PASS_RASTER: + require(params->vertex_shader); + require(params->vertex_stride % gpu->limits.align_vertex_stride == 0); + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct pl_vertex_attrib va = params->vertex_attribs[i]; + require(va.name); + require(va.fmt); + require(va.fmt->caps & PL_FMT_CAP_VERTEX); + require(va.offset + va.fmt->texel_size <= params->vertex_stride); + } + + require(params->target_format); + require(params->target_format->caps & PL_FMT_CAP_RENDERABLE); + require(!params->blend_params || params->target_format->caps & PL_FMT_CAP_BLENDABLE); + require(!params->blend_params || params->load_target); + break; + case PL_PASS_COMPUTE: + require(gpu->glsl.compute); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + size_t num_var_comps = 0; + for (int i = 0; i < params->num_variables; i++) { + struct pl_var var = params->variables[i]; + num_var_comps += var.dim_v * var.dim_m * var.dim_a; + require(var.name); + require(pl_var_glsl_type_name(var)); + } + require(num_var_comps <= gpu->limits.max_variable_comps); + + require(params->num_constants <= gpu->limits.max_constants); + for (int i = 0; i < params->num_constants; i++) + require(params->constants[i].type); + + for (int i = 0; i < params->num_descriptors; i++) { + struct pl_desc desc = params->descriptors[i]; + require(desc.name); + + // enforce disjoint descriptor bindings for each namespace + int namespace = pl_desc_namespace(gpu, desc.type); + for (int j = i+1; j < params->num_descriptors; j++) { + struct pl_desc other = params->descriptors[j]; + require(desc.binding != other.binding || + namespace != pl_desc_namespace(gpu, other.type)); + } + } + + require(params->push_constants_size <= gpu->limits.max_pushc_size); + require(params->push_constants_size == PL_ALIGN2(params->push_constants_size, 4)); + + log_shader_sources(gpu->log, PL_LOG_DEBUG, params); + log_spec_constants(gpu->log, PL_LOG_DEBUG, params, params->constant_data); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + pl_pass pass = impl->pass_create(gpu, params); + if (!pass) + goto error; + + return pass; + +error: + log_shader_sources(gpu->log, PL_LOG_ERR, params); + pl_log_stack_trace(gpu->log, PL_LOG_ERR); + pl_debug_abort(); + return NULL; +} + +void pl_pass_destroy(pl_gpu gpu, pl_pass *pass) +{ + if (!*pass) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->pass_destroy(gpu, *pass); + *pass = NULL; +} + +void pl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + pl_pass pass = params->pass; + struct pl_pass_run_params new = *params; + + for (int i = 0; i < pass->params.num_descriptors; i++) { + struct pl_desc desc = pass->params.descriptors[i]; + struct pl_desc_binding db = params->desc_bindings[i]; + require(db.object); + switch (desc.type) { + case PL_DESC_SAMPLED_TEX: { + pl_tex tex = db.object; + pl_fmt fmt = tex->params.format; + 
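            // Sampled descriptors must bind a sampleable texture; linear
            // filtering additionally requires PL_FMT_CAP_LINEAR on its format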
require(tex->params.sampleable); + require(db.sample_mode != PL_TEX_SAMPLE_LINEAR || (fmt->caps & PL_FMT_CAP_LINEAR)); + break; + } + case PL_DESC_STORAGE_IMG: { + pl_tex tex = db.object; + pl_fmt fmt = tex->params.format; + require(tex->params.storable); + require(desc.access != PL_DESC_ACCESS_READWRITE || (fmt->caps & PL_FMT_CAP_READWRITE)); + break; + } + case PL_DESC_BUF_UNIFORM: { + pl_buf buf = db.object; + require(buf->params.uniform); + break; + } + case PL_DESC_BUF_STORAGE: { + pl_buf buf = db.object; + require(buf->params.storable); + break; + } + case PL_DESC_BUF_TEXEL_UNIFORM: { + pl_buf buf = db.object; + require(buf->params.uniform && buf->params.format); + break; + } + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = db.object; + pl_fmt fmt = buf->params.format; + require(buf->params.storable && buf->params.format); + require(desc.access != PL_DESC_ACCESS_READWRITE || (fmt->caps & PL_FMT_CAP_READWRITE)); + break; + } + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + + for (int i = 0; i < params->num_var_updates; i++) { + struct pl_var_update vu = params->var_updates[i]; + require(vu.index >= 0 && vu.index < pass->params.num_variables); + require(vu.data); + } + + require(params->push_constants || !pass->params.push_constants_size); + + switch (pass->params.type) { + case PL_PASS_RASTER: { + switch (pass->params.vertex_type) { + case PL_PRIM_TRIANGLE_LIST: + require(params->vertex_count % 3 == 0); + // fall through + case PL_PRIM_TRIANGLE_STRIP: + require(params->vertex_count >= 3); + break; + case PL_PRIM_TYPE_COUNT: + pl_unreachable(); + } + + require(!params->vertex_data ^ !params->vertex_buf); + if (params->vertex_buf) { + pl_buf vertex_buf = params->vertex_buf; + require(vertex_buf->params.drawable); + if (!params->index_data && !params->index_buf) { + // Cannot bounds check indexed draws + size_t vert_size = params->vertex_count * pass->params.vertex_stride; + require(params->buf_offset + vert_size <= vertex_buf->params.size); + } + } + + require(!params->index_data || !params->index_buf); + if (params->index_buf) { + pl_buf index_buf = params->index_buf; + require(!params->vertex_data); + require(index_buf->params.drawable); + size_t index_size = pl_index_buf_size(params); + require(params->index_offset + index_size <= index_buf->params.size); + } + + pl_tex target = params->target; + require(target); + require(pl_tex_params_dimension(target->params) == 2); + require(target->params.format->signature == pass->params.target_format->signature); + require(target->params.renderable); + pl_rect2d *vp = &new.viewport; + pl_rect2d *sc = &new.scissors; + + // Sanitize viewport/scissors + if (!vp->x0 && !vp->x1) + vp->x1 = target->params.w; + if (!vp->y0 && !vp->y1) + vp->y1 = target->params.h; + + if (!sc->x0 && !sc->x1) + sc->x1 = target->params.w; + if (!sc->y0 && !sc->y1) + sc->y1 = target->params.h; + + // Constrain the scissors to the target dimension (to sanitize the + // underlying graphics API calls) + sc->x0 = PL_CLAMP(sc->x0, 0, target->params.w); + sc->y0 = PL_CLAMP(sc->y0, 0, target->params.h); + sc->x1 = PL_CLAMP(sc->x1, 0, target->params.w); + sc->y1 = PL_CLAMP(sc->y1, 0, target->params.h); + + // Scissors wholly outside target -> silently drop pass (also needed + // to ensure we don't cause UB by specifying invalid scissors) + if (!pl_rect_w(*sc) || !pl_rect_h(*sc)) + return; + + require(pl_rect_w(*vp) > 0); + require(pl_rect_h(*vp) > 0); + require(pl_rect_w(*sc) > 0); + require(pl_rect_h(*sc) > 0); + + if 
(!pass->params.load_target) + pl_tex_invalidate(gpu, target); + break; + } + case PL_PASS_COMPUTE: + for (int i = 0; i < PL_ARRAY_SIZE(params->compute_groups); i++) { + require(params->compute_groups[i] >= 0); + require(params->compute_groups[i] <= gpu->limits.max_dispatch[i]); + } + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->pass_run(gpu, &new); + +error: + return; +} + +void pl_gpu_flush(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->gpu_flush) + impl->gpu_flush(gpu); +} + +void pl_gpu_finish(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->gpu_finish(gpu); +} + +bool pl_gpu_is_failed(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (!impl->gpu_is_failed) + return false; + + return impl->gpu_is_failed(gpu); +} + +pl_sync pl_sync_create(pl_gpu gpu, enum pl_handle_type handle_type) +{ + require(handle_type); + require(handle_type & gpu->export_caps.sync); + require(PL_ISPOT(handle_type)); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->sync_create(gpu, handle_type); + +error: + return NULL; +} + +void pl_sync_destroy(pl_gpu gpu, pl_sync *sync) +{ + if (!*sync) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->sync_destroy(gpu, *sync); + *sync = NULL; +} + +bool pl_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync) +{ + require(tex->params.import_handle || tex->params.export_handle); + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->tex_export(gpu, tex, sync); + +error: + if (tex->params.debug_tag) + PL_ERR(gpu, " for texture: %s", tex->params.debug_tag); + return false; +} + +pl_timer pl_timer_create(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (!impl->timer_create) + return NULL; + + return impl->timer_create(gpu); +} + +void pl_timer_destroy(pl_gpu gpu, pl_timer *timer) +{ + if (!*timer) + return; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + impl->timer_destroy(gpu, *timer); + *timer = NULL; +} + +uint64_t pl_timer_query(pl_gpu gpu, pl_timer timer) +{ + if (!timer) + return 0; + + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + return impl->timer_query(gpu, timer); +} diff --git a/src/gpu.h b/src/gpu.h new file mode 100644 index 0000000..e915a50 --- /dev/null +++ b/src/gpu.h @@ -0,0 +1,207 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" +#include "log.h" + +#include <libplacebo/gpu.h> +#include <libplacebo/dispatch.h> + +// To avoid having to include drm_fourcc.h +#ifndef DRM_FORMAT_MOD_LINEAR +#define DRM_FORMAT_MOD_LINEAR UINT64_C(0x0) +#define DRM_FORMAT_MOD_INVALID ((UINT64_C(1) << 56) - 1) +#endif + +// This struct must be the first member of the gpu's priv struct. The `pl_gpu` +// helpers will cast the priv struct to this struct! 
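//
// For illustration, a hypothetical backend priv struct following this rule
// (the struct and field names here are made up):
//
//   struct my_backend_priv {
//       struct pl_gpu_fns impl; // must come first, so PL_PRIV(gpu) casts work
//       /* backend-specific state follows */
//   };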
+ +#define GPU_PFN(name) __typeof__(pl_##name) *name +struct pl_gpu_fns { + // This is a pl_dispatch used (on the pl_gpu itself!) for the purposes of + // dispatching compute shaders for performing various emulation tasks (e.g. + // partial clears, blits or emulated texture transfers, see below). + // + // Warning: Care must be taken to avoid recursive calls. + pl_dispatch dp; + + // Internal cache, or NULL. Set by the user (via pl_gpu_set_cache). + _Atomic(pl_cache) cache; + + // Destructors: These also free the corresponding objects, but they + // must not be called on NULL. (The NULL checks are done by the pl_*_destroy + // wrappers) + void (*destroy)(pl_gpu gpu); + void (*tex_destroy)(pl_gpu, pl_tex); + void (*buf_destroy)(pl_gpu, pl_buf); + void (*pass_destroy)(pl_gpu, pl_pass); + void (*sync_destroy)(pl_gpu, pl_sync); + void (*timer_destroy)(pl_gpu, pl_timer); + + GPU_PFN(tex_create); + GPU_PFN(tex_invalidate); // optional + GPU_PFN(tex_clear_ex); // optional if no blittable formats + GPU_PFN(tex_blit); // optional if no blittable formats + GPU_PFN(tex_upload); + GPU_PFN(tex_download); + GPU_PFN(tex_poll); // optional: if NULL, textures are always free to use + GPU_PFN(buf_create); + GPU_PFN(buf_write); + GPU_PFN(buf_read); + GPU_PFN(buf_copy); + GPU_PFN(buf_export); // optional if !gpu->export_caps.buf + GPU_PFN(buf_poll); // optional: if NULL, buffers are always free to use + GPU_PFN(desc_namespace); + GPU_PFN(pass_create); + GPU_PFN(pass_run); + GPU_PFN(sync_create); // optional if !gpu->export_caps.sync + GPU_PFN(tex_export); // optional if !gpu->export_caps.sync + GPU_PFN(timer_create); // optional + GPU_PFN(timer_query); // optional + GPU_PFN(gpu_flush); // optional + GPU_PFN(gpu_finish); + GPU_PFN(gpu_is_failed); // optional +}; +#undef GPU_PFN + +// All resources such as textures and buffers allocated from the GPU must be +// destroyed before calling pl_destroy. +void pl_gpu_destroy(pl_gpu gpu); + +// Returns true if the device supports interop. This is considered to be +// the case if at least one of `gpu->export/import_caps` is nonzero. +static inline bool pl_gpu_supports_interop(pl_gpu gpu) +{ + return gpu->export_caps.tex || + gpu->import_caps.tex || + gpu->export_caps.buf || + gpu->import_caps.buf || + gpu->export_caps.sync || + gpu->import_caps.sync; +} + +// Returns the GPU-internal `pl_dispatch` and `pl_cache` objects. +pl_dispatch pl_gpu_dispatch(pl_gpu gpu); +pl_cache pl_gpu_cache(pl_gpu gpu); + +// GPU-internal helpers: these should not be used outside of GPU implementations + +// This performs several tasks. It sorts the format list, logs GPU metadata, +// performs verification and fixes up backwards compatibility fields. This +// should be returned as the last step when creating a `pl_gpu`. +pl_gpu pl_gpu_finalize(struct pl_gpu_t *gpu); + +// Look up the right GLSL image format qualifier from a partially filled-in +// pl_fmt, or NULL if the format does not have a legal matching GLSL name. +// +// `components` may differ from fmt->num_components (for emulated formats) +const char *pl_fmt_glsl_format(pl_fmt fmt, int components); + +// Look up the right fourcc from a partially filled-in pl_fmt, or 0 if the +// format does not have a legal matching fourcc format. +uint32_t pl_fmt_fourcc(pl_fmt fmt); + +// Compute the total size (in bytes) of a texture transfer operation +size_t pl_tex_transfer_size(const struct pl_tex_transfer_params *par); + +// Split a tex transfer into slices. For emulated formats, `texel_fmt` gives +// the format of the underlying texel buffer. 
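// Slices are sized so that each individual slice fits within the relevant
// buffer size limits.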
+// +// Returns the number of slices, or 0 on error (e.g. no SSBOs available). +// `out_slices` must be freed by caller (on success). +int pl_tex_transfer_slices(pl_gpu gpu, pl_fmt texel_fmt, + const struct pl_tex_transfer_params *params, + struct pl_tex_transfer_params **out_slices); + +// Helper that wraps pl_tex_upload/download using texture upload buffers to +// ensure that params->buf is always set. +bool pl_tex_upload_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params); +bool pl_tex_download_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// This requires that params.buf has been set and is of type PL_BUF_TEXEL_* +bool pl_tex_upload_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params); +bool pl_tex_download_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// Both `src` and `dst must be storable. `src` must also be sampleable, if the +// blit requires linear sampling. Returns false if these conditions are unmet. +bool pl_tex_blit_compute(pl_gpu gpu, const struct pl_tex_blit_params *params); + +// Helper to do a 2D blit with stretch and scale using a raster pass +void pl_tex_blit_raster(pl_gpu gpu, const struct pl_tex_blit_params *params); + +// Helper for GPU-accelerated endian swapping +// +// Note: `src` and `dst` can be the same buffer, for an in-place operation. In +// this case, `src_offset` and `dst_offset` must be the same. +struct pl_buf_copy_swap_params { + // Source of the copy operation. Must be `storable`. + pl_buf src; + size_t src_offset; + + // Destination of the copy operation. Must be `storable`. + pl_buf dst; + size_t dst_offset; + + // Number of bytes to copy. Must be a multiple of 4. + size_t size; + + // Underlying word size. Must be 2 (for 16-bit swap) or 4 (for 32-bit swap) + int wordsize; +}; + +bool pl_buf_copy_swap(pl_gpu gpu, const struct pl_buf_copy_swap_params *params); + +void pl_pass_run_vbo(pl_gpu gpu, const struct pl_pass_run_params *params); + +// Make a deep-copy of the pass params. Note: cached_program etc. are not +// copied, but cleared explicitly. +struct pl_pass_params pl_pass_params_copy(void *alloc, const struct pl_pass_params *params); + +// Helper to compute the size of an index buffer +static inline size_t pl_index_buf_size(const struct pl_pass_run_params *params) +{ + switch (params->index_fmt) { + case PL_INDEX_UINT16: return params->vertex_count * sizeof(uint16_t); + case PL_INDEX_UINT32: return params->vertex_count * sizeof(uint32_t); + case PL_INDEX_FORMAT_COUNT: break; + } + + pl_unreachable(); +} + +// Helper to compute the size of a vertex buffer required to fit all indices +size_t pl_vertex_buf_size(const struct pl_pass_run_params *params); + +// Utility function for pretty-printing UUIDs +#define UUID_SIZE 16 +#define PRINT_UUID(uuid) (print_uuid((char[3 * UUID_SIZE]){0}, (uuid))) +const char *print_uuid(char buf[3 * UUID_SIZE], const uint8_t uuid[UUID_SIZE]); + +// Helper to pretty-print fourcc codes +#define PRINT_FOURCC(fcc) \ + (!(fcc) ? "" : (char[5]) { \ + (fcc) & 0xFF, \ + ((fcc) >> 8) & 0xFF, \ + ((fcc) >> 16) & 0xFF, \ + ((fcc) >> 24) & 0xFF \ + }) + +#define DRM_MOD_SIZE 26 +#define PRINT_DRM_MOD(mod) (print_drm_mod((char[DRM_MOD_SIZE]){0}, (mod))) +const char *print_drm_mod(char buf[DRM_MOD_SIZE], uint64_t mod); diff --git a/src/gpu/utils.c b/src/gpu/utils.c new file mode 100644 index 0000000..40ca84d --- /dev/null +++ b/src/gpu/utils.c @@ -0,0 +1,1288 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" +#include "shaders.h" +#include "gpu.h" + +// GPU-internal helpers + +static int cmp_fmt(const void *pa, const void *pb) +{ + pl_fmt a = *(pl_fmt *)pa; + pl_fmt b = *(pl_fmt *)pb; + + // Always prefer non-opaque formats + if (a->opaque != b->opaque) + return PL_CMP(a->opaque, b->opaque); + + // Always prefer non-emulated formats + if (a->emulated != b->emulated) + return PL_CMP(a->emulated, b->emulated); + + int ca = __builtin_popcount(a->caps), + cb = __builtin_popcount(b->caps); + if (ca != cb) + return -PL_CMP(ca, cb); // invert to sort higher values first + + // If the population count is the same but the caps are different, prefer + // the caps with a "lower" value (which tend to be more fundamental caps) + if (a->caps != b->caps) + return PL_CMP(a->caps, b->caps); + + // If the capabilities are equal, sort based on the component attributes + for (int i = 0; i < PL_ARRAY_SIZE(a->component_depth); i++) { + int da = a->component_depth[i], + db = b->component_depth[i]; + if (da != db) + return PL_CMP(da, db); + + int ha = a->host_bits[i], + hb = b->host_bits[i]; + if (ha != hb) + return PL_CMP(ha, hb); + + int oa = a->sample_order[i], + ob = b->sample_order[i]; + if (oa != ob) + return PL_CMP(oa, ob); + } + + // Fall back to sorting by the name (for stability) + return strcmp(a->name, b->name); +} + +#define FMT_BOOL(letter, cap) ((cap) ? 
(letter) : '-') +#define FMT_IDX4(f) (f)[0], (f)[1], (f)[2], (f)[3] + +static void print_formats(pl_gpu gpu) +{ + if (!pl_msg_test(gpu->log, PL_LOG_DEBUG)) + return; + +#define CAP_HEADER "%-12s" +#define CAP_FIELDS "%c%c%c%c%c%c%c%c%c%c%c%c" +#define CAP_VALUES \ + FMT_BOOL('S', fmt->caps & PL_FMT_CAP_SAMPLEABLE), \ + FMT_BOOL('s', fmt->caps & PL_FMT_CAP_STORABLE), \ + FMT_BOOL('L', fmt->caps & PL_FMT_CAP_LINEAR), \ + FMT_BOOL('R', fmt->caps & PL_FMT_CAP_RENDERABLE), \ + FMT_BOOL('b', fmt->caps & PL_FMT_CAP_BLENDABLE), \ + FMT_BOOL('B', fmt->caps & PL_FMT_CAP_BLITTABLE), \ + FMT_BOOL('V', fmt->caps & PL_FMT_CAP_VERTEX), \ + FMT_BOOL('u', fmt->caps & PL_FMT_CAP_TEXEL_UNIFORM), \ + FMT_BOOL('t', fmt->caps & PL_FMT_CAP_TEXEL_STORAGE), \ + FMT_BOOL('H', fmt->caps & PL_FMT_CAP_HOST_READABLE), \ + FMT_BOOL('W', fmt->caps & PL_FMT_CAP_READWRITE), \ + FMT_BOOL('G', fmt->gatherable) + + PL_DEBUG(gpu, "GPU texture formats:"); + PL_DEBUG(gpu, " %-20s %-6s %-4s %-4s " CAP_HEADER " %-3s %-13s %-13s %-10s %-10s %-6s", + "NAME", "TYPE", "SIZE", "COMP", "CAPS", "EMU", "DEPTH", "HOST_BITS", + "GLSL_TYPE", "GLSL_FMT", "FOURCC"); + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + + static const char *types[] = { + [PL_FMT_UNKNOWN] = "UNKNOWN", + [PL_FMT_UNORM] = "UNORM", + [PL_FMT_SNORM] = "SNORM", + [PL_FMT_UINT] = "UINT", + [PL_FMT_SINT] = "SINT", + [PL_FMT_FLOAT] = "FLOAT", + }; + + static const char idx_map[4] = {'R', 'G', 'B', 'A'}; + char indices[4] = {' ', ' ', ' ', ' '}; + if (!fmt->opaque) { + for (int i = 0; i < fmt->num_components; i++) + indices[i] = idx_map[fmt->sample_order[i]]; + } + + + PL_DEBUG(gpu, " %-20s %-6s %-4zu %c%c%c%c " CAP_FIELDS " %-3s " + "{%-2d %-2d %-2d %-2d} {%-2d %-2d %-2d %-2d} %-10s %-10s %-6s", + fmt->name, types[fmt->type], fmt->texel_size, + FMT_IDX4(indices), CAP_VALUES, fmt->emulated ? "y" : "n", + FMT_IDX4(fmt->component_depth), FMT_IDX4(fmt->host_bits), + PL_DEF(fmt->glsl_type, ""), PL_DEF(fmt->glsl_format, ""), + PRINT_FOURCC(fmt->fourcc)); + +#undef CAP_HEADER +#undef CAP_FIELDS +#undef CAP_VALUES + + for (int i = 0; i < fmt->num_modifiers; i++) { + PL_TRACE(gpu, " modifiers[%d]: %s", + i, PRINT_DRM_MOD(fmt->modifiers[i])); + } + } +} + +pl_gpu pl_gpu_finalize(struct pl_gpu_t *gpu) +{ + // Sort formats + qsort(gpu->formats, gpu->num_formats, sizeof(pl_fmt), cmp_fmt); + + // Verification + pl_assert(gpu->limits.max_tex_2d_dim); + pl_assert(gpu->limits.max_variable_comps || gpu->limits.max_ubo_size); + pl_assert(gpu->limits.max_ubo_size <= gpu->limits.max_buf_size); + pl_assert(gpu->limits.max_ssbo_size <= gpu->limits.max_buf_size); + pl_assert(gpu->limits.max_vbo_size <= gpu->limits.max_buf_size); + pl_assert(gpu->limits.max_mapped_size <= gpu->limits.max_buf_size); + + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + pl_assert(fmt->name); + pl_assert(fmt->type); + pl_assert(fmt->num_components); + pl_assert(fmt->internal_size); + pl_assert(fmt->opaque ? !fmt->texel_size : fmt->texel_size); + pl_assert(!fmt->gatherable || (fmt->caps & PL_FMT_CAP_SAMPLEABLE)); + for (int i = 0; i < fmt->num_components; i++) { + pl_assert(fmt->component_depth[i]); + pl_assert(fmt->opaque ? 
!fmt->host_bits[i] : fmt->host_bits[i]); + } + for (int i = 0; i < fmt->num_planes; i++) + pl_assert(fmt->planes[i].format); + + enum pl_fmt_caps texel_caps = PL_FMT_CAP_VERTEX | + PL_FMT_CAP_TEXEL_UNIFORM | + PL_FMT_CAP_TEXEL_STORAGE; + + if (fmt->caps & texel_caps) { + pl_assert(fmt->glsl_type); + pl_assert(!fmt->opaque); + } + if (!fmt->opaque) { + pl_assert(fmt->texel_size && fmt->texel_align); + pl_assert((fmt->texel_size % fmt->texel_align) == 0); + pl_assert(fmt->internal_size == fmt->texel_size || fmt->emulated); + } else { + pl_assert(!fmt->texel_size && !fmt->texel_align); + pl_assert(!(fmt->caps & PL_FMT_CAP_HOST_READABLE)); + } + + // Assert uniqueness of name + for (int o = n + 1; o < gpu->num_formats; o++) + pl_assert(strcmp(fmt->name, gpu->formats[o]->name) != 0); + } + + // Print info + PL_INFO(gpu, "GPU information:"); + +#define LOG(fmt, field) \ + PL_INFO(gpu, " %-26s %" fmt, #field ":", gpu->LOG_STRUCT.field) + +#define LOG_STRUCT glsl + PL_INFO(gpu, " GLSL version: %d%s", gpu->glsl.version, + gpu->glsl.vulkan ? " (vulkan)" : gpu->glsl.gles ? " es" : ""); + if (gpu->glsl.compute) { + LOG("zu", max_shmem_size); + LOG(PRIu32, max_group_threads); + LOG(PRIu32, max_group_size[0]); + LOG(PRIu32, max_group_size[1]); + LOG(PRIu32, max_group_size[2]); + } + LOG(PRIu32, subgroup_size); + LOG(PRIi16, min_gather_offset); + LOG(PRIi16, max_gather_offset); +#undef LOG_STRUCT + +#define LOG_STRUCT limits + PL_INFO(gpu, " Limits:"); + // pl_gpu + LOG("d", thread_safe); + LOG("d", callbacks); + // pl_buf + LOG("zu", max_buf_size); + LOG("zu", max_ubo_size); + LOG("zu", max_ssbo_size); + LOG("zu", max_vbo_size); + LOG("zu", max_mapped_size); + LOG(PRIu64, max_buffer_texels); + LOG("zu", align_host_ptr); + LOG("d", host_cached); + // pl_tex + LOG(PRIu32, max_tex_1d_dim); + LOG(PRIu32, max_tex_2d_dim); + LOG(PRIu32, max_tex_3d_dim); + LOG("d", blittable_1d_3d); + LOG("d", buf_transfer); + LOG("zu", align_tex_xfer_pitch); + LOG("zu", align_tex_xfer_offset); + // pl_pass + LOG("zu", max_variable_comps); + LOG("zu", max_constants); + LOG("zu", max_pushc_size); + LOG("zu", align_vertex_stride); + if (gpu->glsl.compute) { + LOG(PRIu32, max_dispatch[0]); + LOG(PRIu32, max_dispatch[1]); + LOG(PRIu32, max_dispatch[2]); + } + LOG(PRIu32, fragment_queues); + LOG(PRIu32, compute_queues); +#undef LOG_STRUCT +#undef LOG + + if (pl_gpu_supports_interop(gpu)) { + PL_INFO(gpu, " External API interop:"); + + PL_INFO(gpu, " UUID: %s", PRINT_UUID(gpu->uuid)); + PL_INFO(gpu, " PCI: %04x:%02x:%02x:%x", + gpu->pci.domain, gpu->pci.bus, gpu->pci.device, gpu->pci.function); + PL_INFO(gpu, " buf export caps: 0x%x", + (unsigned int) gpu->export_caps.buf); + PL_INFO(gpu, " buf import caps: 0x%x", + (unsigned int) gpu->import_caps.buf); + PL_INFO(gpu, " tex export caps: 0x%x", + (unsigned int) gpu->export_caps.tex); + PL_INFO(gpu, " tex import caps: 0x%x", + (unsigned int) gpu->import_caps.tex); + PL_INFO(gpu, " sync export caps: 0x%x", + (unsigned int) gpu->export_caps.sync); + PL_INFO(gpu, " sync import caps: 0x%x", + (unsigned int) gpu->import_caps.sync); + } + + print_formats(gpu); + + // Finally, create a `pl_dispatch` object for internal operations + struct pl_gpu_fns *impl = PL_PRIV(gpu); + atomic_init(&impl->cache, NULL); + impl->dp = pl_dispatch_create(gpu->log, gpu); + return gpu; +} + +struct glsl_fmt { + enum pl_fmt_type type; + int num_components; + int depth[4]; + const char *glsl_format; +}; + +// List taken from the GLSL specification. 
(Yes, GLSL supports only exactly +// these formats with exactly these names) +static const struct glsl_fmt pl_glsl_fmts[] = { + {PL_FMT_FLOAT, 1, {16}, "r16f"}, + {PL_FMT_FLOAT, 1, {32}, "r32f"}, + {PL_FMT_FLOAT, 2, {16, 16}, "rg16f"}, + {PL_FMT_FLOAT, 2, {32, 32}, "rg32f"}, + {PL_FMT_FLOAT, 4, {16, 16, 16, 16}, "rgba16f"}, + {PL_FMT_FLOAT, 4, {32, 32, 32, 32}, "rgba32f"}, + {PL_FMT_FLOAT, 3, {11, 11, 10}, "r11f_g11f_b10f"}, + + {PL_FMT_UNORM, 1, {8}, "r8"}, + {PL_FMT_UNORM, 1, {16}, "r16"}, + {PL_FMT_UNORM, 2, {8, 8}, "rg8"}, + {PL_FMT_UNORM, 2, {16, 16}, "rg16"}, + {PL_FMT_UNORM, 4, {8, 8, 8, 8}, "rgba8"}, + {PL_FMT_UNORM, 4, {16, 16, 16, 16}, "rgba16"}, + {PL_FMT_UNORM, 4, {10, 10, 10, 2}, "rgb10_a2"}, + + {PL_FMT_SNORM, 1, {8}, "r8_snorm"}, + {PL_FMT_SNORM, 1, {16}, "r16_snorm"}, + {PL_FMT_SNORM, 2, {8, 8}, "rg8_snorm"}, + {PL_FMT_SNORM, 2, {16, 16}, "rg16_snorm"}, + {PL_FMT_SNORM, 4, {8, 8, 8, 8}, "rgba8_snorm"}, + {PL_FMT_SNORM, 4, {16, 16, 16, 16}, "rgba16_snorm"}, + + {PL_FMT_UINT, 1, {8}, "r8ui"}, + {PL_FMT_UINT, 1, {16}, "r16ui"}, + {PL_FMT_UINT, 1, {32}, "r32ui"}, + {PL_FMT_UINT, 2, {8, 8}, "rg8ui"}, + {PL_FMT_UINT, 2, {16, 16}, "rg16ui"}, + {PL_FMT_UINT, 2, {32, 32}, "rg32ui"}, + {PL_FMT_UINT, 4, {8, 8, 8, 8}, "rgba8ui"}, + {PL_FMT_UINT, 4, {16, 16, 16, 16}, "rgba16ui"}, + {PL_FMT_UINT, 4, {32, 32, 32, 32}, "rgba32ui"}, + {PL_FMT_UINT, 4, {10, 10, 10, 2}, "rgb10_a2ui"}, + + {PL_FMT_SINT, 1, {8}, "r8i"}, + {PL_FMT_SINT, 1, {16}, "r16i"}, + {PL_FMT_SINT, 1, {32}, "r32i"}, + {PL_FMT_SINT, 2, {8, 8}, "rg8i"}, + {PL_FMT_SINT, 2, {16, 16}, "rg16i"}, + {PL_FMT_SINT, 2, {32, 32}, "rg32i"}, + {PL_FMT_SINT, 4, {8, 8, 8, 8}, "rgba8i"}, + {PL_FMT_SINT, 4, {16, 16, 16, 16}, "rgba16i"}, + {PL_FMT_SINT, 4, {32, 32, 32, 32}, "rgba32i"}, +}; + +const char *pl_fmt_glsl_format(pl_fmt fmt, int components) +{ + if (fmt->opaque) + return NULL; + + for (int n = 0; n < PL_ARRAY_SIZE(pl_glsl_fmts); n++) { + const struct glsl_fmt *gfmt = &pl_glsl_fmts[n]; + + if (fmt->type != gfmt->type) + continue; + if (components != gfmt->num_components) + continue; + + // The component order is irrelevant, so we need to sort the depth + // based on the component's index + int depth[4] = {0}; + for (int i = 0; i < fmt->num_components; i++) + depth[fmt->sample_order[i]] = fmt->component_depth[i]; + + // Copy over any emulated components + for (int i = fmt->num_components; i < components; i++) + depth[i] = gfmt->depth[i]; + + for (int i = 0; i < PL_ARRAY_SIZE(depth); i++) { + if (depth[i] != gfmt->depth[i]) + goto next_fmt; + } + + return gfmt->glsl_format; + +next_fmt: ; // equivalent to `continue` + } + + return NULL; +} + +#define FOURCC(a,b,c,d) ((uint32_t)(a) | ((uint32_t)(b) << 8) | \ + ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) + +struct pl_fmt_fourcc { + const char *name; + uint32_t fourcc; +}; + +static const struct pl_fmt_fourcc pl_fmt_fourccs[] = { + // 8 bpp red + {"r8", FOURCC('R','8',' ',' ')}, + // 16 bpp red + {"r16", FOURCC('R','1','6',' ')}, + // 16 bpp rg + {"rg8", FOURCC('G','R','8','8')}, + {"gr8", FOURCC('R','G','8','8')}, + // 32 bpp rg + {"rg16", FOURCC('G','R','3','2')}, + {"gr16", FOURCC('R','G','3','2')}, + // 8 bpp rgb: N/A + // 16 bpp rgb + {"argb4", FOURCC('B','A','1','2')}, + {"abgr4", FOURCC('R','A','1','2')}, + {"rgba4", FOURCC('A','B','1','2')}, + {"bgra4", FOURCC('A','R','1','2')}, + + {"a1rgb5", FOURCC('B','A','1','5')}, + {"a1bgr5", FOURCC('R','A','1','5')}, + {"rgb5a1", FOURCC('A','B','1','5')}, + {"bgr5a1", FOURCC('A','R','1','5')}, + + {"rgb565", FOURCC('B','G','1','6')}, + 
{"bgr565", FOURCC('R','G','1','6')}, + // 24 bpp rgb + {"rgb8", FOURCC('B','G','2','4')}, + {"bgr8", FOURCC('R','G','2','4')}, + // 32 bpp rgb + {"argb8", FOURCC('B','A','2','4')}, + {"abgr8", FOURCC('R','A','2','4')}, + {"rgba8", FOURCC('A','B','2','4')}, + {"bgra8", FOURCC('A','R','2','4')}, + + {"a2rgb10", FOURCC('B','A','3','0')}, + {"a2bgr10", FOURCC('R','A','3','0')}, + {"rgb10a2", FOURCC('A','B','3','0')}, + {"bgr10a2", FOURCC('A','R','3','0')}, + // 64bpp rgb + {"rgba16hf", FOURCC('A','B','4','H')}, + {"bgra16hf", FOURCC('A','R','4','H')}, + + // packed 16-bit formats + // rx10: N/A + // rxgx10: N/A + {"rxgxbxax10", FOURCC('A','B','1','0')}, + // rx12: N/A + // rxgx12: N/A + // rxgxbxax12: N/A + + // planar formats + {"g8_b8_r8_420", FOURCC('Y','U','1','2')}, + {"g8_b8_r8_422", FOURCC('Y','U','1','6')}, + {"g8_b8_r8_444", FOURCC('Y','U','2','4')}, + // g16_b18_r8_*: N/A + // gx10_bx10_rx10_42*: N/A + {"gx10_bx10_rx10_444", FOURCC('Q','4','1','0')}, + // gx12_bx12_rx12_*:N/A + {"g8_br8_420", FOURCC('N','V','1','2')}, + {"g8_br8_422", FOURCC('N','V','1','6')}, + {"g8_br8_444", FOURCC('N','V','2','4')}, + {"g16_br16_420", FOURCC('P','0','1','6')}, + // g16_br16_422: N/A + // g16_br16_444: N/A + {"gx10_bxrx10_420", FOURCC('P','0','1','0')}, + {"gx10_bxrx10_422", FOURCC('P','2','1','0')}, + // gx10_bxrx10_444: N/A + {"gx12_bxrx12_420", FOURCC('P','0','1','2')}, + // gx12_bxrx12_422: N/A + // gx12_bxrx12_444: N/A +}; + +uint32_t pl_fmt_fourcc(pl_fmt fmt) +{ + for (int n = 0; n < PL_ARRAY_SIZE(pl_fmt_fourccs); n++) { + const struct pl_fmt_fourcc *fourcc = &pl_fmt_fourccs[n]; + if (strcmp(fmt->name, fourcc->name) == 0) + return fourcc->fourcc; + } + + return 0; // no matching format +} + +size_t pl_tex_transfer_size(const struct pl_tex_transfer_params *par) +{ + int w = pl_rect_w(par->rc), h = pl_rect_h(par->rc), d = pl_rect_d(par->rc); + size_t pixel_pitch = par->tex->params.format->texel_size; + + // This generates the absolute bare minimum size of a buffer required to + // hold the data of a texture upload/download, by including stride padding + // only where strictly necessary. + return (d - 1) * par->depth_pitch + (h - 1) * par->row_pitch + w * pixel_pitch; +} + +int pl_tex_transfer_slices(pl_gpu gpu, pl_fmt texel_fmt, + const struct pl_tex_transfer_params *params, + struct pl_tex_transfer_params **out_slices) +{ + PL_ARRAY(struct pl_tex_transfer_params) slices = {0}; + size_t max_size = params->buf ? 
gpu->limits.max_buf_size : SIZE_MAX; + + pl_fmt fmt = params->tex->params.format; + if (fmt->emulated && texel_fmt) { + size_t max_texel = gpu->limits.max_buffer_texels * texel_fmt->texel_size; + max_size = PL_MIN(gpu->limits.max_ssbo_size, max_texel); + } + + int slice_w = pl_rect_w(params->rc); + int slice_h = pl_rect_h(params->rc); + int slice_d = pl_rect_d(params->rc); + + slice_d = PL_MIN(slice_d, max_size / params->depth_pitch); + if (!slice_d) { + slice_d = 1; + slice_h = PL_MIN(slice_h, max_size / params->row_pitch); + if (!slice_h) { + slice_h = 1; + slice_w = PL_MIN(slice_w, max_size / fmt->texel_size); + pl_assert(slice_w); + } + } + + for (int z = 0; z < pl_rect_d(params->rc); z += slice_d) { + for (int y = 0; y < pl_rect_h(params->rc); y += slice_h) { + for (int x = 0; x < pl_rect_w(params->rc); x += slice_w) { + struct pl_tex_transfer_params slice = *params; + slice.callback = NULL; + slice.rc.x0 = params->rc.x0 + x; + slice.rc.y0 = params->rc.y0 + y; + slice.rc.z0 = params->rc.z0 + z; + slice.rc.x1 = PL_MIN(slice.rc.x0 + slice_w, params->rc.x1); + slice.rc.y1 = PL_MIN(slice.rc.y0 + slice_h, params->rc.y1); + slice.rc.z1 = PL_MIN(slice.rc.z0 + slice_d, params->rc.z1); + + const size_t offset = z * params->depth_pitch + + y * params->row_pitch + + x * fmt->texel_size; + if (slice.ptr) { + slice.ptr = (uint8_t *) slice.ptr + offset; + } else { + slice.buf_offset += offset; + } + + PL_ARRAY_APPEND(NULL, slices, slice); + } + } + } + + *out_slices = slices.elem; + return slices.num; +} + +bool pl_tex_upload_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + if (params->buf) + return pl_tex_upload(gpu, params); + + struct pl_buf_params bufparams = { + .size = pl_tex_transfer_size(params), + .debug_tag = PL_DEBUG_TAG, + }; + + struct pl_tex_transfer_params fixed = *params; + fixed.ptr = NULL; + + // If we can import host pointers directly, and the function is being used + // asynchronously, then we can use host pointer import to skip a memcpy. In + // the synchronous case, we still force a host memcpy to avoid stalling the + // host until the GPU memcpy completes. + bool can_import = gpu->import_caps.buf & PL_HANDLE_HOST_PTR; + can_import &= !params->no_import; + can_import &= params->callback != NULL; + can_import &= bufparams.size > (32 << 10); // 32 KiB + if (can_import) { + bufparams.import_handle = PL_HANDLE_HOST_PTR; + bufparams.shared_mem = (struct pl_shared_mem) { + .handle.ptr = params->ptr, + .size = bufparams.size, + .offset = 0, + }; + + // Suppress errors for this test because it may fail, in which case we + // want to silently fall back. 
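// (The log level cap below only covers the speculative pl_buf_create call
// and is cleared again immediately afterwards.)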
+ pl_log_level_cap(gpu->log, PL_LOG_DEBUG); + fixed.buf = pl_buf_create(gpu, &bufparams); + pl_log_level_cap(gpu->log, PL_LOG_NONE); + } + + if (!fixed.buf) { + bufparams.import_handle = 0; + bufparams.host_writable = true; + fixed.buf = pl_buf_create(gpu, &bufparams); + if (!fixed.buf) + return false; + pl_buf_write(gpu, fixed.buf, 0, params->ptr, bufparams.size); + if (params->callback) + params->callback(params->priv); + fixed.callback = NULL; + } + + bool ok = pl_tex_upload(gpu, &fixed); + pl_buf_destroy(gpu, &fixed.buf); + return ok; +} + +struct pbo_cb_ctx { + pl_gpu gpu; + pl_buf buf; + void *ptr; + void (*callback)(void *priv); + void *priv; +}; + +static void pbo_download_cb(void *priv) +{ + struct pbo_cb_ctx *p = priv; + pl_buf_read(p->gpu, p->buf, 0, p->ptr, p->buf->params.size); + pl_buf_destroy(p->gpu, &p->buf); + + // Run the original callback + p->callback(p->priv); + pl_free(priv); +}; + +bool pl_tex_download_pbo(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + if (params->buf) + return pl_tex_download(gpu, params); + + pl_buf buf = NULL; + struct pl_buf_params bufparams = { + .size = pl_tex_transfer_size(params), + .debug_tag = PL_DEBUG_TAG, + }; + + // If we can import host pointers directly, we can avoid an extra memcpy + // (sometimes). In the cases where it isn't avoidable, the extra memcpy + // will happen inside VRAM, which is typically faster anyway. + bool can_import = gpu->import_caps.buf & PL_HANDLE_HOST_PTR; + can_import &= !params->no_import; + can_import &= bufparams.size > (32 << 10); // 32 KiB + if (can_import) { + bufparams.import_handle = PL_HANDLE_HOST_PTR; + bufparams.shared_mem = (struct pl_shared_mem) { + .handle.ptr = params->ptr, + .size = bufparams.size, + .offset = 0, + }; + + // Suppress errors for this test because it may fail, in which case we + // want to silently fall back. + pl_log_level_cap(gpu->log, PL_LOG_DEBUG); + buf = pl_buf_create(gpu, &bufparams); + pl_log_level_cap(gpu->log, PL_LOG_NONE); + } + + if (!buf) { + // Fallback when host pointer import is not supported + bufparams.import_handle = 0; + bufparams.host_readable = true; + buf = pl_buf_create(gpu, &bufparams); + } + + if (!buf) + return false; + + struct pl_tex_transfer_params newparams = *params; + newparams.ptr = NULL; + newparams.buf = buf; + + // If the transfer is asynchronous, propagate our host read asynchronously + if (params->callback && !bufparams.import_handle) { + newparams.callback = pbo_download_cb; + newparams.priv = pl_alloc_struct(NULL, struct pbo_cb_ctx, { + .gpu = gpu, + .buf = buf, + .ptr = params->ptr, + .callback = params->callback, + .priv = params->priv, + }); + } + + if (!pl_tex_download(gpu, &newparams)) { + pl_buf_destroy(gpu, &buf); + return false; + } + + if (!params->callback) { + while (pl_buf_poll(gpu, buf, 10000000)) // 10 ms + PL_TRACE(gpu, "pl_tex_download: synchronous/blocking (slow path)"); + } + + bool ok; + if (bufparams.import_handle) { + // Buffer download completion already means the host pointer contains + // the valid data, no more need to copy. 
(Note: this applies even for + // asynchronous downloads) + ok = true; + pl_buf_destroy(gpu, &buf); + } else if (!params->callback) { + // Synchronous read back to the host pointer + ok = pl_buf_read(gpu, buf, 0, params->ptr, bufparams.size); + pl_buf_destroy(gpu, &buf); + } else { + // Nothing left to do here, the rest will be done by pbo_download_cb + ok = true; + } + + return ok; +} + +bool pl_tex_upload_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + const int threads = PL_MIN(256, pl_rect_w(params->rc)); + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_require(gpu, params->buf); + + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, threads, 1, false, 0)) { + PL_ERR(gpu, "Failed emulating texture transfer!"); + pl_dispatch_abort(dp, &sh); + return false; + } + + ident_t buf = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->buf, + .desc = { + .name = "data", + .type = PL_DESC_BUF_TEXEL_STORAGE, + }, + }); + + ident_t img = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->tex, + .desc = { + .name = "image", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + }); + + // If the transfer width is a natural multiple of the thread size, we + // can skip the bounds check. Otherwise, make sure we aren't blitting out + // of the range since this would read out of bounds. + int groups_x = PL_DIV_UP(pl_rect_w(params->rc), threads); + if (groups_x * threads != pl_rect_w(params->rc)) { + GLSL("if (gl_GlobalInvocationID.x >= %d) \n" + " return; \n", + pl_rect_w(params->rc)); + } + + // fmt->texel_align contains the size of an individual color value + assert(fmt->texel_size == fmt->num_components * fmt->texel_align); + GLSL("vec4 color = vec4(0.0, 0.0, 0.0, 1.0); \n" + "ivec3 pos = ivec3(gl_GlobalInvocationID); \n" + "ivec3 tex_pos = pos + ivec3("$", "$", "$"); \n" + "int base = "$" + pos.z * "$" + pos.y * "$" + pos.x * "$"; \n", + SH_INT_DYN(params->rc.x0), SH_INT_DYN(params->rc.y0), SH_INT_DYN(params->rc.z0), + SH_INT_DYN(params->buf_offset), + SH_INT(params->depth_pitch / fmt->texel_align), + SH_INT(params->row_pitch / fmt->texel_align), + SH_INT(fmt->texel_size / fmt->texel_align)); + + for (int i = 0; i < fmt->num_components; i++) + GLSL("color[%d] = imageLoad("$", base + %d).r; \n", i, buf, i); + + int dims = pl_tex_params_dimension(tex->params); + static const char *coord_types[] = { + [1] = "int", + [2] = "ivec2", + [3] = "ivec3", + }; + + GLSL("imageStore("$", %s(tex_pos), color);\n", img, coord_types[dims]); + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + groups_x, + pl_rect_h(params->rc), + pl_rect_d(params->rc), + }, + )); + +error: + return false; +} + +bool pl_tex_download_texel(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + const int threads = PL_MIN(256, pl_rect_w(params->rc)); + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_require(gpu, params->buf); + + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, threads, 1, false, 0)) { + PL_ERR(gpu, "Failed emulating texture transfer!"); + pl_dispatch_abort(dp, &sh); + return false; + } + + ident_t buf = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->buf, + .desc = { + .name = "data", + .type = PL_DESC_BUF_TEXEL_STORAGE, + }, + }); + + ident_t img = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->tex, + .desc = { + 
.name = "image", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_READONLY, + }, + }); + + int groups_x = PL_DIV_UP(pl_rect_w(params->rc), threads); + if (groups_x * threads != pl_rect_w(params->rc)) { + GLSL("if (gl_GlobalInvocationID.x >= %d) \n" + " return; \n", + pl_rect_w(params->rc)); + } + + int dims = pl_tex_params_dimension(tex->params); + static const char *coord_types[] = { + [1] = "int", + [2] = "ivec2", + [3] = "ivec3", + }; + + assert(fmt->texel_size == fmt->num_components * fmt->texel_align); + GLSL("ivec3 pos = ivec3(gl_GlobalInvocationID); \n" + "ivec3 tex_pos = pos + ivec3("$", "$", "$"); \n" + "int base = "$" + pos.z * "$" + pos.y * "$" + pos.x * "$"; \n" + "vec4 color = imageLoad("$", %s(tex_pos)); \n", + SH_INT_DYN(params->rc.x0), SH_INT_DYN(params->rc.y0), SH_INT_DYN(params->rc.z0), + SH_INT_DYN(params->buf_offset), + SH_INT(params->depth_pitch / fmt->texel_align), + SH_INT(params->row_pitch / fmt->texel_align), + SH_INT(fmt->texel_size / fmt->texel_align), + img, coord_types[dims]); + + for (int i = 0; i < fmt->num_components; i++) + GLSL("imageStore("$", base + %d, vec4(color[%d])); \n", buf, i, i); + + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + groups_x, + pl_rect_h(params->rc), + pl_rect_d(params->rc), + }, + )); + +error: + return false; +} + +bool pl_tex_blit_compute(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + if (!params->dst->params.storable) + return false; + + // Normalize `dst_rc`, moving all flipping to `src_rc` instead. + pl_rect3d src_rc = params->src_rc; + pl_rect3d dst_rc = params->dst_rc; + if (pl_rect_w(dst_rc) < 0) { + PL_SWAP(src_rc.x0, src_rc.x1); + PL_SWAP(dst_rc.x0, dst_rc.x1); + } + if (pl_rect_h(dst_rc) < 0) { + PL_SWAP(src_rc.y0, src_rc.y1); + PL_SWAP(dst_rc.y0, dst_rc.y1); + } + if (pl_rect_d(dst_rc) < 0) { + PL_SWAP(src_rc.z0, src_rc.z1); + PL_SWAP(dst_rc.z0, dst_rc.z1); + } + + bool needs_scaling = false; + needs_scaling |= pl_rect_w(dst_rc) != abs(pl_rect_w(src_rc)); + needs_scaling |= pl_rect_h(dst_rc) != abs(pl_rect_h(src_rc)); + needs_scaling |= pl_rect_d(dst_rc) != abs(pl_rect_d(src_rc)); + + // Exception: fast path for 1-pixel blits, which don't require scaling + bool is_1pixel = abs(pl_rect_w(src_rc)) == 1 && abs(pl_rect_h(src_rc)) == 1; + needs_scaling &= !is_1pixel; + + // Manual trilinear interpolation would be too slow to justify + bool needs_sampling = needs_scaling && params->sample_mode != PL_TEX_SAMPLE_NEAREST; + needs_sampling |= !params->src->params.storable; + if (needs_sampling && !params->src->params.sampleable) + return false; + + const int threads = 256; + int bw = PL_MIN(32, pl_rect_w(dst_rc)); + int bh = PL_MIN(threads / bw, pl_rect_h(dst_rc)); + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, bw, bh, false, 0)) { + pl_dispatch_abort(dp, &sh); + return false; + } + + // Avoid over-writing into `dst` + int groups_x = PL_DIV_UP(pl_rect_w(dst_rc), bw); + if (groups_x * bw != pl_rect_w(dst_rc)) { + GLSL("if (gl_GlobalInvocationID.x >= %d) \n" + " return; \n", + pl_rect_w(dst_rc)); + } + + int groups_y = PL_DIV_UP(pl_rect_h(dst_rc), bh); + if (groups_y * bh != pl_rect_h(dst_rc)) { + GLSL("if (gl_GlobalInvocationID.y >= %d) \n" + " return; \n", + pl_rect_h(dst_rc)); + } + + ident_t dst = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->dst, + .desc = { + .name = "dst", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + }); + + static const char 
*vecs[] = { + [1] = "float", + [2] = "vec2", + [3] = "vec3", + [4] = "vec4", + }; + + static const char *ivecs[] = { + [1] = "int", + [2] = "ivec2", + [3] = "ivec3", + [4] = "ivec4", + }; + + int src_dims = pl_tex_params_dimension(params->src->params); + int dst_dims = pl_tex_params_dimension(params->dst->params); + GLSL("ivec3 pos = ivec3(gl_GlobalInvocationID); \n" + "%s dst_pos = %s(pos + ivec3(%d, %d, %d)); \n", + ivecs[dst_dims], ivecs[dst_dims], + params->dst_rc.x0, params->dst_rc.y0, params->dst_rc.z0); + + if (needs_sampling || (needs_scaling && params->src->params.sampleable)) { + + ident_t src = sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = "src", + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = params->src, + .address_mode = PL_TEX_ADDRESS_CLAMP, + .sample_mode = params->sample_mode, + } + }); + + if (is_1pixel) { + GLSL("%s fpos = %s(0.5); \n", vecs[src_dims], vecs[src_dims]); + } else { + GLSL("vec3 fpos = (vec3(pos) + vec3(0.5)) / vec3(%d.0, %d.0, %d.0); \n", + pl_rect_w(dst_rc), pl_rect_h(dst_rc), pl_rect_d(dst_rc)); + } + + GLSL("%s src_pos = %s(0.5); \n" + "src_pos.x = mix(%f, %f, fpos.x); \n", + vecs[src_dims], vecs[src_dims], + (float) src_rc.x0 / params->src->params.w, + (float) src_rc.x1 / params->src->params.w); + + if (params->src->params.h) { + GLSL("src_pos.y = mix(%f, %f, fpos.y); \n", + (float) src_rc.y0 / params->src->params.h, + (float) src_rc.y1 / params->src->params.h); + } + + if (params->src->params.d) { + GLSL("src_pos.z = mix(%f, %f, fpos.z); \n", + (float) src_rc.z0 / params->src->params.d, + (float) src_rc.z1 / params->src->params.d); + } + + GLSL("imageStore("$", dst_pos, textureLod("$", src_pos, 0.0)); \n", + dst, src); + + } else { + + ident_t src = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->src, + .desc = { + .name = "src", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_READONLY, + }, + }); + + if (is_1pixel) { + GLSL("ivec3 src_pos = ivec3(0); \n"); + } else if (needs_scaling) { + GLSL("ivec3 src_pos = ivec3(vec3(%f, %f, %f) * vec3(pos)); \n", + fabs((float) pl_rect_w(src_rc) / pl_rect_w(dst_rc)), + fabs((float) pl_rect_h(src_rc) / pl_rect_h(dst_rc)), + fabs((float) pl_rect_d(src_rc) / pl_rect_d(dst_rc))); + } else { + GLSL("ivec3 src_pos = pos; \n"); + } + + GLSL("src_pos = ivec3(%d, %d, %d) * src_pos + ivec3(%d, %d, %d); \n" + "imageStore("$", dst_pos, imageLoad("$", %s(src_pos))); \n", + src_rc.x1 < src_rc.x0 ? -1 : 1, + src_rc.y1 < src_rc.y0 ? -1 : 1, + src_rc.z1 < src_rc.z0 ? 
-1 : 1, + src_rc.x0, src_rc.y0, src_rc.z0, + dst, src, ivecs[src_dims]); + + } + + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + groups_x, + groups_y, + pl_rect_d(dst_rc), + }, + )); +} + +void pl_tex_blit_raster(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + enum pl_fmt_type src_type = params->src->params.format->type; + enum pl_fmt_type dst_type = params->dst->params.format->type; + + // Only for 2D textures + pl_assert(params->src->params.h && !params->src->params.d); + pl_assert(params->dst->params.h && !params->dst->params.d); + + // Integer textures are not supported + pl_assert(src_type != PL_FMT_UINT && src_type != PL_FMT_SINT); + pl_assert(dst_type != PL_FMT_UINT && dst_type != PL_FMT_SINT); + + pl_rect2df src_rc = { + .x0 = params->src_rc.x0, .x1 = params->src_rc.x1, + .y0 = params->src_rc.y0, .y1 = params->src_rc.y1, + }; + pl_rect2d dst_rc = { + .x0 = params->dst_rc.x0, .x1 = params->dst_rc.x1, + .y0 = params->dst_rc.y0, .y1 = params->dst_rc.y1, + }; + + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + sh->output = PL_SHADER_SIG_COLOR; + + ident_t pos, src = sh_bind(sh, params->src, PL_TEX_ADDRESS_CLAMP, + params->sample_mode, "src_tex", &src_rc, &pos, NULL); + + GLSL("vec4 color = textureLod("$", "$", 0.0); \n", src, pos); + + pl_dispatch_finish(dp, pl_dispatch_params( + .shader = &sh, + .target = params->dst, + .rect = dst_rc, + )); +} + +bool pl_buf_copy_swap(pl_gpu gpu, const struct pl_buf_copy_swap_params *params) +{ + pl_buf src = params->src, dst = params->dst; + pl_require(gpu, src->params.storable && dst->params.storable); + pl_require(gpu, params->src_offset % sizeof(unsigned) == 0); + pl_require(gpu, params->dst_offset % sizeof(unsigned) == 0); + pl_require(gpu, params->src_offset + params->size <= src->params.size); + pl_require(gpu, params->dst_offset + params->size <= dst->params.size); + pl_require(gpu, src != dst || params->src_offset == params->dst_offset); + pl_require(gpu, params->size % sizeof(unsigned) == 0); + pl_require(gpu, params->wordsize == sizeof(uint16_t) || + params->wordsize == sizeof(uint32_t)); + + const size_t words = params->size / sizeof(unsigned); + const size_t src_off = params->src_offset / sizeof(unsigned); + const size_t dst_off = params->dst_offset / sizeof(unsigned); + + const int threads = PL_MIN(256, words); + pl_dispatch dp = pl_gpu_dispatch(gpu); + pl_shader sh = pl_dispatch_begin(dp); + if (!sh_try_compute(sh, threads, 1, false, 0)) { + pl_dispatch_abort(dp, &sh); + return false; + } + + const size_t groups = PL_DIV_UP(words, threads); + if (groups * threads > words) { + GLSL("if (gl_GlobalInvocationID.x >= %zu) \n" + " return; \n", + words); + } + + sh_desc(sh, (struct pl_shader_desc) { + .binding.object = src, + .desc = { + .name = "SrcBuf", + .type = PL_DESC_BUF_STORAGE, + .access = src == dst ? 
PL_DESC_ACCESS_READWRITE : PL_DESC_ACCESS_READONLY, + }, + .num_buffer_vars = 1, + .buffer_vars = &(struct pl_buffer_var) { + .var = { + .name = "src", + .type = PL_VAR_UINT, + .dim_v = 1, + .dim_m = 1, + .dim_a = src_off + words, + }, + }, + }); + + if (src != dst) { + sh_desc(sh, (struct pl_shader_desc) { + .binding.object = dst, + .desc = { + .name = "DstBuf", + .type = PL_DESC_BUF_STORAGE, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + .num_buffer_vars = 1, + .buffer_vars = &(struct pl_buffer_var) { + .var = { + .name = "dst", + .type = PL_VAR_UINT, + .dim_v = 1, + .dim_m = 1, + .dim_a = dst_off + words, + }, + }, + }); + } else { + GLSL("#define dst src \n"); + } + + GLSL("// pl_buf_copy_swap \n" + "{ \n" + "uint word = src["$" + gl_GlobalInvocationID.x]; \n" + "word = (word & 0xFF00FF00u) >> 8 | \n" + " (word & 0x00FF00FFu) << 8; \n", + SH_UINT(src_off)); + if (params->wordsize > 2) { + GLSL("word = (word & 0xFFFF0000u) >> 16 | \n" + " (word & 0x0000FFFFu) << 16; \n"); + } + GLSL("dst["$" + gl_GlobalInvocationID.x] = word; \n" + "} \n", + SH_UINT(dst_off)); + + return pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = {groups, 1, 1}, + )); + +error: + if (src->params.debug_tag || dst->params.debug_tag) { + PL_ERR(gpu, " for buffers: src %s, dst %s", + src->params.debug_tag, dst->params.debug_tag); + } + return false; +} + +void pl_pass_run_vbo(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + if (!params->vertex_data && !params->index_data) + return pl_pass_run(gpu, params); + + struct pl_pass_run_params newparams = *params; + pl_buf vert = NULL, index = NULL; + + if (params->vertex_data) { + vert = pl_buf_create(gpu, pl_buf_params( + .size = pl_vertex_buf_size(params), + .initial_data = params->vertex_data, + .drawable = true, + )); + + if (!vert) { + PL_ERR(gpu, "Failed allocating vertex buffer!"); + return; + } + + newparams.vertex_buf = vert; + newparams.vertex_data = NULL; + } + + if (params->index_data) { + index = pl_buf_create(gpu, pl_buf_params( + .size = pl_index_buf_size(params), + .initial_data = params->index_data, + .drawable = true, + )); + + if (!index) { + PL_ERR(gpu, "Failed allocating index buffer!"); + return; + } + + newparams.index_buf = index; + newparams.index_data = NULL; + } + + pl_pass_run(gpu, &newparams); + pl_buf_destroy(gpu, &vert); + pl_buf_destroy(gpu, &index); +} + +struct pl_pass_params pl_pass_params_copy(void *alloc, const struct pl_pass_params *params) +{ + struct pl_pass_params new = *params; + + new.glsl_shader = pl_str0dup0(alloc, new.glsl_shader); + new.vertex_shader = pl_str0dup0(alloc, new.vertex_shader); + if (new.blend_params) + new.blend_params = pl_memdup_ptr(alloc, new.blend_params); + +#define DUPNAMES(field) \ + do { \ + size_t _size = new.num_##field * sizeof(new.field[0]); \ + new.field = pl_memdup(alloc, new.field, _size); \ + for (int j = 0; j < new.num_##field; j++) \ + new.field[j].name = pl_str0dup0(alloc, new.field[j].name); \ + } while (0) + + DUPNAMES(variables); + DUPNAMES(descriptors); + DUPNAMES(vertex_attribs); + +#undef DUPNAMES + + new.constant_data = NULL; + new.constants = pl_memdup(alloc, new.constants, + new.num_constants * sizeof(new.constants[0])); + + return new; +} + +size_t pl_vertex_buf_size(const struct pl_pass_run_params *params) +{ + if (!params->index_data) + return params->vertex_count * params->pass->params.vertex_stride; + + int num_vertices = 0; + const void *idx = params->index_data; + switch (params->index_fmt) { + case PL_INDEX_UINT16: + for (int i = 0; i 
< params->vertex_count; i++) + num_vertices = PL_MAX(num_vertices, ((const uint16_t *) idx)[i]); + break; + case PL_INDEX_UINT32: + for (int i = 0; i < params->vertex_count; i++) + num_vertices = PL_MAX(num_vertices, ((const uint32_t *) idx)[i]); + break; + case PL_INDEX_FORMAT_COUNT: pl_unreachable(); + } + + return (num_vertices + 1) * params->pass->params.vertex_stride; +} + +const char *print_uuid(char buf[3 * UUID_SIZE], const uint8_t uuid[UUID_SIZE]) +{ + static const char *hexdigits = "0123456789ABCDEF"; + for (int i = 0; i < UUID_SIZE; i++) { + uint8_t x = uuid[i]; + buf[3 * i + 0] = hexdigits[x >> 4]; + buf[3 * i + 1] = hexdigits[x & 0xF]; + buf[3 * i + 2] = i == UUID_SIZE - 1 ? '\0' : ':'; + } + + return buf; +} + +const char *print_drm_mod(char buf[DRM_MOD_SIZE], uint64_t mod) +{ + switch (mod) { + case DRM_FORMAT_MOD_LINEAR: return "LINEAR"; + case DRM_FORMAT_MOD_INVALID: return "INVALID"; + } + + uint8_t vendor = mod >> 56; + uint64_t val = mod & ((1ULL << 56) - 1); + + const char *name = NULL; + switch (vendor) { + case 0x00: name = "NONE"; break; + case 0x01: name = "INTEL"; break; + case 0x02: name = "AMD"; break; + case 0x03: name = "NVIDIA"; break; + case 0x04: name = "SAMSUNG"; break; + case 0x08: name = "ARM"; break; + } + + if (name) { + snprintf(buf, DRM_MOD_SIZE, "%s 0x%"PRIx64, name, val); + } else { + snprintf(buf, DRM_MOD_SIZE, "0x%02x 0x%"PRIx64, vendor, val); + } + + return buf; +} diff --git a/src/hash.h b/src/hash.h new file mode 100644 index 0000000..2513919 --- /dev/null +++ b/src/hash.h @@ -0,0 +1,162 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +#define GOLDEN_RATIO_64 UINT64_C(0x9e3779b97f4a7c15) + +static inline void pl_hash_merge(uint64_t *accum, uint64_t hash) { + *accum ^= hash + GOLDEN_RATIO_64 + (*accum << 6) + (*accum >> 2); +} + +static inline uint64_t pl_mem_hash(const void *mem, size_t size); +#define pl_var_hash(x) pl_mem_hash(&(x), sizeof(x)) + +static inline uint64_t pl_str_hash(pl_str str) +{ + return pl_mem_hash(str.buf, str.len); +} + +static inline uint64_t pl_str0_hash(const char *str) +{ + return pl_mem_hash(str, str ? strlen(str) : 0); +} + +#ifdef PL_HAVE_XXHASH + +#define XXH_NAMESPACE pl_ +#define XXH_INLINE_ALL +#define XXH_NO_STREAM +#include <xxhash.h> + +XXH_FORCE_INLINE uint64_t pl_mem_hash(const void *mem, size_t size) +{ + return XXH3_64bits(mem, size); +} + +#else // !PL_HAVE_XXHASH + +/* + SipHash reference C implementation + Modified for use by libplacebo: + - Hard-coded a fixed key (k0 and k1) + - Hard-coded the output size to 64 bits + - Return the result vector directly + + Copyright (c) 2012-2016 Jean-Philippe Aumasson + <jeanphilippe.aumasson@gmail.com> + Copyright (c) 2012-2014 Daniel J. 
Bernstein <djb@cr.yp.to> + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + <http://creativecommons.org/publicdomain/zero/1.0/>. + */ + +/* default: SipHash-2-4 */ +#define cROUNDS 2 +#define dROUNDS 4 + +#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b)))) + +#define U8TO64_LE(p) \ + (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \ + ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \ + ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \ + ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56)) + +#define SIPROUND \ + do { \ + v0 += v1; \ + v1 = ROTL(v1, 13); \ + v1 ^= v0; \ + v0 = ROTL(v0, 32); \ + v2 += v3; \ + v3 = ROTL(v3, 16); \ + v3 ^= v2; \ + v0 += v3; \ + v3 = ROTL(v3, 21); \ + v3 ^= v0; \ + v2 += v1; \ + v1 = ROTL(v1, 17); \ + v1 ^= v2; \ + v2 = ROTL(v2, 32); \ + } while (0) + +static inline uint64_t pl_mem_hash(const void *mem, size_t size) +{ + if (!size) + return 0x8533321381b8254bULL; + + uint64_t v0 = 0x736f6d6570736575ULL; + uint64_t v1 = 0x646f72616e646f6dULL; + uint64_t v2 = 0x6c7967656e657261ULL; + uint64_t v3 = 0x7465646279746573ULL; + uint64_t k0 = 0xfe9f075098ddb0faULL; + uint64_t k1 = 0x68f7f03510e5285cULL; + uint64_t m; + int i; + const uint8_t *buf = mem; + const uint8_t *end = buf + size - (size % sizeof(uint64_t)); + const int left = size & 7; + uint64_t b = ((uint64_t) size) << 56; + v3 ^= k1; + v2 ^= k0; + v1 ^= k1; + v0 ^= k0; + + for (; buf != end; buf += 8) { + m = U8TO64_LE(buf); + v3 ^= m; + + for (i = 0; i < cROUNDS; ++i) + SIPROUND; + + v0 ^= m; + } + + switch (left) { + case 7: b |= ((uint64_t) buf[6]) << 48; // fall through + case 6: b |= ((uint64_t) buf[5]) << 40; // fall through + case 5: b |= ((uint64_t) buf[4]) << 32; // fall through + case 4: b |= ((uint64_t) buf[3]) << 24; // fall through + case 3: b |= ((uint64_t) buf[2]) << 16; // fall through + case 2: b |= ((uint64_t) buf[1]) << 8; // fall through + case 1: b |= ((uint64_t) buf[0]); break; + case 0: break; + } + + v3 ^= b; + + for (i = 0; i < cROUNDS; ++i) + SIPROUND; + + v0 ^= b; + + v2 ^= 0xff; + + for (i = 0; i < dROUNDS; ++i) + SIPROUND; + + b = v0 ^ v1 ^ v2 ^ v3; + return b; +} + +#endif // PL_HAVE_XXHASH diff --git a/src/include/libplacebo/cache.h b/src/include/libplacebo/cache.h new file mode 100644 index 0000000..5897ac8 --- /dev/null +++ b/src/include/libplacebo/cache.h @@ -0,0 +1,200 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef LIBPLACEBO_CACHE_H_ +#define LIBPLACEBO_CACHE_H_ + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + +#include <libplacebo/config.h> +#include <libplacebo/common.h> +#include <libplacebo/log.h> + +PL_API_BEGIN + +typedef struct pl_cache_obj { + // Cache object key. This will uniquely identify this cached object. + uint64_t key; + + // Cache data pointer and length. 0-length cached objects are invalid + // and will be silently dropped. You can explicitly remove a cached + // object by overwriting it with a length 0 object. + void *data; + size_t size; + + // Free callback, to free memory associated with `data`. (Optional) + // Will be called when the object is either explicitly deleted, culled + // due to hitting size limits, or on pl_cache_destroy(). + void (*free)(void *data); +} pl_cache_obj; + +struct pl_cache_params { + // Optional `pl_log` that is used for logging internal events related + // to the cache, such as insertions, saving and loading. + pl_log log; + + // Size limits. If 0, no limit is imposed. + // + // Note: libplacebo will never detect or invalidate stale cache entries, so + // setting an upper size limit is strongly recommended + size_t max_object_size; + size_t max_total_size; + + // Optional external callback to call after a cached object is modified + // (including deletion and (re-)insertion). Note that this is not called on + // objects which are merely pruned from the cache due to `max_total_size`, + // so users must rely on some external mechanism to prune stale entries or + // enforce size limits. + // + // Note: `pl_cache_load` does not trigger this callback. + // Note: Ownership of `obj` does *not* pass to the caller. + // Note: This function must be thread safe. + void (*set)(void *priv, pl_cache_obj obj); + + // Optional external callback to call on a cache miss. Ownership of the + // returned object passes to the `pl_cache`. Objects returned by this + // callback *should* have a valid `free` callback, unless lifetime can be + // externally managed and guaranteed to outlive the `pl_cache`. + // + // Note: This function must be thread safe. + pl_cache_obj (*get)(void *priv, uint64_t key); + + // External context for insert/lookup. + void *priv; +}; + +#define pl_cache_params(...) (&(struct pl_cache_params) { __VA_ARGS__ }) +PL_API extern const struct pl_cache_params pl_cache_default_params; + +// Thread-safety: Safe +// +// Note: In any context in which `pl_cache` is used, users may also pass NULL +// to disable caching. In other words, NULL is a valid `pl_cache`. +typedef const struct pl_cache_t { + struct pl_cache_params params; +} *pl_cache; + +// Create a new cache. This function will never fail. +PL_API pl_cache pl_cache_create(const struct pl_cache_params *params); + +// Destroy a `pl_cache` object, including all underlying objects. +PL_API void pl_cache_destroy(pl_cache *cache); + +// Explicitly clear all objects in the cache without destroying it. This is +// similar to `pl_cache_destroy`, but the cache remains valid afterwards. +// +// Note: Objects destroyed in this way *not* propagated to the `set` callback. +PL_API void pl_cache_reset(pl_cache cache); + +// Return the current internal number of objects and total size (bytes) +PL_API int pl_cache_objects(pl_cache cache); +PL_API size_t pl_cache_size(pl_cache cache); + +// --- Cache saving and loading APIs + +// Serialize the internal state of a `pl_cache` into an abstract cache +// object that can be e.g. saved to disk and loaded again later. 
Returns the +// number of objects saved. +// +// Note: Using `save/load` is largely redundant with using `insert/lookup` +// callbacks, and the user should decide whether to use the explicit API or the +// callback-based API. +PL_API int pl_cache_save_ex(pl_cache cache, + void (*write)(void *priv, size_t size, const void *ptr), + void *priv); + +// Load the result of a previous `pl_cache_save` call. Any duplicate entries in +// the `pl_cache` will be overwritten. Returns the number of objects loaded, or +// a negative number on serious error (e.g. corrupt header) +// +// Note: This does not trigger the `update` callback. +PL_API int pl_cache_load_ex(pl_cache cache, + bool (*read)(void *priv, size_t size, void *ptr), + void *priv); + +// --- Convenience wrappers around pl_cache_save/load_ex + +// Writes data directly to a pointer. Returns the number of bytes that *would* +// have been written, so this can be used on a size 0 buffer to get the required +// total size. +PL_API size_t pl_cache_save(pl_cache cache, uint8_t *data, size_t size); + +// Reads data directly from a pointer. This still reads from `data`, so it does +// not avoid a copy. +PL_API int pl_cache_load(pl_cache cache, const uint8_t *data, size_t size); + +// Writes/loads data to/from a FILE stream at the current position. +#define pl_cache_save_file(c, file) pl_cache_save_ex(c, pl_write_file_cb, file) +#define pl_cache_load_file(c, file) pl_cache_load_ex(c, pl_read_file_cb, file) + +static inline void pl_write_file_cb(void *priv, size_t size, const void *ptr) +{ + (void) fwrite(ptr, 1, size, (FILE *) priv); +} + +static inline bool pl_read_file_cb(void *priv, size_t size, void *ptr) +{ + return fread(ptr, 1, size, (FILE *) priv) == size; +} + +// --- Object modification API. Mostly intended for internal use. + +// Insert a new cached object into a `pl_cache`. Returns whether successful. +// Overwrites any existing cached object with that signature, so this can be +// used to e.g. delete objects as well (set their size to 0). On success, +// ownership of `obj` passes to the `pl_cache`. +// +// Note: If `object.free` is NULL, this will perform an internal memdup. To +// bypass this (e.g. when directly adding externally managed memory), you can +// set the `free` callback to an explicit noop function. +// +// Note: `obj->data/free` will be reset to NULL on successful insertion. +PL_API bool pl_cache_try_set(pl_cache cache, pl_cache_obj *obj); + +// Variant of `pl_cache_try_set` that simply frees `obj` on failure. +PL_API void pl_cache_set(pl_cache cache, pl_cache_obj *obj); + +// Looks up `obj->key` in the object cache. If successful, `obj->data` is +// set to memory owned by the caller, which must be either explicitly +// re-inserted, or explicitly freed (using obj->free). +// +// Note: On failure, `obj->data/size/free` are reset to NULL. +PL_API bool pl_cache_get(pl_cache cache, pl_cache_obj *obj); + +// Run a callback on every object currently stored in `cache`. +// +// Note: Running any `pl_cache_*` function on `cache` from this callback is +// undefined behavior. 
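As a rough usage sketch for the save/load convenience wrappers above (not authoritative; the file path, size limit and error handling are purely illustrative), a cache could be persisted across runs like this:

#include <stdio.h>
#include <libplacebo/cache.h>

void persist_cache_example(pl_log log)
{
    pl_cache cache = pl_cache_create(pl_cache_params(
        .log            = log,
        .max_total_size = 10 << 20,      /* 10 MiB; an upper bound is recommended */
    ));

    /* ... pass `cache` to other libplacebo objects and render some frames ... */

    FILE *f = fopen("shader_cache.bin", "wb");  /* illustrative path */
    if (f) {
        pl_cache_save_file(cache, f);           /* wraps pl_cache_save_ex */
        fclose(f);
    }

    /* On a later run, restore it with the matching load wrapper: */
    f = fopen("shader_cache.bin", "rb");
    if (f) {
        pl_cache_load_file(cache, f);
        fclose(f);
    }

    pl_cache_destroy(&cache);
}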
+PL_API void pl_cache_iterate(pl_cache cache, + void (*cb)(void *priv, pl_cache_obj obj), + void *priv); + +// Utility wrapper to free a `pl_cache_obj` if necessary (and sanitize it) +static inline void pl_cache_obj_free(pl_cache_obj *obj) +{ + if (obj->free) + obj->free(obj->data); + obj->data = NULL; + obj->free = NULL; + obj->size = 0; +} + +PL_API_END + +#endif // LIBPLACEBO_CACHE_H_ diff --git a/src/include/libplacebo/colorspace.h b/src/include/libplacebo/colorspace.h new file mode 100644 index 0000000..6663019 --- /dev/null +++ b/src/include/libplacebo/colorspace.h @@ -0,0 +1,719 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_COLORSPACE_H_ +#define LIBPLACEBO_COLORSPACE_H_ + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include <libplacebo/common.h> + +PL_API_BEGIN + +// The underlying color representation (e.g. RGB, XYZ or YCbCr) +enum pl_color_system { + PL_COLOR_SYSTEM_UNKNOWN = 0, + // YCbCr-like color systems: + PL_COLOR_SYSTEM_BT_601, // ITU-R Rec. BT.601 (SD) + PL_COLOR_SYSTEM_BT_709, // ITU-R Rec. BT.709 (HD) + PL_COLOR_SYSTEM_SMPTE_240M, // SMPTE-240M + PL_COLOR_SYSTEM_BT_2020_NC, // ITU-R Rec. BT.2020 (non-constant luminance) + PL_COLOR_SYSTEM_BT_2020_C, // ITU-R Rec. BT.2020 (constant luminance) + PL_COLOR_SYSTEM_BT_2100_PQ, // ITU-R Rec. BT.2100 ICtCp PQ variant + PL_COLOR_SYSTEM_BT_2100_HLG, // ITU-R Rec. BT.2100 ICtCp HLG variant + PL_COLOR_SYSTEM_DOLBYVISION, // Dolby Vision (see pl_dovi_metadata) + PL_COLOR_SYSTEM_YCGCO, // YCgCo (derived from RGB) + // Other color systems: + PL_COLOR_SYSTEM_RGB, // Red, Green and Blue + PL_COLOR_SYSTEM_XYZ, // Digital Cinema Distribution Master (XYZ) + PL_COLOR_SYSTEM_COUNT +}; + +PL_API bool pl_color_system_is_ycbcr_like(enum pl_color_system sys); + +// Returns true for color systems that are linear transformations of the RGB +// equivalent, i.e. are simple matrix multiplications. For color systems with +// this property, `pl_color_repr_decode` is sufficient for conversion to RGB. +PL_API bool pl_color_system_is_linear(enum pl_color_system sys); + +// Guesses the best YCbCr-like colorspace based on a image given resolution. +// This only picks conservative values. (In particular, BT.2020 is never +// auto-guessed, even for 4K resolution content) +PL_API enum pl_color_system pl_color_system_guess_ycbcr(int width, int height); + +// Friendly names for the canonical channel names and order. +enum pl_channel { + PL_CHANNEL_NONE = -1, + PL_CHANNEL_A = 3, // alpha + // RGB system + PL_CHANNEL_R = 0, + PL_CHANNEL_G = 1, + PL_CHANNEL_B = 2, + // YCbCr-like systems + PL_CHANNEL_Y = 0, + PL_CHANNEL_CB = 1, + PL_CHANNEL_CR = 2, + // Aliases for Cb/Cr + PL_CHANNEL_U = 1, + PL_CHANNEL_V = 2 + // There are deliberately no names for the XYZ system to avoid + // confusion due to PL_CHANNEL_Y. 
+}; + +// The numerical range of the representation (where applicable). +enum pl_color_levels { + PL_COLOR_LEVELS_UNKNOWN = 0, + PL_COLOR_LEVELS_LIMITED, // Limited/TV range, e.g. 16-235 + PL_COLOR_LEVELS_FULL, // Full/PC range, e.g. 0-255 + PL_COLOR_LEVELS_COUNT, + + // Compatibility aliases + PL_COLOR_LEVELS_TV = PL_COLOR_LEVELS_LIMITED, + PL_COLOR_LEVELS_PC = PL_COLOR_LEVELS_FULL, +}; + +// The alpha representation mode. +enum pl_alpha_mode { + PL_ALPHA_UNKNOWN = 0, // or no alpha channel present + PL_ALPHA_INDEPENDENT, // alpha channel is separate from the video + PL_ALPHA_PREMULTIPLIED, // alpha channel is multiplied into the colors + PL_ALPHA_MODE_COUNT, +}; + +// The underlying bit-wise representation of a color sample. For example, +// a 10-bit TV-range YCbCr value uploaded to a 16 bit texture would have +// sample_depth=16 color_depth=10 bit_shift=0. +// +// For another example, a 12-bit XYZ full range sample shifted to 16-bits with +// the lower 4 bits all set to 0 would have sample_depth=16 color_depth=12 +// bit_shift=4. (libavcodec likes outputting this type of `xyz12`) +// +// To explain the meaning of `sample_depth` further; the consideration factor +// here is the fact that GPU sampling will normalized the sampled color to the +// range 0.0 - 1.0 in a manner dependent on the number of bits in the texture +// format. So if you upload a 10-bit YCbCr value unpadded as 16-bit color +// samples, all of the sampled values will be extremely close to 0.0. In such a +// case, `pl_color_repr_normalize` would return a high scaling factor, which +// would pull the color up to their 16-bit range. +struct pl_bit_encoding { + int sample_depth; // the number of bits the color is stored/sampled as + int color_depth; // the effective number of bits of the color information + int bit_shift; // a representational bit shift applied to the color +}; + +// Returns whether two bit encodings are exactly identical. +PL_API bool pl_bit_encoding_equal(const struct pl_bit_encoding *b1, + const struct pl_bit_encoding *b2); + +// Parsed metadata from the Dolby Vision RPU +struct pl_dovi_metadata { + // Colorspace transformation metadata + float nonlinear_offset[3]; // input offset ("ycc_to_rgb_offset") + pl_matrix3x3 nonlinear; // before PQ, also called "ycc_to_rgb" + pl_matrix3x3 linear; // after PQ, also called "rgb_to_lms" + + // Reshape data, grouped by component + struct pl_reshape_data { + uint8_t num_pivots; + float pivots[9]; // normalized to [0.0, 1.0] based on BL bit depth + uint8_t method[8]; // 0 = polynomial, 1 = MMR + // Note: these must be normalized (divide by coefficient_log2_denom) + float poly_coeffs[8][3]; // x^0, x^1, x^2, unused must be 0 + uint8_t mmr_order[8]; // 1, 2 or 3 + float mmr_constant[8]; + float mmr_coeffs[8][3 /* order */][7]; + } comp[3]; +}; + +// Struct describing the underlying color system and representation. This +// information is needed to convert an encoded color to a normalized RGB triple +// in the range 0-1. +struct pl_color_repr { + enum pl_color_system sys; + enum pl_color_levels levels; + enum pl_alpha_mode alpha; + struct pl_bit_encoding bits; // or {0} if unknown + + // Metadata for PL_COLOR_SYSTEM_DOLBYVISION. Note that, for the sake of + // efficiency, this is treated purely as an opaque reference - functions + // like pl_color_repr_equal will merely do a pointer equality test. + // + // The only functions that actually dereference it in any way are + // pl_color_repr_decode, pl_shader_decode_color and pl_render_image(_mix). 
+ const struct pl_dovi_metadata *dovi; +}; + +// Some common color representations. It's worth pointing out that all of these +// presets leave `alpha` and `bits` as unknown - that is, only the system and +// levels are predefined +PL_API extern const struct pl_color_repr pl_color_repr_unknown; +PL_API extern const struct pl_color_repr pl_color_repr_rgb; +PL_API extern const struct pl_color_repr pl_color_repr_sdtv; +PL_API extern const struct pl_color_repr pl_color_repr_hdtv; // also Blu-ray +PL_API extern const struct pl_color_repr pl_color_repr_uhdtv; // SDR, NCL system +PL_API extern const struct pl_color_repr pl_color_repr_jpeg; + +// Returns whether two colorspace representations are exactly identical. +PL_API bool pl_color_repr_equal(const struct pl_color_repr *c1, + const struct pl_color_repr *c2); + +// Replaces unknown values in the first struct by those of the second struct. +PL_API void pl_color_repr_merge(struct pl_color_repr *orig, + const struct pl_color_repr *update); + +// This function normalizes the color representation such that +// color_depth=sample_depth and bit_shift=0; and returns the scaling factor +// that must be multiplied into the color value to accomplish this, assuming +// it has already been sampled by the GPU. If unknown, the color and sample +// depth will both be inferred as 8 bits for the purposes of this conversion. +PL_API float pl_color_repr_normalize(struct pl_color_repr *repr); + +// Guesses the best color levels based on the specified color levels and +// falling back to using the color system instead. YCbCr-like systems are +// assumed to be TV range, otherwise this defaults to PC range. +PL_API enum pl_color_levels pl_color_levels_guess(const struct pl_color_repr *repr); + +// The colorspace's primaries (gamut) +enum pl_color_primaries { + PL_COLOR_PRIM_UNKNOWN = 0, + // Standard gamut: + PL_COLOR_PRIM_BT_601_525, // ITU-R Rec. BT.601 (525-line = NTSC, SMPTE-C) + PL_COLOR_PRIM_BT_601_625, // ITU-R Rec. BT.601 (625-line = PAL, SECAM) + PL_COLOR_PRIM_BT_709, // ITU-R Rec. BT.709 (HD), also sRGB + PL_COLOR_PRIM_BT_470M, // ITU-R Rec. BT.470 M + PL_COLOR_PRIM_EBU_3213, // EBU Tech. 3213-E / JEDEC P22 phosphors + // Wide gamut: + PL_COLOR_PRIM_BT_2020, // ITU-R Rec. BT.2020 (UltraHD) + PL_COLOR_PRIM_APPLE, // Apple RGB + PL_COLOR_PRIM_ADOBE, // Adobe RGB (1998) + PL_COLOR_PRIM_PRO_PHOTO, // ProPhoto RGB (ROMM) + PL_COLOR_PRIM_CIE_1931, // CIE 1931 RGB primaries + PL_COLOR_PRIM_DCI_P3, // DCI-P3 (Digital Cinema) + PL_COLOR_PRIM_DISPLAY_P3, // DCI-P3 (Digital Cinema) with D65 white point + PL_COLOR_PRIM_V_GAMUT, // Panasonic V-Gamut (VARICAM) + PL_COLOR_PRIM_S_GAMUT, // Sony S-Gamut + PL_COLOR_PRIM_FILM_C, // Traditional film primaries with Illuminant C + PL_COLOR_PRIM_ACES_AP0, // ACES Primaries #0 (ultra wide) + PL_COLOR_PRIM_ACES_AP1, // ACES Primaries #1 + PL_COLOR_PRIM_COUNT +}; + +PL_API bool pl_color_primaries_is_wide_gamut(enum pl_color_primaries prim); + +// Guesses the best primaries based on a resolution. This always guesses +// conservatively, i.e. it will never return a wide gamut color space even if +// the resolution is 4K. +PL_API enum pl_color_primaries pl_color_primaries_guess(int width, int height); + +// The colorspace's transfer function (gamma / EOTF) +enum pl_color_transfer { + PL_COLOR_TRC_UNKNOWN = 0, + // Standard dynamic range: + PL_COLOR_TRC_BT_1886, // ITU-R Rec. 
BT.1886 (CRT emulation + OOTF) + PL_COLOR_TRC_SRGB, // IEC 61966-2-4 sRGB (CRT emulation) + PL_COLOR_TRC_LINEAR, // Linear light content + PL_COLOR_TRC_GAMMA18, // Pure power gamma 1.8 + PL_COLOR_TRC_GAMMA20, // Pure power gamma 2.0 + PL_COLOR_TRC_GAMMA22, // Pure power gamma 2.2 + PL_COLOR_TRC_GAMMA24, // Pure power gamma 2.4 + PL_COLOR_TRC_GAMMA26, // Pure power gamma 2.6 + PL_COLOR_TRC_GAMMA28, // Pure power gamma 2.8 + PL_COLOR_TRC_PRO_PHOTO, // ProPhoto RGB (ROMM) + PL_COLOR_TRC_ST428, // Digital Cinema Distribution Master (XYZ) + // High dynamic range: + PL_COLOR_TRC_PQ, // ITU-R BT.2100 PQ (perceptual quantizer), aka SMPTE ST2048 + PL_COLOR_TRC_HLG, // ITU-R BT.2100 HLG (hybrid log-gamma), aka ARIB STD-B67 + PL_COLOR_TRC_V_LOG, // Panasonic V-Log (VARICAM) + PL_COLOR_TRC_S_LOG1, // Sony S-Log1 + PL_COLOR_TRC_S_LOG2, // Sony S-Log2 + PL_COLOR_TRC_COUNT +}; + +// Returns the nominal peak of a given transfer function, relative to the +// reference white. This refers to the highest encodable signal level. +// Always equal to 1.0 for SDR curves. +// +// Note: For HLG in particular, which is scene-referred, this returns the +// highest nominal peak in scene-referred space (3.77), which may be different +// from the actual peak in display space after application of the HLG OOTF. +PL_API float pl_color_transfer_nominal_peak(enum pl_color_transfer trc); + +static inline bool pl_color_transfer_is_hdr(enum pl_color_transfer trc) +{ + return pl_color_transfer_nominal_peak(trc) > 1.0; +} + +// This defines the display-space standard reference white level (in cd/m^2) +// that is assumed for SDR content, for use when mapping between HDR and SDR in +// display space. See ITU-R Report BT.2408 for more information. +#define PL_COLOR_SDR_WHITE 203.0f + +// This defines the assumed contrast level of an unknown SDR display. This +// will be used to determine the black point in the absence of any tagged +// minimum luminance, relative to the tagged maximum luminance (or +// PL_COLOR_SDR_WHITE in the absence of all tagging) +#define PL_COLOR_SDR_CONTRAST 1000.0f + +// This defines the default black point assumed for "infinite contrast" HDR +// displays. This is not exactly 0.0 because a value of 0.0 is interpreted +// as "unknown / missing metadata" inside struct pl_hdr_metadata, and also +// to avoid numerical issues in a variety of tone mapping functions. +// Essentially, a black level below this number is functionally meaningless +// inside libplacebo, and will be clamped to this value regardless. +// +// The value used here (1e-6) is about one 13-bit PQ step above absolute zero, +// which is a small fraction of the human JND at this brightness level, and also +// about 3 bits above the floating point machine epsilon. +#define PL_COLOR_HDR_BLACK 1e-6f + +// This defines the assumed peak brightness of a HLG display with no HDR10 +// metadata. This is set to the brightness of a "nominal" HLG reference display. +#define PL_COLOR_HLG_PEAK 1000.0f + +// Represents a single CIE xy coordinate (e.g. CIE Yxy with Y = 1.0) +struct pl_cie_xy { + float x, y; +}; + +// Creates a pl_cie_xyz from raw XYZ values +static inline struct pl_cie_xy pl_cie_from_XYZ(float X, float Y, float Z) +{ + float k = 1.0f / (X + Y + Z); + struct pl_cie_xy xy = { k * X, k * Y }; + return xy; +} + +// Recovers (X / Y) from a CIE xy value. +static inline float pl_cie_X(struct pl_cie_xy xy) +{ + return xy.x / xy.y; +} + +// Recovers (Z / Y) from a CIE xy value. 
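A small, hedged illustration of the helpers above; the numeric comments assume the documented behavior (nominal peak relative to PL_COLOR_SDR_WHITE), and the D65 XYZ coordinates are standard values supplied only for the example:

#include <stdio.h>
#include <libplacebo/colorspace.h>

static void describe_transfer(enum pl_color_transfer trc)
{
    float peak = pl_color_transfer_nominal_peak(trc);  /* relative to reference white */
    printf("trc %d: %s, nominal peak %.2fx (~%.0f cd/m^2)\n", (int) trc,
           pl_color_transfer_is_hdr(trc) ? "HDR" : "SDR",
           peak, peak * PL_COLOR_SDR_WHITE);
}

static void d65_chromaticity(void)
{
    /* Recover CIE xy from XYZ; for D65 this gives roughly x=0.3127, y=0.3290 */
    struct pl_cie_xy d65 = pl_cie_from_XYZ(0.95047f, 1.0f, 1.08883f);
    printf("D65: x=%.4f y=%.4f\n", d65.x, d65.y);
}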
+static inline float pl_cie_Z(struct pl_cie_xy xy) +{ + return (1 - xy.x - xy.y) / xy.y; +} + +static inline bool pl_cie_xy_equal(const struct pl_cie_xy *a, + const struct pl_cie_xy *b) +{ + return a->x == b->x && a->y == b->y; +} + +// Computes the CIE xy chromaticity coordinates of a CIE D-series illuminant +// with the given correlated color temperature. +// +// `temperature` must be between 2500 K and 25000 K, inclusive. +PL_API struct pl_cie_xy pl_white_from_temp(float temperature); + +// Represents the raw physical primaries corresponding to a color space. +struct pl_raw_primaries { + struct pl_cie_xy red, green, blue, white; +}; + +// Returns whether two raw primaries are exactly identical. +PL_API bool pl_raw_primaries_equal(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b); + +// Returns whether two raw primaries are approximately equal +PL_API bool pl_raw_primaries_similar(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b); + +// Replaces unknown values in the first struct by those of the second struct. +PL_API void pl_raw_primaries_merge(struct pl_raw_primaries *orig, + const struct pl_raw_primaries *update); + +// Returns the raw primaries for a given color space. +PL_API const struct pl_raw_primaries *pl_raw_primaries_get(enum pl_color_primaries prim); + +enum pl_hdr_scaling { + PL_HDR_NORM = 0, // 0.0 is absolute black, 1.0 is PL_COLOR_SDR_WHITE + PL_HDR_SQRT, // sqrt() of PL_HDR_NORM values + PL_HDR_NITS, // absolute brightness in raw cd/m² + PL_HDR_PQ, // absolute brightness in PQ (0.0 to 1.0) + PL_HDR_SCALING_COUNT, +}; + +// Generic helper for performing HDR scale conversions. +PL_API float pl_hdr_rescale(enum pl_hdr_scaling from, enum pl_hdr_scaling to, float x); + +enum pl_hdr_metadata_type { + PL_HDR_METADATA_ANY = 0, + PL_HDR_METADATA_NONE, + PL_HDR_METADATA_HDR10, // HDR10 static mastering display metadata + PL_HDR_METADATA_HDR10PLUS, // HDR10+ dynamic metadata + PL_HDR_METADATA_CIE_Y, // CIE Y derived dynamic luminance metadata + PL_HDR_METADATA_TYPE_COUNT, +}; + +// Bezier curve for HDR metadata +struct pl_hdr_bezier { + float target_luma; // target luminance (cd/m²) for this OOTF + float knee_x, knee_y; // cross-over knee point (0-1) + float anchors[15]; // intermediate bezier curve control points (0-1) + uint8_t num_anchors; +}; + +// Represents raw HDR metadata as defined by SMPTE 2086 / CTA 861.3, which is +// often attached to HDR sources and can be forwarded to HDR-capable displays, +// or used to guide the libplacebo built-in tone mapping. Values left as 0 +// are treated as unknown by libplacebo. +// +// Note: This means that a value of `min_luma == 0.0` gets treated as "minimum +// luminance not known", which in practice may end up inferring a default +// contrast of 1000:1 for SDR transfer functions. To avoid this, the user should +// set these fields to a low positive value, e.g. PL_COLOR_HDR_BLACK, to signal +// a "zero" black point (i.e. infinite contrast display). +struct pl_hdr_metadata { + // --- PL_HDR_METADATA_HDR10 + // Mastering display metadata. + struct pl_raw_primaries prim; // mastering display primaries + float min_luma, max_luma; // min/max luminance (in cd/m²) + + // Content light level. 
(Note: this is ignored by libplacebo itself) + float max_cll; // max content light level (in cd/m²) + float max_fall; // max frame average light level (in cd/m²) + + // --- PL_HDR_METADATA_HDR10PLUS + float scene_max[3]; // maxSCL in cd/m² per component (RGB) + float scene_avg; // average of maxRGB in cd/m² + struct pl_hdr_bezier ootf; // reference OOTF (optional) + + // --- PL_HDR_METADATA_CIE_Y + float max_pq_y; // maximum PQ luminance (in PQ, 0-1) + float avg_pq_y; // averaged PQ luminance (in PQ, 0-1) +}; + +PL_API extern const struct pl_hdr_metadata pl_hdr_metadata_empty; // equal to {0} +PL_API extern const struct pl_hdr_metadata pl_hdr_metadata_hdr10; // generic HDR10 display + +// Returns whether two sets of HDR metadata are exactly identical. +PL_API bool pl_hdr_metadata_equal(const struct pl_hdr_metadata *a, + const struct pl_hdr_metadata *b); + +// Replaces unknown values in the first struct by those of the second struct. +PL_API void pl_hdr_metadata_merge(struct pl_hdr_metadata *orig, + const struct pl_hdr_metadata *update); + +// Returns `true` if `data` contains a complete set of a given metadata type. +// Note: for PL_HDR_METADATA_HDR10, only `min_luma` and `max_luma` are +// considered - CLL/FALL and primaries are irrelevant for HDR tone-mapping. +PL_API bool pl_hdr_metadata_contains(const struct pl_hdr_metadata *data, + enum pl_hdr_metadata_type type); + +// Rendering intent for colorspace transformations. These constants match the +// ICC specification (Table 23) +enum pl_rendering_intent { + PL_INTENT_AUTO = -1, // not a valid ICC intent, but used to auto-infer + PL_INTENT_PERCEPTUAL = 0, + PL_INTENT_RELATIVE_COLORIMETRIC = 1, + PL_INTENT_SATURATION = 2, + PL_INTENT_ABSOLUTE_COLORIMETRIC = 3 +}; + +// Struct describing a physical color space. This information is needed to +// turn a normalized RGB triple into its physical meaning, as well as to convert +// between color spaces. +struct pl_color_space { + enum pl_color_primaries primaries; + enum pl_color_transfer transfer; + + // HDR metadata for this color space, if present. (Optional) + struct pl_hdr_metadata hdr; +}; + +#define pl_color_space(...) (&(struct pl_color_space) { __VA_ARGS__ }) + +// Returns whether or not a color space is considered as effectively HDR. +// This is true when the effective signal peak is greater than the SDR +// reference white (1.0), taking into account `csp->hdr`. +PL_API bool pl_color_space_is_hdr(const struct pl_color_space *csp); + +// Returns whether or not a color space is "black scaled", in which case 0.0 is +// the true black point. This is true for SDR signals other than BT.1886, as +// well as for HLG. +PL_API bool pl_color_space_is_black_scaled(const struct pl_color_space *csp); + +struct pl_nominal_luma_params { + // The color space to infer luminance from + const struct pl_color_space *color; + + // Which type of metadata to draw values from + enum pl_hdr_metadata_type metadata; + + // This field controls the scaling of `out_*` + enum pl_hdr_scaling scaling; + + // Fields to write the detected nominal luminance to. (Optional) + // + // For SDR displays, this will default to a contrast level of 1000:1 unless + // indicated otherwise in the `min/max_luma` static HDR10 metadata fields. + float *out_min; + float *out_max; + + // Field to write the detected average luminance to, or 0.0 in the absence + // of dynamic metadata. (Optional) + float *out_avg; +}; + +#define pl_nominal_luma_params(...) 
\ + (&(struct pl_nominal_luma_params) { __VA_ARGS__ }) + +// Returns the effective luminance described by a pl_color_space. +PL_API void pl_color_space_nominal_luma_ex(const struct pl_nominal_luma_params *params); + +// Backwards compatibility wrapper for `pl_color_space_nominal_luma_ex` +PL_DEPRECATED PL_API void pl_color_space_nominal_luma(const struct pl_color_space *csp, + float *out_min, float *out_max); + +// Replaces unknown values in the first struct by those of the second struct. +PL_API void pl_color_space_merge(struct pl_color_space *orig, + const struct pl_color_space *update); + +// Returns whether two colorspaces are exactly identical. +PL_API bool pl_color_space_equal(const struct pl_color_space *c1, + const struct pl_color_space *c2); + +// Go through a color-space and explicitly default all unknown fields to +// reasonable values. After this function is called, none of the values will be +// PL_COLOR_*_UNKNOWN or 0.0, except for the dynamic HDR metadata fields. +PL_API void pl_color_space_infer(struct pl_color_space *space); + +// Like `pl_color_space_infer`, but takes default values from the reference +// color space (excluding certain special cases like HDR or wide gamut). +PL_API void pl_color_space_infer_ref(struct pl_color_space *space, + const struct pl_color_space *ref); + +// Infer both the source and destination gamut simultaneously, and also adjust +// values for optimal display. This is mostly the same as +// `pl_color_space_infer(src)` followed by `pl_color_space_infer_ref`, but also +// takes into account the SDR contrast levels and PQ black points. This is +// basically the logic used by `pl_shader_color_map` and `pl_renderer` to +// decide the output color space in a conservative way and compute the final +// end-to-end color transformation that needs to be done. +PL_API void pl_color_space_infer_map(struct pl_color_space *src, + struct pl_color_space *dst); + +// Some common color spaces. Note: These don't necessarily have all fields +// filled, in particular `hdr` is left unset. +PL_API extern const struct pl_color_space pl_color_space_unknown; +PL_API extern const struct pl_color_space pl_color_space_srgb; +PL_API extern const struct pl_color_space pl_color_space_bt709; +PL_API extern const struct pl_color_space pl_color_space_hdr10; +PL_API extern const struct pl_color_space pl_color_space_bt2020_hlg; +PL_API extern const struct pl_color_space pl_color_space_monitor; // typical display + +// This represents metadata about extra operations to perform during colorspace +// conversion, which correspond to artistic adjustments of the color. +struct pl_color_adjustment { + // Brightness boost. 0.0 = neutral, 1.0 = solid white, -1.0 = solid black + float brightness; + // Contrast boost. 1.0 = neutral, 0.0 = solid black + float contrast; + // Saturation gain. 1.0 = neutral, 0.0 = grayscale + float saturation; + // Hue shift, corresponding to a rotation around the [U, V] subvector, in + // radians. 0.0 = neutral + float hue; + // Gamma adjustment. 1.0 = neutral, 0.0 = solid black + float gamma; + // Color temperature shift. 0.0 = 6500 K, -1.0 = 3000 K, 1.0 = 10000 K + float temperature; +}; + +#define PL_COLOR_ADJUSTMENT_NEUTRAL \ + .contrast = 1.0, \ + .saturation = 1.0, \ + .gamma = 1.0, + +#define pl_color_adjustment(...) (&(struct pl_color_adjustment) { PL_COLOR_ADJUSTMENT_NEUTRAL __VA_ARGS__ }) +PL_API extern const struct pl_color_adjustment pl_color_adjustment_neutral; + +// Represents the chroma placement with respect to the luma samples. 
This is +// only relevant for YCbCr-like colorspaces with chroma subsampling. +enum pl_chroma_location { + PL_CHROMA_UNKNOWN = 0, + PL_CHROMA_LEFT, // MPEG2/4, H.264 + PL_CHROMA_CENTER, // MPEG1, JPEG + PL_CHROMA_TOP_LEFT, + PL_CHROMA_TOP_CENTER, + PL_CHROMA_BOTTOM_LEFT, + PL_CHROMA_BOTTOM_CENTER, + PL_CHROMA_COUNT, +}; + +// Fills *x and *y with the offset in luma pixels corresponding to a given +// chroma location. +// +// Note: PL_CHROMA_UNKNOWN defaults to PL_CHROMA_LEFT +PL_API void pl_chroma_location_offset(enum pl_chroma_location loc, float *x, float *y); + +// Returns an RGB->XYZ conversion matrix for a given set of primaries. +// Multiplying this into the RGB color transforms it to CIE XYZ, centered +// around the color space's white point. +PL_API pl_matrix3x3 pl_get_rgb2xyz_matrix(const struct pl_raw_primaries *prim); + +// Similar to pl_get_rgb2xyz_matrix, but gives the inverse transformation. +PL_API pl_matrix3x3 pl_get_xyz2rgb_matrix(const struct pl_raw_primaries *prim); + +// Returns a primary adaptation matrix, which converts from one set of +// primaries to another. This is an RGB->RGB transformation. For rendering +// intents other than PL_INTENT_ABSOLUTE_COLORIMETRIC, the white point is +// adapted using the Bradford matrix. +PL_API pl_matrix3x3 pl_get_color_mapping_matrix(const struct pl_raw_primaries *src, + const struct pl_raw_primaries *dst, + enum pl_rendering_intent intent); + +// Return a chromatic adaptation matrix, which converts from one white point to +// another, using the Bradford matrix. This is an RGB->RGB transformation. +PL_API pl_matrix3x3 pl_get_adaptation_matrix(struct pl_cie_xy src, struct pl_cie_xy dst); + +// Returns true if 'b' is entirely contained in 'a'. Useful for figuring out if +// colorimetric clipping will occur or not. +PL_API bool pl_primaries_superset(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b); + +// Returns true if `prim` forms a nominally valid set of primaries. This does +// not check whether or not these primaries are actually physically realisable, +// merely that they satisfy the requirements for colorspace math (to avoid NaN). +PL_API bool pl_primaries_valid(const struct pl_raw_primaries *prim); + +// Returns true if two primaries are 'compatible', which is the case if +// they preserve the relationship between primaries (red=red, green=green, +// blue=blue). In other words, this is false for synthetic primaries that have +// channels misordered from the convention (e.g. for some test ICC profiles). +PL_API bool pl_primaries_compatible(const struct pl_raw_primaries *a, + const struct pl_raw_primaries *b); + +// Clip points in the first gamut (src) to be fully contained inside the second +// gamut (dst). Only works on compatible primaries (pl_primaries_compatible). +PL_API struct pl_raw_primaries +pl_primaries_clip(const struct pl_raw_primaries *src, + const struct pl_raw_primaries *dst); + +// Primary-dependent RGB->LMS matrix for the IPTPQc4 color system. This is +// derived from the HPE XYZ->LMS matrix with 4% crosstalk added. +PL_API pl_matrix3x3 pl_ipt_rgb2lms(const struct pl_raw_primaries *prim); +PL_API pl_matrix3x3 pl_ipt_lms2rgb(const struct pl_raw_primaries *prim); + +// Primary-independent L'M'S' -> IPT matrix for the IPTPQc4 color system, and +// its inverse. This is identical to the Ebner & Fairchild (1998) IPT matrix. 
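To illustrate the primary-mapping helpers above, here is a minimal sketch (assuming relative colorimetric intent is acceptable for the use case) that re-expresses a BT.709 color in BT.2020 primaries with a single matrix:

#include <libplacebo/colorspace.h>

static void bt709_red_in_bt2020(float out[3])
{
    const struct pl_raw_primaries *bt709  = pl_raw_primaries_get(PL_COLOR_PRIM_BT_709);
    const struct pl_raw_primaries *bt2020 = pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020);

    /* RGB->RGB adaptation matrix, white point adapted via Bradford */
    pl_matrix3x3 m = pl_get_color_mapping_matrix(bt709, bt2020,
                                                 PL_INTENT_RELATIVE_COLORIMETRIC);

    out[0] = 1.0f; out[1] = 0.0f; out[2] = 0.0f;  /* pure BT.709 red */
    pl_matrix3x3_apply(&m, out);                  /* now relative to BT.2020 primaries */
}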
+PL_API extern const pl_matrix3x3 pl_ipt_lms2ipt; +PL_API extern const pl_matrix3x3 pl_ipt_ipt2lms; + +// Cone types involved in human vision +enum pl_cone { + PL_CONE_L = 1 << 0, + PL_CONE_M = 1 << 1, + PL_CONE_S = 1 << 2, + + // Convenience aliases + PL_CONE_NONE = 0, + PL_CONE_LM = PL_CONE_L | PL_CONE_M, + PL_CONE_MS = PL_CONE_M | PL_CONE_S, + PL_CONE_LS = PL_CONE_L | PL_CONE_S, + PL_CONE_LMS = PL_CONE_L | PL_CONE_M | PL_CONE_S, +}; + +// Structure describing parameters for simulating color blindness +struct pl_cone_params { + enum pl_cone cones; // Which cones are *affected* by the vision model + float strength; // Coefficient for how strong the defect is + // (1.0 = Unaffected, 0.0 = Full blindness) +}; + +#define pl_cone_params(...) (&(struct pl_cone_params) { __VA_ARGS__ }) + +// Built-in color blindness models +PL_API extern const struct pl_cone_params pl_vision_normal; // No distortion (92%) +PL_API extern const struct pl_cone_params pl_vision_protanomaly; // Red deficiency (0.66%) +PL_API extern const struct pl_cone_params pl_vision_protanopia; // Red absence (0.59%) +PL_API extern const struct pl_cone_params pl_vision_deuteranomaly; // Green deficiency (2.7%) +PL_API extern const struct pl_cone_params pl_vision_deuteranopia; // Green absence (0.56%) +PL_API extern const struct pl_cone_params pl_vision_tritanomaly; // Blue deficiency (0.01%) +PL_API extern const struct pl_cone_params pl_vision_tritanopia; // Blue absence (0.016%) +PL_API extern const struct pl_cone_params pl_vision_monochromacy; // Blue cones only (<0.001%) +PL_API extern const struct pl_cone_params pl_vision_achromatopsia; // Rods only (<0.0001%) + +// Returns a cone adaptation matrix. Applying this to an RGB color in the given +// color space will apply the given cone adaptation coefficients for simulating +// a type of color blindness. +// +// For the color blindness models which don't entail complete loss of a cone, +// you can partially counteract the effect by using a similar model with the +// `strength` set to its inverse. For example, to partially counteract +// deuteranomaly, you could generate a cone matrix for PL_CONE_M with the +// strength 2.0 (or some other number above 1.0). +PL_API pl_matrix3x3 pl_get_cone_matrix(const struct pl_cone_params *params, + const struct pl_raw_primaries *prim); + +// Returns a color decoding matrix for a given combination of source color +// representation and adjustment parameters. This mutates `repr` to reflect the +// change. If `params` is NULL, it defaults to &pl_color_adjustment_neutral. +// +// This function always performs a conversion to RGB. To convert to other +// colorspaces (e.g. between YUV systems), obtain a second YUV->RGB matrix +// and invert it using `pl_transform3x3_invert`. +// +// Note: For BT.2020 constant-luminance, this outputs chroma information in the +// range [-0.5, 0.5]. Since the CL system conversion is non-linear, further +// processing must be done by the caller. The channel order is CrYCb. +// +// Note: For BT.2100 ICtCp, this outputs in the color space L'M'S'. Further +// non-linear processing must be done by the caller. +// +// Note: XYZ system is expected to be in DCDM X'Y'Z' encoding (ST 428-1), in +// practice this means normalizing by (48.0 / 52.37) factor and applying 2.6 gamma +PL_API pl_transform3x3 pl_color_repr_decode(struct pl_color_repr *repr, + const struct pl_color_adjustment *params); + +// Common struct to describe an ICC profile +struct pl_icc_profile { + // Points to the in-memory representation of the ICC profile. 
This is + // allowed to be NULL, in which case the `pl_icc_profile` represents "no + // profile”. + const void *data; + size_t len; + + // If a profile is set, this signature must uniquely identify it (including + // across restarts, for caching), ideally using a checksum of the profile + // contents. The user is free to choose the method of determining this + // signature, but note the existence of the + // `pl_icc_profile_compute_signature` helper. + uint64_t signature; +}; + +#define pl_icc_profile(...) &(struct pl_icc_profile) { __VA_ARGS__ } + +// This doesn't do a comparison of the actual contents, only of the signature. +PL_API bool pl_icc_profile_equal(const struct pl_icc_profile *p1, + const struct pl_icc_profile *p2); + +// Sets `signature` to a hash of `profile->data`, if non-NULL. Provided as a +// convenience function for the sake of users ingesting arbitrary ICC profiles +// from sources where they can't reliably detect profile changes. +// +// Note: This is based on a very fast hash, and will compute a signature for +// even large (10 MB) ICC profiles in, typically, a fraction of a millisecond. +PL_API void pl_icc_profile_compute_signature(struct pl_icc_profile *profile); + +PL_API_END + +#endif // LIBPLACEBO_COLORSPACE_H_ diff --git a/src/include/libplacebo/common.h b/src/include/libplacebo/common.h new file mode 100644 index 0000000..806730c --- /dev/null +++ b/src/include/libplacebo/common.h @@ -0,0 +1,244 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_COMMON_H_ +#define LIBPLACEBO_COMMON_H_ + +#include <stdbool.h> + +#include <libplacebo/config.h> + +PL_API_BEGIN + +// Some common utility types. These are overloaded to support 2D, 3D and +// integer/float variants. +typedef struct pl_rect2d { + int x0, y0; + int x1, y1; +} pl_rect2d; + +typedef struct pl_rect3d { + int x0, y0, z0; + int x1, y1, z1; +} pl_rect3d; + +typedef struct pl_rect2df { + float x0, y0; + float x1, y1; +} pl_rect2df; + +typedef struct pl_rect3df { + float x0, y0, z0; + float x1, y1, z1; +} pl_rect3df; + +// These macros will work for any of the above pl_rect variants (with enough +// dimensions). Careful: double-evaluation hazard +#define pl_rect_w(r) ((r).x1 - (r).x0) +#define pl_rect_h(r) ((r).y1 - (r).y0) +#define pl_rect_d(r) ((r).z1 - (r).z0) + +#define pl_rect2d_eq(a, b) \ + ((a).x0 == (b).x0 && (a).x1 == (b).x1 && \ + (a).y0 == (b).y0 && (a).y1 == (b).y1) + +#define pl_rect3d_eq(a, b) \ + ((a).x0 == (b).x0 && (a).x1 == (b).x1 && \ + (a).y0 == (b).y0 && (a).y1 == (b).y1 && \ + (a).z0 == (b).z0 && (a).z1 == (b).z1) + +// "Normalize" a rectangle: This ensures d1 >= d0 for all dimensions. 
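A brief sketch of the rect helpers above together with the normalize functions declared just below; the coordinates are arbitrary and chosen only to show a flipped rect being normalized:

#include <libplacebo/common.h>

static void rect_demo(void)
{
    /* Vertically flipped crop (y1 < y0), e.g. from a bottom-left-origin API */
    pl_rect2df rc = { .x0 = 0.0f, .y0 = 1080.0f, .x1 = 1920.0f, .y1 = 0.0f };

    pl_rect2df_normalize(&rc);          /* now x1 >= x0 and y1 >= y0 */
    float w = pl_rect_w(rc);            /* 1920 */
    float h = pl_rect_h(rc);            /* 1080 */
    (void) w; (void) h;
}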
+PL_API void pl_rect2d_normalize(pl_rect2d *rc); +PL_API void pl_rect3d_normalize(pl_rect3d *rc); + +PL_API void pl_rect2df_normalize(pl_rect2df *rc); +PL_API void pl_rect3df_normalize(pl_rect3df *rc); + +// Return the rounded form of a rect. +PL_API pl_rect2d pl_rect2df_round(const pl_rect2df *rc); +PL_API pl_rect3d pl_rect3df_round(const pl_rect3df *rc); + +// Represents a row-major matrix, i.e. the following matrix +// [ a11 a12 a13 ] +// [ a21 a22 a23 ] +// [ a31 a32 a33 ] +// is represented in C like this: +// { { a11, a12, a13 }, +// { a21, a22, a23 }, +// { a31, a32, a33 } }; +typedef struct pl_matrix3x3 { + float m[3][3]; +} pl_matrix3x3; + +PL_API extern const pl_matrix3x3 pl_matrix3x3_identity; + +// Applies a matrix to a float vector in-place. +PL_API void pl_matrix3x3_apply(const pl_matrix3x3 *mat, float vec[3]); + +// Applies a matrix to a pl_rect3df +PL_API void pl_matrix3x3_apply_rc(const pl_matrix3x3 *mat, pl_rect3df *rc); + +// Scales a color matrix by a linear factor. +PL_API void pl_matrix3x3_scale(pl_matrix3x3 *mat, float scale); + +// Inverts a matrix. Only use where precision is not that important. +PL_API void pl_matrix3x3_invert(pl_matrix3x3 *mat); + +// Composes/multiplies two matrices. Multiples B into A, i.e. +// A := A * B +PL_API void pl_matrix3x3_mul(pl_matrix3x3 *a, const pl_matrix3x3 *b); + +// Flipped version of `pl_matrix3x3_mul`. +// B := A * B +PL_API void pl_matrix3x3_rmul(const pl_matrix3x3 *a, pl_matrix3x3 *b); + +// Represents an affine transformation, which is basically a 3x3 matrix +// together with a column vector to add onto the output. +typedef struct pl_transform3x3 { + pl_matrix3x3 mat; + float c[3]; +} pl_transform3x3; + +PL_API extern const pl_transform3x3 pl_transform3x3_identity; + +// Applies a transform to a float vector in-place. +PL_API void pl_transform3x3_apply(const pl_transform3x3 *t, float vec[3]); + +// Applies a transform to a pl_rect3df +PL_API void pl_transform3x3_apply_rc(const pl_transform3x3 *t, pl_rect3df *rc); + +// Scales the output of a transform by a linear factor. Since an affine +// transformation is non-linear, this does not commute. If you want to scale +// the *input* of a transform, use pl_matrix3x3_scale on `t.mat`. +PL_API void pl_transform3x3_scale(pl_transform3x3 *t, float scale); + +// Inverts a transform. Only use where precision is not that important. +PL_API void pl_transform3x3_invert(pl_transform3x3 *t); + +// 2D analog of the above structs. Since these are featured less prominently, +// we omit some of the other helper functions. 
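As a hedged example of the 3x3 helpers above, the following sketch builds a simple affine transform (halve the signal, then add a constant offset) and applies it to a color vector; the numbers are arbitrary:

#include <libplacebo/common.h>

static void scale_and_offset(float color[3])
{
    pl_transform3x3 tr = {
        .mat = pl_matrix3x3_identity,
        .c   = { 0.25f, 0.25f, 0.25f },   /* constant offset added after the matrix */
    };

    pl_matrix3x3_scale(&tr.mat, 0.5f);    /* scale only the matrix part */
    pl_transform3x3_apply(&tr, color);    /* color := 0.5 * color + 0.25 */
}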
+typedef struct pl_matrix2x2 { + float m[2][2]; +} pl_matrix2x2; + +PL_API extern const pl_matrix2x2 pl_matrix2x2_identity; +PL_API pl_matrix2x2 pl_matrix2x2_rotation(float angle); + +PL_API void pl_matrix2x2_apply(const pl_matrix2x2 *mat, float vec[2]); +PL_API void pl_matrix2x2_apply_rc(const pl_matrix2x2 *mat, pl_rect2df *rc); + +PL_API void pl_matrix2x2_mul(pl_matrix2x2 *a, const pl_matrix2x2 *b); +PL_API void pl_matrix2x2_rmul(const pl_matrix2x2 *a, pl_matrix2x2 *b); + +PL_API void pl_matrix2x2_scale(pl_matrix2x2 *mat, float scale); +PL_API void pl_matrix2x2_invert(pl_matrix2x2 *mat); + +typedef struct pl_transform2x2 { + pl_matrix2x2 mat; + float c[2]; +} pl_transform2x2; + +PL_API extern const pl_transform2x2 pl_transform2x2_identity; + +PL_API void pl_transform2x2_apply(const pl_transform2x2 *t, float vec[2]); +PL_API void pl_transform2x2_apply_rc(const pl_transform2x2 *t, pl_rect2df *rc); + +PL_API void pl_transform2x2_mul(pl_transform2x2 *a, const pl_transform2x2 *b); +PL_API void pl_transform2x2_rmul(const pl_transform2x2 *a, pl_transform2x2 *b); + +PL_API void pl_transform2x2_scale(pl_transform2x2 *t, float scale); +PL_API void pl_transform2x2_invert(pl_transform2x2 *t); + +// Compute new bounding box of a transformation (as applied to a given rect). +PL_API pl_rect2df pl_transform2x2_bounds(const pl_transform2x2 *t, + const pl_rect2df *rc); + +// Helper functions for dealing with aspect ratios and stretched/scaled rects. + +// Return the (absolute) aspect ratio (width/height) of a given pl_rect2df. +// This will always be a positive number, even if `rc` is flipped. +PL_API float pl_rect2df_aspect(const pl_rect2df *rc); + +// Set the aspect of a `rc` to a given aspect ratio with an extra 'panscan' +// factor choosing the balance between shrinking and growing the `rc` to meet +// this aspect ratio. +// +// Notes: +// - If `panscan` is 0.0, this function will only ever shrink the `rc`. +// - If `panscan` is 1.0, this function will only ever grow the `rc`. +// - If `panscan` is 0.5, this function is area-preserving. +PL_API void pl_rect2df_aspect_set(pl_rect2df *rc, float aspect, float panscan); + +// Set one rect's aspect to that of another +#define pl_rect2df_aspect_copy(rc, src, panscan) \ + pl_rect2df_aspect_set((rc), pl_rect2df_aspect(src), (panscan)) + +// 'Fit' one rect inside another. `rc` will be set to the same size and aspect +// ratio as `src`, but with the size limited to fit inside the original `rc`. +// Like `pl_rect2df_aspect_set`, `panscan` controls the pan&scan factor. +PL_API void pl_rect2df_aspect_fit(pl_rect2df *rc, const pl_rect2df *src, float panscan); + +// Scale rect in each direction while keeping it centered. +PL_API void pl_rect2df_stretch(pl_rect2df *rc, float stretch_x, float stretch_y); + +// Offset rect by an arbitrary offset factor. If the corresponding dimension +// of a rect is flipped, so too is the applied offset. +PL_API void pl_rect2df_offset(pl_rect2df *rc, float offset_x, float offset_y); + +// Scale a rect uniformly in both dimensions. +#define pl_rect2df_zoom(rc, zoom) pl_rect2df_stretch((rc), (zoom), (zoom)) + +// Rotation in degrees clockwise +typedef int pl_rotation; +enum { + PL_ROTATION_0 = 0, + PL_ROTATION_90 = 1, + PL_ROTATION_180 = 2, + PL_ROTATION_270 = 3, + PL_ROTATION_360 = 4, // equivalent to PL_ROTATION_0 + + // Note: Values outside the range [0,4) are legal, including negatives. +}; + +// Constrains to the interval [PL_ROTATION_0, PL_ROTATION_360). 
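A small sketch of the aspect-ratio helpers above (see pl_rect2df_aspect_fit); the window and video sizes are invented for illustration, and panscan 0.0 means the destination rect only ever shrinks:

#include <libplacebo/common.h>

static pl_rect2df letterbox_into_4_3(void)
{
    pl_rect2df dst = { .x0 = 0, .y0 = 0, .x1 = 1024, .y1 = 768  };  /* 4:3 window */
    pl_rect2df src = { .x0 = 0, .y0 = 0, .x1 = 1920, .y1 = 1080 };  /* 16:9 video */

    pl_rect2df_aspect_fit(&dst, &src, 0.0f);
    return dst;  /* a centered 1024x576 region inside the original rect */
}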
+static inline pl_rotation pl_rotation_normalize(pl_rotation rot) +{ + return (rot % PL_ROTATION_360 + PL_ROTATION_360) % PL_ROTATION_360; +} + +// Rotates the coordinate system of a `pl_rect2d(f)` in a certain direction. +// For example, calling this with PL_ROTATION_90 will correspond to rotating +// the coordinate system 90° to the right (so the x axis becomes the y axis). +// +// The resulting rect is re-normalized in the same coordinate system. +PL_API void pl_rect2df_rotate(pl_rect2df *rc, pl_rotation rot); + +// Returns the aspect ratio in a rotated frame of reference. +static inline float pl_aspect_rotate(float aspect, pl_rotation rot) +{ + return (rot % PL_ROTATION_180) ? 1.0 / aspect : aspect; +} + +#define pl_rect2df_aspect_set_rot(rc, aspect, rot, panscan) \ + pl_rect2df_aspect_set((rc), pl_aspect_rotate((aspect), (rot)), (panscan)) + +#define pl_rect2df_aspect_copy_rot(rc, src, panscan, rot) \ + pl_rect2df_aspect_set_rot((rc), pl_rect2df_aspect(src), (rot), (panscan)) + +PL_API_END + +#endif // LIBPLACEBO_COMMON_H_ diff --git a/src/include/libplacebo/config.h.in b/src/include/libplacebo/config.h.in new file mode 100644 index 0000000..2ed6290 --- /dev/null +++ b/src/include/libplacebo/config.h.in @@ -0,0 +1,102 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_CONFIG_H_ +#define LIBPLACEBO_CONFIG_H_ + +// Increased any time the library changes in a fundamental/major way. +#define PL_MAJOR_VER @majorver@ + +// Increased any time the API changes. (Note: Does not reset when PL_MAJOR_VER +// is increased) +#define PL_API_VER @apiver@ + +// Increased any time a fix is made to a given API version. +#define PL_FIX_VER (pl_fix_ver()) + +// Friendly name (`git describe`) for the overall version of the library +#define PL_VERSION (pl_version()) + +// Feature tests. These aren't described in further detail, but may be useful +// for programmers wanting to programmatically check for feature support +// in their compiled libplacebo versions. 
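A minimal sketch of how the version macros and functions above might be queried at run time; nothing here is required, it only shows the compile-time versus run-time distinction:

#include <stdio.h>
#include <libplacebo/config.h>

static void print_libplacebo_version(void)
{
    printf("compiled against API %d (major %d), running %s (fix ver %d)\n",
           PL_API_VER, PL_MAJOR_VER, pl_version(), pl_fix_ver());
}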
+@extra_defs@ + +// Extra compiler-specific stuff +#ifndef PL_DEPRECATED +# if defined(_MSC_VER) +# define PL_DEPRECATED +# else +# define PL_DEPRECATED __attribute__((deprecated)) +# endif +#endif + +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + +#ifndef PL_DEPRECATED_ENUMERATOR +# if (defined(__GNUC__) && (__GNUC__ >= 6)) || __has_feature(enumerator_attributes) +# define PL_DEPRECATED_ENUMERATOR PL_DEPRECATED +# else +# define PL_DEPRECATED_ENUMERATOR +# endif +#endif + +#if defined(_WIN32) || defined(__CYGWIN__) +# ifdef PL_EXPORT +# define PL_API __declspec(dllexport) +# else +# ifndef PL_STATIC +# define PL_API __declspec(dllimport) +# else +# define PL_API +# endif +# endif +#else +# define PL_API __attribute__ ((visibility ("default"))) +#endif + +// C++ compatibility +#ifdef __cplusplus +# define PL_API_BEGIN extern "C" { +# define PL_API_END } +#else +# define PL_API_BEGIN +# define PL_API_END +#endif + +#ifndef __cplusplus +// Disable this warning because libplacebo's params macros override fields +# pragma GCC diagnostic ignored "-Woverride-init" +#endif + +// Extra helper macros +#define PL_TOSTRING_INNER(x) #x +#define PL_TOSTRING(x) PL_TOSTRING_INNER(x) + +// Deprecated macro for back-compatibility +#define PL_STRUCT(name) struct name##_t + +PL_API_BEGIN + +PL_API int pl_fix_ver(void); +PL_API const char *pl_version(void); + +PL_API_END + +#endif // LIBPLACEBO_CONFIG_H_ diff --git a/src/include/libplacebo/d3d11.h b/src/include/libplacebo/d3d11.h new file mode 100644 index 0000000..8ecba30 --- /dev/null +++ b/src/include/libplacebo/d3d11.h @@ -0,0 +1,248 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_D3D11_H_ +#define LIBPLACEBO_D3D11_H_ + +#include <windows.h> +#include <d3d11.h> +#include <dxgi1_2.h> +#include <libplacebo/gpu.h> +#include <libplacebo/swapchain.h> + +PL_API_BEGIN + +// Structure representing the actual D3D11 device and associated GPU instance +typedef const struct pl_d3d11_t { + pl_gpu gpu; + + // The D3D11 device in use. The user is free to use this for their own + // purposes, including taking a reference to the device (with AddRef) and + // using it beyond the lifetime of the pl_d3d11 that created it (though if + // this is done with debug enabled, it will confuse the leak checker.) + ID3D11Device *device; + + // True if the device is using a software (WARP) adapter + bool software; +} *pl_d3d11; + +struct pl_d3d11_params { + // The Direct3D 11 device to use. Optional, if NULL then libplacebo will + // create its own ID3D11Device using the options below. If set, all the + // options below will be ignored. + ID3D11Device *device; + + // --- Adapter selection options + + // The adapter to use. This overrides adapter_luid. + IDXGIAdapter *adapter; + + // The LUID of the adapter to use. 
If adapter and adapter_luid are unset, + // the default adapter will be used instead. + LUID adapter_luid; + + // Allow a software (WARP) adapter when selecting the adapter automatically. + // Note that sometimes the default adapter will be a software adapter. This + // is because, on Windows 8 and up, if there are no hardware adapters, + // Windows will pretend the WARP adapter is the default hardware adapter. + bool allow_software; + + // Always use a software adapter. This is mainly for testing purposes. + bool force_software; + + // --- Device creation options + + // Enable the debug layer (D3D11_CREATE_DEVICE_DEBUG) + // Also logs IDXGIInfoQueue messages + bool debug; + + // Extra flags to pass to D3D11CreateDevice (D3D11_CREATE_DEVICE_FLAG). + // libplacebo should be compatible with any flags passed here. + UINT flags; + + // The minimum and maximum allowable feature levels for the created device. + // libplacebo will attempt to create a device with the highest feature level + // between min_feature_level and max_feature_level (inclusive.) If there are + // no supported feature levels in this range, `pl_d3d11_create` will either + // return NULL or fall back to the software adapter, depending on whether + // `allow_software` is set. + // + // Normally there is no reason to set `max_feature_level` other than to test + // if a program works at lower feature levels. + // + // Note that D3D_FEATURE_LEVEL_9_3 and below (known as 10level9) are highly + // restrictive. These feature levels are supported on a best-effort basis. + // They represent very old DirectX 9 compatible PC and laptop hardware + // (2001-2007, GeForce FX, 6, 7, ATI R300-R500, GMA 950-X3000) and some + // less-old mobile devices (Surface RT, Surface 2.) Basic video rendering + // should work, but the full pl_gpu API will not be available and advanced + // shaders will probably fail. The hardware is probably too slow for these + // anyway. + // + // Known restrictions of 10level9 devices include: + // D3D_FEATURE_LEVEL_9_3 and below: + // - `pl_pass_run_params->index_buf` will not work (but `index_data` will) + // - Dimensions of 3D textures must be powers of two + // - Shaders cannot use gl_FragCoord + // - Shaders cannot use texelFetch + // D3D_FEATURE_LEVEL_9_2 and below: + // - Fragment shaders have no dynamic flow control and very strict limits + // on the number of constants, temporary registers and instructions. + // Whether a shader meets the requirements will depend on how it's + // compiled and optimized, but it's likely that only simple shaders will + // work. + // D3D_FEATURE_LEVEL_9_1: + // - No high-bit-depth formats with PL_FMT_CAP_RENDERABLE or + // PL_FMT_CAP_LINEAR + // + // If these restrictions are undesirable and you don't need to support + // ancient hardware, set `min_feature_level` to D3D_FEATURE_LEVEL_10_0. + int min_feature_level; // Defaults to D3D_FEATURE_LEVEL_9_1 if unset + int max_feature_level; // Defaults to D3D_FEATURE_LEVEL_12_1 if unset + + // Allow up to N in-flight frames. Similar to swapchain_depth for Vulkan and + // OpenGL, though with DXGI this is a device-wide setting that affects all + // swapchains (except for waitable swapchains.) See the documentation for + // `pl_swapchain_latency` for more information. + int max_frame_latency; +}; + +// Default/recommended parameters. Should generally be safe and efficient. +#define PL_D3D11_DEFAULTS \ + .allow_software = true, + +#define pl_d3d11_params(...) 
(&(struct pl_d3d11_params) { PL_D3D11_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_d3d11_params pl_d3d11_default_params; + +// Creates a new Direct3D 11 device based on the given parameters, or wraps an +// existing device, and initializes a new GPU instance. If params is left as +// NULL, it defaults to &pl_d3d11_default_params. If an existing device is +// provided in params->device, `pl_d3d11_create` will take a reference to it +// that will be released in `pl_d3d11_destroy`. +PL_API pl_d3d11 pl_d3d11_create(pl_log log, const struct pl_d3d11_params *params); + +// Release the D3D11 device. +// +// Note that all libplacebo objects allocated from this pl_d3d11 object (e.g. +// via `d3d11->gpu` or using `pl_d3d11_create_swapchain`) *must* be explicitly +// destroyed by the user before calling this. +PL_API void pl_d3d11_destroy(pl_d3d11 *d3d11); + +// For a `pl_gpu` backed by `pl_d3d11`, this function can be used to retrieve +// the underlying `pl_d3d11`. Returns NULL for any other type of `gpu`. +PL_API pl_d3d11 pl_d3d11_get(pl_gpu gpu); + +struct pl_d3d11_swapchain_params { + // The Direct3D 11 swapchain to wrap. Optional. If NULL, libplacebo will + // create its own swapchain using the options below. If set, all the + // swapchain creation options will be ignored. + // + // The provided swapchain must have been created by the same device used + // by `gpu` and must not have multisampled backbuffers. + IDXGISwapChain *swapchain; + + // --- Swapchain creation options + + // Initial framebuffer width and height. If both width and height are set to + // 0 and window is non-NULL, the client area of the window is used instead. + // For convenience, if either component would be 0, it is set to 1 instead. + // This is because Windows can have 0-sized windows, but not 0-sized + // swapchains. + int width; + int height; + + // The handle of the output window. In Windows 8 and up this is optional + // because you can output to a CoreWindow or create a composition swapchain + // instead. + HWND window; + + // A pointer to the CoreWindow to output to. If both this and `window` are + // NULL, CreateSwapChainForComposition will be used to create the swapchain. + IUnknown *core_window; + + // If set, libplacebo will create a swapchain that uses the legacy bitblt + // presentation model (with the DXGI_SWAP_EFFECT_DISCARD swap effect.) This + // tends to give worse performance and frame pacing in windowed mode and it + // prevents borderless fullscreen optimizations, but it might be necessary + // to work around buggy drivers, especially with DXGI 1.2 in the Platform + // Update for Windows 7. When unset, libplacebo will try to use the flip + // presentation model and only fall back to bitblt if flip is unavailable. + bool blit; + + // additional swapchain flags + // No validation on these flags is being performed, and swapchain creation + // may fail if an unsupported combination is requested. + UINT flags; + + // --- Swapchain usage behavior options + + // Disable using a 10-bit swapchain format for SDR output + bool disable_10bit_sdr; +}; + +#define pl_d3d11_swapchain_params(...) (&(struct pl_d3d11_swapchain_params) { __VA_ARGS__ }) + +// Creates a new Direct3D 11 swapchain, or wraps an existing one. If an existing +// swapchain is provided in params->swapchain, `pl_d3d11_create_swapchain` will +// take a reference to it that will be released in `pl_swapchain_destroy`. 
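A hedged initialization sketch for the D3D11 API described above; `hwnd` and `log` are assumed to exist already, error handling is minimal, and the parameter choices are only examples:

#include <libplacebo/d3d11.h>

static pl_swapchain init_d3d11_output(pl_log log, HWND hwnd, pl_d3d11 *out)
{
    pl_d3d11 d3d11 = pl_d3d11_create(log, pl_d3d11_params(
        .allow_software = true,   /* same as PL_D3D11_DEFAULTS */
    ));
    if (!d3d11)
        return NULL;

    pl_swapchain sw = pl_d3d11_create_swapchain(d3d11, pl_d3d11_swapchain_params(
        .window = hwnd,           /* size taken from the window's client area */
    ));
    if (!sw) {
        pl_d3d11_destroy(&d3d11);
        return NULL;
    }

    *out = d3d11;
    return sw;
}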
+PL_API pl_swapchain pl_d3d11_create_swapchain(pl_d3d11 d3d11, + const struct pl_d3d11_swapchain_params *params); + +// Takes a `pl_swapchain` created by pl_d3d11_create_swapchain and returns a +// reference to the underlying IDXGISwapChain. This increments the refcount, so +// call IDXGISwapChain::Release when finished with it. +PL_API IDXGISwapChain *pl_d3d11_swapchain_unwrap(pl_swapchain sw); + +struct pl_d3d11_wrap_params { + // The D3D11 texture to wrap, or a texture array containing the texture to + // wrap. Must be a ID3D11Texture1D, ID3D11Texture2D or ID3D11Texture3D + // created by the same device used by `gpu`, must have D3D11_USAGE_DEFAULT, + // and must not be mipmapped or multisampled. + ID3D11Resource *tex; + + // If tex is a texture array, this is the array member to use as the pl_tex. + int array_slice; + + // If tex is a video resource (eg. DXGI_FORMAT_AYUV, DXGI_FORMAT_NV12, + // DXGI_FORMAT_P010, etc.,) it can be wrapped as a pl_tex by specifying the + // type and size of the shader view. For planar video formats, the plane + // that is wrapped depends on the chosen format. + // + // If tex is not a video resource, these fields are unnecessary. The correct + // format will be determined automatically. If tex is not 2D, these fields + // are ignored. + // + // For a list of supported video formats and their corresponding view + // formats and sizes, see: + // https://microsoft.github.io/DirectX-Specs/d3d/archive/D3D11_3_FunctionalSpec.htm#VideoViews + DXGI_FORMAT fmt; + int w; + int h; +}; + +#define pl_d3d11_wrap_params(...) (&(struct pl_d3d11_wrap_params) { __VA_ARGS__ }) + +// Wraps an external texture into a pl_tex abstraction. `pl_d3d11_wrap` takes a +// reference to the texture, which is released when `pl_tex_destroy` is called. +// +// This function may fail due to incompatible formats, incompatible flags or +// other reasons, in which case it will return NULL. +PL_API pl_tex pl_d3d11_wrap(pl_gpu gpu, const struct pl_d3d11_wrap_params *params); + +PL_API_END + +#endif // LIBPLACEBO_D3D11_H_ diff --git a/src/include/libplacebo/dispatch.h b/src/include/libplacebo/dispatch.h new file mode 100644 index 0000000..7d43794 --- /dev/null +++ b/src/include/libplacebo/dispatch.h @@ -0,0 +1,239 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_DISPATCH_H_ +#define LIBPLACEBO_DISPATCH_H_ + +#include <libplacebo/shaders.h> +#include <libplacebo/gpu.h> + +PL_API_BEGIN + +// Thread-safety: Safe +typedef struct pl_dispatch_t *pl_dispatch; + +// Creates a new shader dispatch object. This object provides a translation +// layer between generated shaders (pl_shader) and the ra context such that it +// can be used to execute shaders. This dispatch object will also provide +// shader caching (for efficient re-use). 
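As a rough per-frame sketch of the dispatch API introduced above, assuming `dp` was created once with pl_dispatch_create, `target` is a renderable texture obtained elsewhere, and some color-producing shader work is recorded into `sh`; `pl_dispatch_finish` is declared further down in this header:

#include <libplacebo/dispatch.h>

static bool render_one_frame(pl_dispatch dp, pl_tex target)
{
    pl_dispatch_reset_frame(dp);           /* once at the start of every frame */

    pl_shader sh = pl_dispatch_begin(dp);
    /* ... record color-producing shader operations into `sh` here ... */

    return pl_dispatch_finish(dp, pl_dispatch_params(
        .shader = &sh,
        .target = target,
    ));
}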
+PL_API pl_dispatch pl_dispatch_create(pl_log log, pl_gpu gpu); +PL_API void pl_dispatch_destroy(pl_dispatch *dp); + +// Reset/increments the internal counters of the pl_dispatch. This must be +// called whenever the user is going to begin with a new frame, in order to +// perform garbage collection and advance the state of the internal PRNG. +// +// Note that shaders generated by `pl_dispatch` are therefore entirely +// deterministic, as long as the sequence of calls (and inputs to the shader) +// are the same. +PL_API void pl_dispatch_reset_frame(pl_dispatch dp); + +// Returns a blank pl_shader object, suitable for recording rendering commands. +// For more information, see the header documentation in `shaders/*.h`. +PL_API pl_shader pl_dispatch_begin(pl_dispatch dp); + +// Struct passed to `info_callback`. Only valid until that function returns. +struct pl_dispatch_info { + // Information about the shader for this shader execution, as well as a + // 64-bit signature uniquely identifying it. + pl_shader_info shader; + uint64_t signature; + + // A list of execution times for this pass, in nanoseconds. May be empty. + uint64_t samples[256]; + int num_samples; + + // As a convenience, this contains the last, average and peak of the above + // list of samples. If `num_samples` is 0, these values are also 0. + uint64_t last; + uint64_t peak; + uint64_t average; +}; + +// Helper function to make a copy of `pl_dispatch_info`, while overriding +// (and dereferencing) whatever was previously stored there. +static inline void pl_dispatch_info_move(struct pl_dispatch_info *dst, + const struct pl_dispatch_info *src) +{ + pl_shader_info_deref(&dst->shader); + *dst = *src; + dst->shader = pl_shader_info_ref(src->shader); +} + +// Set up a dispatch callback for this `pl_dispatch` object. The given callback +// will be run for every successfully dispatched shader. Call this again with +// `cb == NULL` to disable. +PL_API void pl_dispatch_callback(pl_dispatch dp, void *priv, + void (*cb)(void *priv, + const struct pl_dispatch_info *)); + +struct pl_dispatch_params { + // The shader to execute. The pl_dispatch will take over ownership + // of this shader, and return it back to the internal pool. + // + // This shader must have a compatible signature, i.e. inputs + // `PL_SHADER_SIG_NONE` and outputs `PL_SHADER_SIG_COLOR`. + pl_shader *shader; + + // The texture to render to. This must have params compatible with the + // shader, i.e. `target->params.renderable` for fragment shaders and + // `target->params.storable` for compute shaders. + // + // Note: Even when not using compute shaders, users are advised to always + // set `target->params.storable` if permitted by the `pl_fmt`, since this + // allows the use of compute shaders instead of full-screen quads, which is + // faster on some platforms. + pl_tex target; + + // The target rect to render to. Optional, if left as {0}, then the + // entire texture will be rendered to. + pl_rect2d rect; + + // If set, enables and controls the blending for this pass. Optional. When + // using this with fragment shaders, `target->params.fmt->caps` must + // include `PL_FMT_CAP_BLENDABLE`. + const struct pl_blend_params *blend_params; + + // If set, records the execution time of this dispatch into the given + // timer object. Optional. + // + // Note: If this is set, `pl_dispatch` cannot internally measure the + // execution time of the shader, which means `pl_dispatch_info.samples` may + // be empty as a result. + pl_timer timer; +}; + +#define pl_dispatch_params(...) 
(&(struct pl_dispatch_params) { __VA_ARGS__ }) + +// Dispatch a generated shader (via the pl_shader mechanism). Returns whether +// or not the dispatch was successful. +PL_API bool pl_dispatch_finish(pl_dispatch dp, const struct pl_dispatch_params *params); + +struct pl_dispatch_compute_params { + // The shader to execute. This must be a compute shader with the input + // set to PL_SHADER_SIG_NONE. The output, if it has any, is ignored. + pl_shader *shader; + + // The number of work groups to dispatch in each dimension. If this is left + // as {0} and `width/height` are both set, the number of work groups will + // be inferred from the shader's `compute_group_sizes`. + int dispatch_size[3]; + + // If set, simulate vertex attributes (similar to `pl_dispatch_finish`) + // according to the given dimensions. The first two components of the + // thread's global ID will be interpreted as the X and Y locations. + // + // Optional, ignored if either component is left as 0. + int width, height; + + // If set, records the execution time of this dispatch into the given + // timer object. Optional. + // + // Note: If this is set, `pl_dispatch` cannot internally measure the + // execution time of the shader, which means `pl_dispatch_info.samples` may + // be empty as a result. + pl_timer timer; +}; + +#define pl_dispatch_compute_params(...) (&(struct pl_dispatch_compute_params) { __VA_ARGS__ }) + +// A variant of `pl_dispatch_finish`, this one only dispatches a compute shader +// while ignoring its output (if it has one). It's only useful for shaders +// which have otherwise observable side effects (such as updating state +// objects). +PL_API bool pl_dispatch_compute(pl_dispatch dp, const struct pl_dispatch_compute_params *params); + +enum pl_vertex_coords { + PL_COORDS_ABSOLUTE, // Absolute/integer `target` coordinates + PL_COORDS_RELATIVE, // Relative `target` coordinates in range [0, 1] + PL_COORDS_NORMALIZED, // GL-normalized coordinates in range [-1, 1] +}; + +struct pl_dispatch_vertex_params { + // The shader to execute. This must be a raster shader with the input set + // to `PL_SHADER_SIG_NONE` and the output set to `PL_SHADER_SIG_COLOR`. + // + // Additionally, the shader must not have any attached vertex attributes. + pl_shader *shader; + + // The texture to render to. Requires `target->params.renderable`. + pl_tex target; + + // The target rect to clip the rendering to. (Optional) + pl_rect2d scissors; + + // If set, enables and controls the blending for this pass. Optional. When + // enabled, `target->params.fmt->caps` must include `PL_FMT_CAP_BLENDABLE`. + const struct pl_blend_params *blend_params; + + // The description of the vertex format, including offsets. + // + // Note: `location` is ignored and can safely be left unset. + const struct pl_vertex_attrib *vertex_attribs; + int num_vertex_attribs; + size_t vertex_stride; + + // The index of the vertex position in `vertex_attribs`, as well as the + // interpretation of its contents. + int vertex_position_idx; + enum pl_vertex_coords vertex_coords; + bool vertex_flipped; // flip all vertex y coordinates + + // Type and number of vertices to render. + enum pl_prim_type vertex_type; + int vertex_count; + + // Vertex data. See `pl_pass_run_params.vertex_data`. + const void *vertex_data; + pl_buf vertex_buf; + size_t buf_offset; + + // Index data. See `pl_pass_run_params.index_data`. Optional.
+ const void *index_data; + enum pl_index_format index_fmt; + pl_buf index_buf; + size_t index_offset; + + // If set, records the execution time of this dispatch into the given + // timer object. Optional. + // + // Note: If this is set, `pl_dispatch` cannot internally measure the + // execution time of the shader, which means `pl_dispatch_info.samples` may + // be empty as a result. + pl_timer timer; +}; + +#define pl_dispatch_vertex_params(...) (&(struct pl_dispatch_vertex_params) { __VA_ARGS__ }) + +// Dispatch a generated shader using custom vertices, rather than using a quad +// generated by the dispatch. This allows the use of e.g. custom fragment +// shaders for things like rendering custom UI elements, or possibly doing +// advanced things like sampling from a cube map or spherical video. +PL_API bool pl_dispatch_vertex(pl_dispatch dp, const struct pl_dispatch_vertex_params *params); + +// Cancel an active shader without submitting anything. Useful, for example, +// if the shader was instead merged into a different shader. +PL_API void pl_dispatch_abort(pl_dispatch dp, pl_shader *sh); + +// Deprecated in favor of `pl_cache_save/pl_cache_load` on the `pl_cache` +// associated with the `pl_gpu` this dispatch is using. +PL_DEPRECATED PL_API size_t pl_dispatch_save(pl_dispatch dp, uint8_t *out_cache); +PL_DEPRECATED PL_API void pl_dispatch_load(pl_dispatch dp, const uint8_t *cache); + +PL_API_END + +#endif // LIBPLACEBO_DISPATCH_H_ diff --git a/src/include/libplacebo/dither.h b/src/include/libplacebo/dither.h new file mode 100644 index 0000000..84f17c7 --- /dev/null +++ b/src/include/libplacebo/dither.h @@ -0,0 +1,82 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_DITHER_H_ +#define LIBPLACEBO_DITHER_H_ + +#include <libplacebo/common.h> + +PL_API_BEGIN + +// Generates a deterministic NxN Bayer (ordered) dither matrix, storing the +// result in `data`. `size` must be a power of two. The resulting matrix will +// be roughly uniformly distributed within the range [0,1). +PL_API void pl_generate_bayer_matrix(float *data, int size); + +// Generates a random NxN blue noise texture, storing the result in `data`. +// `size` must be a positive power of two no larger than 256. The resulting +// texture will be roughly uniformly distributed within the range [0,1). +// +// Note: This function is very, *very* slow for large sizes. Generating a +// dither matrix with size 256 can take several seconds on a modern processor.
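//
// [Editor's note] Illustrative sketch, not part of the upstream header: both
// generators fill a caller-owned array of size*size floats, e.g. for a 64x64
// blue noise matrix (a Bayer matrix works the same way via
// pl_generate_bayer_matrix):
//
//   static float noise[64 * 64];
//   pl_generate_blue_noise(noise, 64);
//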
+PL_API void pl_generate_blue_noise(float *data, int size); + +// Defines the border of all error diffusion kernels +#define PL_EDF_MIN_DX (-2) +#define PL_EDF_MAX_DX (2) +#define PL_EDF_MAX_DY (2) + +struct pl_error_diffusion_kernel { + const char *name; // Short and concise identifier + const char *description; // Longer / friendly name + + // The minimum value such that a (y, x) -> (y, x + y * shift) mapping will + // make all error pushing operations affect next column (and after it) + // only. + // + // Higher shift values are significantly more computationally intensive. + int shift; + + // The diffusion factor for (y, x) is pattern[y][x - PL_EDF_MIN_DX] / divisor. + int pattern[PL_EDF_MAX_DY + 1][PL_EDF_MAX_DX - PL_EDF_MIN_DX + 1]; + int divisor; +}; + +// Algorithms with shift=1: +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_simple; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_false_fs; +// Algorithms with shift=2: +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_sierra_lite; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_floyd_steinberg; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_atkinson; +// Algorithms with shift=3, probably too heavy for low end GPUs: +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_jarvis_judice_ninke; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_stucki; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_burkes; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_sierra2; +PL_API extern const struct pl_error_diffusion_kernel pl_error_diffusion_sierra3; + +// A list of built-in error diffusion kernels, terminated by NULL +PL_API extern const struct pl_error_diffusion_kernel * const pl_error_diffusion_kernels[]; +PL_API extern const int pl_num_error_diffusion_kernels; // excluding trailing NULL + +// Find the error diffusion kernel with the given name, or NULL on failure. +PL_API const struct pl_error_diffusion_kernel *pl_find_error_diffusion_kernel(const char *name); + +PL_API_END + +#endif // LIBPLACEBO_DITHER_H_ diff --git a/src/include/libplacebo/dummy.h b/src/include/libplacebo/dummy.h new file mode 100644 index 0000000..c298438 --- /dev/null +++ b/src/include/libplacebo/dummy.h @@ -0,0 +1,131 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_DUMMY_H_ +#define LIBPLACEBO_DUMMY_H_ + +#include <libplacebo/gpu.h> + +PL_API_BEGIN + +// The functions in this file allow creating and manipulating "dummy" contexts. +// A dummy context isn't actually mapped by the GPU, all data exists purely on +// the CPU. It also isn't capable of compiling or executing any shaders, any +// attempts to do so will simply fail. 
+// +// The main use case for this dummy context is for users who want to generate +// advanced shaders that depend on specific GLSL features or support for +// certain types of GPU resources (e.g. LUTs). This dummy context allows such +// shaders to be generated, with all of the referenced shader objects and +// textures simply containing their data in a host-accessible way. + +struct pl_gpu_dummy_params { + // These GPU parameters correspond to their equivalents in `pl_gpu`, and + // must obey the same rules as documented there. The values from + // `pl_gpu_dummy_default_params` are set to support pretty much everything + // and are set for GLSL version 450. + // + // Individual fields such as `glsl.compute` or `glsl.version` description + // can and should be overridden by the user based on their requirements. + // Individual limits should ideally be set based on the corresponding + // `glGet` queries etc. + struct pl_glsl_version glsl; + struct pl_gpu_limits limits; +}; + +#define PL_GPU_DUMMY_DEFAULTS \ + .glsl = { \ + .version = 450, \ + .gles = false, \ + .vulkan = false, \ + .compute = true, \ + .max_shmem_size = SIZE_MAX, \ + .max_group_threads = 1024, \ + .max_group_size = { 1024, 1024, 1024 }, \ + .subgroup_size = 32, \ + .min_gather_offset = INT16_MIN, \ + .max_gather_offset = INT16_MAX, \ + }, \ + .limits = { \ + /* pl_gpu */ \ + .callbacks = false, \ + .thread_safe = true, \ + /* pl_buf */ \ + .max_buf_size = SIZE_MAX, \ + .max_ubo_size = SIZE_MAX, \ + .max_ssbo_size = SIZE_MAX, \ + .max_vbo_size = SIZE_MAX, \ + .max_mapped_size = SIZE_MAX, \ + .max_buffer_texels = UINT64_MAX, \ + /* pl_tex */ \ + .max_tex_1d_dim = UINT32_MAX, \ + .max_tex_2d_dim = UINT32_MAX, \ + .max_tex_3d_dim = UINT32_MAX, \ + .buf_transfer = true, \ + .align_tex_xfer_pitch = 1, \ + .align_tex_xfer_offset = 1, \ + /* pl_pass */ \ + .max_variable_comps = SIZE_MAX, \ + .max_constants = SIZE_MAX, \ + .max_pushc_size = SIZE_MAX, \ + .max_dispatch = { UINT32_MAX, UINT32_MAX, UINT32_MAX }, \ + .fragment_queues = 0, \ + .compute_queues = 0, \ + }, + +#define pl_gpu_dummy_params(...) (&(struct pl_gpu_dummy_params) { PL_GPU_DUMMY_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_gpu_dummy_params pl_gpu_dummy_default_params; + +// Create a dummy GPU context based on the given parameters. This GPU will have +// a format for each host-representable type (i.e. intN_t, floats and doubles), +// in the canonical channel order RGBA. These formats will have every possible +// capability activated, respectively. +// +// If `params` is left as NULL, it defaults to `&pl_gpu_dummy_params`. +PL_API pl_gpu pl_gpu_dummy_create(pl_log log, const struct pl_gpu_dummy_params *params); +PL_API void pl_gpu_dummy_destroy(pl_gpu *gpu); + +// Back-doors into the `pl_tex` and `pl_buf` representations. These allow you +// to access the raw data backing this object. Textures are always laid out in +// a tightly packed manner. +// +// For "placeholder" dummy textures, this always returns NULL. +PL_API uint8_t *pl_buf_dummy_data(pl_buf buf); +PL_API uint8_t *pl_tex_dummy_data(pl_tex tex); + +// Skeleton of `pl_tex_params` containing only the fields relevant to +// `pl_tex_dummy_create`, plus the extra `sampler_type` field. +struct pl_tex_dummy_params { + int w, h, d; + pl_fmt format; + enum pl_sampler_type sampler_type; + void *user_data; +}; + +#define pl_tex_dummy_params(...) (&(struct pl_tex_dummy_params) { __VA_ARGS__ }) + +// Allows creating a "placeholder" dummy texture. 
This is basically a texture +// that isn't even backed by anything. All `pl_tex_*` operations (other than +// `pl_tex_destroy`) performed on it will simply fail. +// +// All of the permissions will be set to `false`, except `sampleable`, which is +// set to `true`. (So you can use it as an input to shader sampling functions) +PL_API pl_tex pl_tex_dummy_create(pl_gpu gpu, const struct pl_tex_dummy_params *params); + +PL_API_END + +#endif // LIBPLACEBO_DUMMY_H_ diff --git a/src/include/libplacebo/filters.h b/src/include/libplacebo/filters.h new file mode 100644 index 0000000..a95649d --- /dev/null +++ b/src/include/libplacebo/filters.h @@ -0,0 +1,415 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_FILTER_KERNELS_H_ +#define LIBPLACEBO_FILTER_KERNELS_H_ + +#include <stdbool.h> +#include <libplacebo/log.h> + +PL_API_BEGIN + +#define PL_FILTER_MAX_PARAMS 2 + +// Invocation parameters for a given kernel +struct pl_filter_ctx { + float radius; + float params[PL_FILTER_MAX_PARAMS]; +}; + +// Represents a single filter function, i.e. kernel or windowing function. +struct pl_filter_function { + // The cosmetic name associated with this filter function. + const char *name; + + // The radius of the filter function. For resizable filters, this gives + // the radius needed to represent a single filter lobe (tap). + float radius; + + // If true, the filter function is resizable (see pl_filter_config.radius) + bool resizable; + + // If true, the filter function is tunable (see pl_filter_config.params) + bool tunable[PL_FILTER_MAX_PARAMS]; + + // If the relevant parameter is tunable, this contains the default values. + float params[PL_FILTER_MAX_PARAMS]; + + // The underlying filter function itself: Computes the weight as a function + // of the offset. All filter functions must be normalized such that x=0 is + // the center point, and in particular weight(0) = 1.0. The functions may + // be undefined for values of x outside [0, radius]. + double (*weight)(const struct pl_filter_ctx *f, double x); + + // If true, this filter represents an opaque placeholder for a more + // sophisticated filter function which does not fit into the pl_filter + // framework. `weight()` will always return 0.0. + bool opaque; +}; + +// Deprecated function, merely checks a->weight == b->weight +PL_DEPRECATED PL_API bool +pl_filter_function_eq(const struct pl_filter_function *a, + const struct pl_filter_function *b); + +// Box filter: Entirely 1.0 within the radius, entirely 0.0 outside of it. +// This is also sometimes called a Dirichlet window +PL_API extern const struct pl_filter_function pl_filter_function_box; + +// Triangle filter: Linear transitions from 1.0 at x=0 to 0.0 at x=radius. +// This is also sometimes called a Bartlett window. 
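//
// [Editor's note] Illustrative sketch, not part of the upstream header: any
// `pl_filter_function` can be evaluated directly through its `weight` callback
// described above, e.g. for the triangle function declared below (the values
// are only examples, assuming its default radius of 1.0):
//
//   struct pl_filter_ctx ctx = { .radius = pl_filter_function_triangle.radius };
//   double w = pl_filter_function_triangle.weight(&ctx, 0.5); // 0.5 for triangle
//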
+PL_API extern const struct pl_filter_function pl_filter_function_triangle; + +// Cosine filter: Ordinary cosine function, single lobe. +PL_API extern const struct pl_filter_function pl_filter_function_cosine; + +// Hann function: Cosine filter named after Julius von Hann. Also commonly +// mislabeled as a "Hanning" function, due to its similarity to the Hamming +// function. +PL_API extern const struct pl_filter_function pl_filter_function_hann; + +// Hamming function: Cosine filter named after Richard Hamming. +PL_API extern const struct pl_filter_function pl_filter_function_hamming; + +// Welch filter: Polynomial function consisting of a single parabolic section. +PL_API extern const struct pl_filter_function pl_filter_function_welch; + +// Kaiser filter: Approximation of the DPSS window using Bessel functions. +// Also sometimes called a Kaiser-Bessel window. +// Parameter [0]: Shape (alpha). Determines the trade-off between the main lobe +// and the side lobes. +PL_API extern const struct pl_filter_function pl_filter_function_kaiser; + +// Blackman filter: Cosine filter named after Ralph Beebe Blackman. +// Parameter [0]: Scale (alpha). Influences the shape. The defaults result in +// zeros at the third and fourth sidelobes. +PL_API extern const struct pl_filter_function pl_filter_function_blackman; + +// Bohman filter: 2nd order Cosine filter. +PL_API extern const struct pl_filter_function pl_filter_function_bohman; + +// Gaussian function: Similar to the Gaussian distribution, this defines a +// bell curve function. +// Parameter [0]: Scale (t), increasing makes the result blurrier. +PL_API extern const struct pl_filter_function pl_filter_function_gaussian; + +// Quadratic function: 2nd order approximation of the gaussian function. Also +// sometimes called a "quadric" window. +PL_API extern const struct pl_filter_function pl_filter_function_quadratic; + +// Sinc function: Widely used for both kernels and windows, sinc(x) = sin(x)/x. +PL_API extern const struct pl_filter_function pl_filter_function_sinc; + +// Jinc function: Similar to sinc, but extended to the 2D domain. Widely +// used as the kernel of polar (EWA) filters. Also sometimes called a Sombrero +// function. +PL_API extern const struct pl_filter_function pl_filter_function_jinc; + +// Sphinx function: Similar to sinc and jinc, but extended to the 3D domain. +// The name is derived from "spherical" sinc. Can be used to filter 3D signals +// in theory. +PL_API extern const struct pl_filter_function pl_filter_function_sphinx; + +// B/C-tunable Spline function: This is a family of commonly used spline +// functions with two tunable parameters. Does not need to be windowed. +// Parameter [0]: "B" +// Parameter [1]: "C" +// Some popular variants of this function are: +// B = 1.0, C = 0.0: "base" Cubic (blurry) +// B = 0.0, C = 0.0: Hermite filter (blocky) +// B = 0.0, C = 0.5: Catmull-Rom filter (sharp) +// B = 1/3, C = 1/3: Mitchell-Netravali filter (soft, doesn't ring) +// B ≈ 0.37, C ≈ 0.31: Robidoux filter (used by ImageMagick) +// B ≈ 0.26, C ≈ 0.37: RobidouxSharp filter (sharper variant of Robidoux) +PL_API extern const struct pl_filter_function pl_filter_function_cubic; +PL_API extern const struct pl_filter_function pl_filter_function_hermite; +#define pl_filter_function_bicubic pl_filter_function_cubic +#define pl_filter_function_bcspline pl_filter_function_cubic + +// Cubic splines with 2/3/4 taps.
Referred to as "spline16", "spline36", and +// "spline64" mainly for historical reasons, based on the number of pixels in +// their window when using them as 2D orthogonal filters. Do not need to be +// windowed. +PL_API extern const struct pl_filter_function pl_filter_function_spline16; +PL_API extern const struct pl_filter_function pl_filter_function_spline36; +PL_API extern const struct pl_filter_function pl_filter_function_spline64; + +// Special filter function for the built-in oversampling algorithm. This is an +// opaque filter with no meaningful representation, though it has one tunable +// parameter controlling the threshold at which to switch back to ordinary +// nearest neighbour sampling. (See `pl_shader_sample_oversample`) +PL_API extern const struct pl_filter_function pl_filter_function_oversample; + +// A list of built-in filter functions, terminated by NULL +// +// Note: May contain extra aliases for the above functions. +PL_API extern const struct pl_filter_function * const pl_filter_functions[]; +PL_API extern const int pl_num_filter_functions; // excluding trailing NULL + +// Find the filter function with the given name, or NULL on failure. +PL_API const struct pl_filter_function *pl_find_filter_function(const char *name); + +// Backwards compatibility with the older configuration API. Redundant with +// `pl_filter_function.name`. May be formally deprecated in the future. + +struct pl_filter_function_preset { + const char *name; + const struct pl_filter_function *function; +}; + +// A list of built-in filter function presets, terminated by {0} +PL_API extern const struct pl_filter_function_preset pl_filter_function_presets[]; +PL_API extern const int pl_num_filter_function_presets; // excluding trailing {0} + +// Find the filter function preset with the given name, or NULL on failure. +PL_API const struct pl_filter_function_preset *pl_find_filter_function_preset(const char *name); + +// Different usage domains for a filter +enum pl_filter_usage { + PL_FILTER_UPSCALING = (1 << 0), + PL_FILTER_DOWNSCALING = (1 << 1), + PL_FILTER_FRAME_MIXING = (1 << 2), + + PL_FILTER_SCALING = PL_FILTER_UPSCALING | PL_FILTER_DOWNSCALING, + PL_FILTER_ALL = PL_FILTER_SCALING | PL_FILTER_FRAME_MIXING, +}; + +// Represents a tuned combination of filter functions, plus parameters +struct pl_filter_config { + // The cosmetic name associated with this filter config. Optional for + // user-provided configs, but always set by built-in configurations. + const char *name; + + // Longer / friendly name. Always set for built-in configurations, + // except for names which are merely aliases of other filters. + const char *description; + + // Allowed and recommended usage domains (respectively) + // + // When it is desired to maintain a simpler user interface, it may be + // recommended to include only scalers whose recommended usage domains + // include the relevant context in which it will be used. + enum pl_filter_usage allowed; + enum pl_filter_usage recommended; + + // The kernel function and (optionally) windowing function. + const struct pl_filter_function *kernel; + const struct pl_filter_function *window; + + // The radius. Ignored if !kernel->resizable. Optional, defaults to + // kernel->radius if unset. + float radius; + + // Parameters for the respective filter function. Ignored if not tunable. + float params[PL_FILTER_MAX_PARAMS]; + float wparams[PL_FILTER_MAX_PARAMS]; + + // Represents a clamping coefficient for negative weights. A value of 0.0 + // (the default) represents no clamping.
A value of 1.0 represents full + // clamping, i.e. all negative weights will be clamped to 0. Values in + // between will be linearly scaled. + float clamp; + + // Additional blur coefficient. This effectively stretches the kernel, + // without changing the effective radius of the filter. Setting this + // to a value of 0.0 is equivalent to disabling it. Values significantly + // below 1.0 may seriously degrade the visual output, and should be used + // with care. + float blur; + + // Additional taper coefficient. This essentially flattens the function's + // center. The values within [-taper, taper] will return 1.0, with the + // actual function being squished into the remainder of [taper, radius]. + // Defaults to 0.0. + float taper; + + // If true, this filter is intended to be used as a polar/2D filter (EWA) + // instead of a separable/1D filter. Does not affect the actual sampling, + // but provides information about how the results are to be interpreted. + bool polar; + + // Antiringing strength. A value of 0.0 disables antiringing, and a value + // of 1.0 enables full-strength antiringing. Defaults to 0.0 if + // unspecified. + // + // Note: This is only included in `pl_filter_config` for convenience. Does + // not affect the actual filter sampling, but provides information to the + // downstream consumer of the `pl_filter`. + float antiring; +}; + +PL_API bool pl_filter_config_eq(const struct pl_filter_config *a, + const struct pl_filter_config *b); + +// Samples a given filter configuration at a given x coordinate, while +// respecting all parameters of the configuration. +PL_API double pl_filter_sample(const struct pl_filter_config *c, double x); + +// A list of built-in filter configurations. Since they are just combinations +// of the above filter functions, they are not described in much further +// detail.
+PL_API extern const struct pl_filter_config pl_filter_spline16; // 2 taps +PL_API extern const struct pl_filter_config pl_filter_spline36; // 3 taps +PL_API extern const struct pl_filter_config pl_filter_spline64; // 4 taps +PL_API extern const struct pl_filter_config pl_filter_nearest; +PL_API extern const struct pl_filter_config pl_filter_box; +PL_API extern const struct pl_filter_config pl_filter_bilinear; +PL_API extern const struct pl_filter_config pl_filter_gaussian; +// Sinc family (all configured to 3 taps): +PL_API extern const struct pl_filter_config pl_filter_sinc; // unwindowed +PL_API extern const struct pl_filter_config pl_filter_lanczos; // sinc-sinc +PL_API extern const struct pl_filter_config pl_filter_ginseng; // sinc-jinc +PL_API extern const struct pl_filter_config pl_filter_ewa_jinc; // unwindowed +PL_API extern const struct pl_filter_config pl_filter_ewa_lanczos; // jinc-jinc +PL_API extern const struct pl_filter_config pl_filter_ewa_lanczossharp; +PL_API extern const struct pl_filter_config pl_filter_ewa_lanczos4sharpest; +PL_API extern const struct pl_filter_config pl_filter_ewa_ginseng; // jinc-sinc +PL_API extern const struct pl_filter_config pl_filter_ewa_hann; // jinc-hann +// Spline family +PL_API extern const struct pl_filter_config pl_filter_bicubic; +PL_API extern const struct pl_filter_config pl_filter_hermite; +PL_API extern const struct pl_filter_config pl_filter_catmull_rom; +PL_API extern const struct pl_filter_config pl_filter_mitchell; +PL_API extern const struct pl_filter_config pl_filter_mitchell_clamp; // clamp = 1.0 +PL_API extern const struct pl_filter_config pl_filter_robidoux; +PL_API extern const struct pl_filter_config pl_filter_robidouxsharp; +PL_API extern const struct pl_filter_config pl_filter_ewa_robidoux; +PL_API extern const struct pl_filter_config pl_filter_ewa_robidouxsharp; +// Special/opaque filters +PL_API extern const struct pl_filter_config pl_filter_oversample; + +// Backwards compatibility +#define pl_filter_triangle pl_filter_bilinear +#define pl_oversample_frame_mixer pl_filter_oversample + +// A list of built-in filter configs, terminated by NULL +PL_API extern const struct pl_filter_config * const pl_filter_configs[]; +PL_API extern const int pl_num_filter_configs; // excluding trailing NULL + +// Find the filter config with the given name, or NULL on failure. +// `usage` restricts the valid usage (based on `pl_filter_config.allowed`). +PL_API const struct pl_filter_config * +pl_find_filter_config(const char *name, enum pl_filter_usage usage); + +// Backward compatibility with the previous filter configuration API. Redundant +// with pl_filter_config.name/description. May be deprecated in the future. +struct pl_filter_preset { + const char *name; + const struct pl_filter_config *filter; + + // Longer / friendly name, or NULL for aliases + const char *description; +}; + +// A list of built-in filter presets, terminated by {0} +PL_API extern const struct pl_filter_preset pl_filter_presets[]; +PL_API extern const int pl_num_filter_presets; // excluding trailing {0} + +// Find the filter preset with the given name, or NULL on failure. +PL_API const struct pl_filter_preset *pl_find_filter_preset(const char *name); + +// Parameters for filter generation. +struct pl_filter_params { + // The particular filter configuration to be sampled. config.kernel must + // be set to a valid pl_filter_function. + struct pl_filter_config config; + + // The precision of the resulting LUT. 
A value of 64 should be fine for + // most practical purposes, but higher or lower values may be justified + // depending on the use case. This value must be set to something > 0. + int lut_entries; + + // --- Polar filters only (config.polar) + + // As a micro-optimization, all samples below this cutoff value will be + // ignored when updating the cutoff radius. Setting it to a value of 0.0 + // disables this optimization. + float cutoff; + + // --- Separable filters only (!config.polar) + + // Indicates the maximum row size that is supported by the calling code, or + // 0 for no limit. + int max_row_size; + + // Indicates the row stride alignment. For some use cases (e.g. uploading + // the weights as a texture), there are certain alignment requirements for + // each row. The chosen row_size will always be a multiple of this value. + // Specifying 0 indicates no alignment requirements. + int row_stride_align; + + // --- Deprecated options + float filter_scale PL_DEPRECATED; // no effect, use `config.blur` instead +}; + +#define pl_filter_params(...) (&(struct pl_filter_params) { __VA_ARGS__ }) + +// Represents an initialized instance of a particular filter, with a +// precomputed LUT. The interpretation of the LUT depends on the type of the +// filter (polar or separable). +typedef const struct pl_filter_t { + // Deep copy of the parameters, for convenience. + struct pl_filter_params params; + + // Contains the true radius of the computed filter. This may be + // smaller than the configured radius depending on the exact filter + // parameters used. Mainly relevant for polar filters, since + // it affects the value range of *weights. + float radius; + + // Radius of the first zero crossing (main lobe size). + float radius_zero; + + // The computed look-up table (LUT). For polar filters, this is interpreted + // as a 1D array with dimensions [lut_entries] containing the raw filter + // samples on the scale [0, radius]. For separable (non-polar) filters, + // this is interpreted as a 2D array with dimensions + // [lut_entries][row_stride]. The inner rows contain the `row_size` samples + // to convolve with the corresponding input pixels. The outer coordinate is + // used to vary the fractional offset (phase). So for example, if the + // sample position to reconstruct is directly aligned with the source + // texels, you would use the values from weights[0]. If the sample position + // to reconstruct is exactly half-way between two source texels (180° out + // of phase), you would use the values from weights[lut_entries/2]. + const float *weights; + + // --- separable filters only (!params.config.polar) + + // The number of source texels to convolve over for each row. This value + // will never exceed the given `max_row_size`. If the filter ends up + // cut off because of this, the bool `insufficient` will be set to true. + int row_size; + bool insufficient; + + // The separation (in *weights) between each row of the filter. Always + // a multiple of params.row_stride_align. + int row_stride; + + // --- deprecated / removed fields + float radius_cutoff PL_DEPRECATED; // identical to `radius` +} *pl_filter; + +// Generate (compute) a filter instance based on a given filter configuration. +// The resulting pl_filter must be freed with `pl_filter_free` when no longer +// needed. Returns NULL if filter generation fails due to invalid parameters +// (i.e. missing a required parameter).
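//
// [Editor's note] A minimal sketch, not part of the upstream header: `log` is
// assumed to be a valid pl_log, and the LUT size is purely illustrative.
//
//   pl_filter flt = pl_filter_generate(log, pl_filter_params(
//       .config      = pl_filter_lanczos,
//       .lut_entries = 64,
//   ));
//   if (flt) {
//       // ... use flt->weights, flt->row_size, flt->row_stride ...
//       pl_filter_free(&flt);
//   }
//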
+PL_API pl_filter pl_filter_generate(pl_log log, const struct pl_filter_params *params); +PL_API void pl_filter_free(pl_filter *filter); + +PL_API_END + +#endif // LIBPLACEBO_FILTER_KERNELS_H_ diff --git a/src/include/libplacebo/gamut_mapping.h b/src/include/libplacebo/gamut_mapping.h new file mode 100644 index 0000000..a92a73b --- /dev/null +++ b/src/include/libplacebo/gamut_mapping.h @@ -0,0 +1,182 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_GAMUT_MAPPING_H_ +#define LIBPLACEBO_GAMUT_MAPPING_H_ + +#include <libplacebo/common.h> +#include <libplacebo/colorspace.h> + +PL_API_BEGIN + +struct pl_gamut_map_params; +struct pl_gamut_map_function { + const char *name; // Identifier + const char *description; // Friendly / longer name + + // The gamut-mapping function itself. Iterates over all values in `lut`, + // and adapts them as needed. + void (*map)(float *lut, const struct pl_gamut_map_params *params); + + // Returns true if `map` supports both stretching and contracting the + // gamut. In this case, `map` is always executed, even if the output gamut + // is larger than the input gamut. + bool bidirectional; + + // Private data. Unused by libplacebo, but may be accessed by `map`. + void *priv; +}; + +struct pl_gamut_map_constants { + // (Relative) chromaticity protection zone for perceptual mapping [0,1] + float perceptual_deadzone; + + // Strength of the perceptual saturation mapping component [0,1] + float perceptual_strength; + + // I vs C curve gamma to use for colorimetric clipping [0,10] + float colorimetric_gamma; + + // Knee point to use for softclipping methods (perceptual, softclip) [0,1] + float softclip_knee; + + // Desaturation strength (for softclip only) [0,1] + float softclip_desat; +}; + +#define PL_GAMUT_MAP_CONSTANTS \ + .colorimetric_gamma = 1.80f, \ + .softclip_knee = 0.70f, \ + .softclip_desat = 0.35f, \ + .perceptual_deadzone = 0.30f, \ + .perceptual_strength = 0.80f, + +struct pl_gamut_map_params { + // If `function` is NULL, defaults to `pl_gamut_map_clip`. + const struct pl_gamut_map_function *function; + + // The desired input/output primaries. This affects the subjective color + // volume in which the desired mapping shall take place. + struct pl_raw_primaries input_gamut; + struct pl_raw_primaries output_gamut; + + // Minimum/maximum luminance (PQ) of the target display. Note that the same + // value applies to both the input and output, since it's assumed that tone + // mapping has already happened by this stage. This effectively defines the + // legal gamut boundary in RGB space. + // + // This also defines the I channel value range, for `pl_gamut_map_generate` + float min_luma; + float max_luma; + + // Common constants, should be initialized to PL_GAMUT_MAP_CONSTANTS if + // not intending to override them further. 
+ struct pl_gamut_map_constants constants; + + // -- LUT generation options (for `pl_gamut_map_generate` only) + + // The size of the resulting LUT, per channel. + // + // Note: For quality, it's generally best to increase h > I > C + int lut_size_I; + int lut_size_C; + int lut_size_h; + + // The stride (in number of floats) between elements in the resulting LUT. + int lut_stride; + + // -- Removed parameters + float chroma_margin PL_DEPRECATED; // non-functional +}; + +#define pl_gamut_map_params(...) (&(struct pl_gamut_map_params) { \ + .constants = { PL_GAMUT_MAP_CONSTANTS }, \ + __VA_ARGS__ \ +}) + +// Note: Only does pointer equality testing on `function` +PL_API bool pl_gamut_map_params_equal(const struct pl_gamut_map_params *a, + const struct pl_gamut_map_params *b); + +// Returns true if the given gamut mapping configuration effectively represents +// a no-op configuration. Gamut mapping can be skipped in this case. +PL_API bool pl_gamut_map_params_noop(const struct pl_gamut_map_params *params); + +// Generate a gamut-mapping LUT for a given configuration. LUT samples are +// stored as IPTPQc4 values, but the LUT itself is indexed by IChPQc4, spanning +// the effective range [min_luma, max_luma] × [0, 0.5] × [-pi,pi]. +// +// This ordering is designed to keep frequently co-occurring values close in +// memory, while permitting simple wrapping of the 'h' component. +PL_API void pl_gamut_map_generate(float *out, const struct pl_gamut_map_params *params); + +// Samples a gamut mapping function for a single IPTPQc4 value. The input +// values are updated in-place. +PL_API void pl_gamut_map_sample(float x[3], const struct pl_gamut_map_params *params); + +// Performs no gamut-mapping, just hard clips out-of-range colors per-channel. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_clip; + +// Performs a perceptually balanced (saturation) gamut mapping, using a soft +// knee function to preserve in-gamut colors, followed by a final softclip +// operation. This works bidirectionally, meaning it can both compress and +// expand the gamut. Behaves similarly to a blend of `saturation` and `softclip`. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_perceptual; + +// Performs a perceptually balanced gamut mapping using a soft knee function to +// roll-off clipped regions, and a hue shifting function to preserve saturation. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_softclip; + +// Performs relative colorimetric clipping, while maintaining an exponential +// relationship between brightness and chromaticity. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_relative; + +// Performs simple RGB->RGB saturation mapping. The input R/G/B channels are +// mapped directly onto the output R/G/B channels. Will never clip, but will +// distort all hues and/or result in a faded look. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_saturation; + +// Performs absolute colorimetric clipping. Like pl_gamut_map_relative, but +// does not adapt the white point. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_absolute; + +// Performs constant-luminance colorimetric clipping, desaturating colors +// towards white until they're in-range. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_desaturate; + +// Uniformly darkens the input slightly to prevent clipping on blown-out +// highlights, then clamps colorimetrically to the input gamut boundary, +// biased slightly to preserve chromaticity over luminance.
+PL_API extern const struct pl_gamut_map_function pl_gamut_map_darken; + +// Performs no gamut mapping, but simply highlights out-of-gamut pixels. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_highlight; + +// Linearly/uniformly desaturates the image in order to bring the entire +// image into the target gamut. +PL_API extern const struct pl_gamut_map_function pl_gamut_map_linear; + +// A list of built-in gamut mapping functions, terminated by NULL +PL_API extern const struct pl_gamut_map_function * const pl_gamut_map_functions[]; +PL_API extern const int pl_num_gamut_map_functions; // excluding trailing NULL + +// Find the gamut mapping function with the given name, or NULL on failure. +PL_API const struct pl_gamut_map_function *pl_find_gamut_map_function(const char *name); + +PL_API_END + +#endif // LIBPLACEBO_GAMUT_MAPPING_H_ diff --git a/src/include/libplacebo/gpu.h b/src/include/libplacebo/gpu.h new file mode 100644 index 0000000..a63fdf7 --- /dev/null +++ b/src/include/libplacebo/gpu.h @@ -0,0 +1,1464 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_GPU_H_ +#define LIBPLACEBO_GPU_H_ + +#include <stddef.h> +#include <stdbool.h> +#include <stdint.h> + +#include <libplacebo/common.h> +#include <libplacebo/cache.h> +#include <libplacebo/log.h> + +PL_API_BEGIN + +// These are not memory managed, and should represent compile-time constants +typedef const char *pl_debug_tag; +#define PL_DEBUG_TAG (__FILE__ ":" PL_TOSTRING(__LINE__)) + +// Type of a shader input descriptor. +enum pl_desc_type { + PL_DESC_INVALID = 0, + PL_DESC_SAMPLED_TEX, // C: pl_tex* GLSL: combined texture sampler + // (`pl_tex->params.sampleable` must be set) + PL_DESC_STORAGE_IMG, // C: pl_tex* GLSL: storage image + // (`pl_tex->params.storable` must be set) + PL_DESC_BUF_UNIFORM, // C: pl_buf* GLSL: uniform buffer + // (`pl_buf->params.uniform` must be set) + PL_DESC_BUF_STORAGE, // C: pl_buf* GLSL: storage buffer + // (`pl_buf->params.storable` must be set) + PL_DESC_BUF_TEXEL_UNIFORM,// C: pl_buf* GLSL: uniform samplerBuffer + // (`pl_buf->params.uniform` and `format` must be set) + PL_DESC_BUF_TEXEL_STORAGE,// C: pl_buf* GLSL: uniform imageBuffer + // (`pl_buf->params.uniform` and `format` must be set) + PL_DESC_TYPE_COUNT +}; + +// This file contains the definition of an API which is designed to abstract +// away from platform-specific APIs like the various OpenGL variants, Direct3D +// and Vulkan in a common way. It is a much more limited API than those APIs, +// since it only targets a very small common subset of features that is +// needed to implement libplacebo's rendering. +// +// NOTE: Most, but not all, parameter conditions (phrases such as "must" or +// "valid usage") are explicitly tested and result in error messages followed by +// graceful failure. Exceptions are noted where they exist.
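//
// [Editor's note] Illustrative sketch, not part of the upstream header: a
// typical pattern is to query the capability structs documented below before
// choosing a code path; `gpu` is assumed to be a valid pl_gpu handle created
// elsewhere.
//
//   if (gpu->glsl.compute && gpu->limits.max_tex_2d_dim >= 4096) {
//       // a compute shader path is available for 4K-sized textures
//   }
//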
+ +// Structure which wraps metadata describing GLSL capabilities. +struct pl_glsl_version { + int version; // GLSL version (e.g. 450), for #version + bool gles; // GLSL ES semantics (ESSL) + bool vulkan; // GL_KHR_vulkan_glsl semantics + + // Compute shader support and limits. If `compute` is false, then all + // of the remaining fields in this section are {0}. + bool compute; + size_t max_shmem_size; // maximum compute shader shared memory size + uint32_t max_group_threads; // maximum number of local threads per work group + uint32_t max_group_size[3]; // maximum work group size per dimension + + // If nonzero, signals availability of shader subgroups. This guarantess + // availability of all of the following extensions: + // - GL_KHR_shader_subgroup_basic + // - GL_KHR_shader_subgroup_vote + // - GL_KHR_shader_subgroup_arithmetic + // - GL_KHR_shader_subgroup_ballot + // - GL_KHR_shader_subgroup_shuffle + uint32_t subgroup_size; + + // Miscellaneous shader limits + int16_t min_gather_offset; // minimum `textureGatherOffset` offset + int16_t max_gather_offset; // maximum `textureGatherOffset` offset +}; + +// Backwards compatibility alias +#define pl_glsl_desc pl_glsl_version + +// Structure defining the physical limits and capabilities of this GPU +// instance. If a limit is given as 0, that means that feature is unsupported. +struct pl_gpu_limits { + // --- pl_gpu + bool thread_safe; // `pl_gpu` calls are thread-safe + bool callbacks; // supports asynchronous GPU callbacks + + // --- pl_buf + size_t max_buf_size; // maximum size of any buffer + size_t max_ubo_size; // maximum size of a `uniform` buffer + size_t max_ssbo_size; // maximum size of a `storable` buffer + size_t max_vbo_size; // maximum size of a `drawable` buffer + size_t max_mapped_size; // maximum size of a `host_mapped` buffer + uint64_t max_buffer_texels; // maximum number of texels in a texel buffer + bool host_cached; // if true, PL_BUF_MEM_HOST buffers are cached + + // Required alignment for PL_HANDLE_HOST_PTR imports. This is provided + // merely as a hint to the user. If the host pointer being imported is + // misaligned, libplacebo will internally round (over-map) the region. + size_t align_host_ptr; + + // --- pl_tex + uint32_t max_tex_1d_dim; // maximum width for a 1D texture + uint32_t max_tex_2d_dim; // maximum width/height for a 2D texture (required) + uint32_t max_tex_3d_dim; // maximum width/height/depth for a 3D texture + bool blittable_1d_3d; // supports blittable 1D/3D textures + bool buf_transfer; // supports `pl_tex_transfer_params.buf` + + // These don't represent hard limits but indicate performance hints for + // optimal alignment. For best performance, the corresponding field + // should be aligned to a multiple of these. They will always be a power + // of two. + size_t align_tex_xfer_pitch; // optimal `pl_tex_transfer_params.row_pitch` + size_t align_tex_xfer_offset; // optimal `pl_tex_transfer_params.buf_offset` + + // --- pl_pass + size_t max_variable_comps; // maximum components passed in variables + size_t max_constants; // maximum `pl_pass_params.num_constants` + bool array_size_constants; // push constants can be used to size arrays + size_t max_pushc_size; // maximum `push_constants_size` + size_t align_vertex_stride; // alignment of `pl_pass_params.vertex_stride` + uint32_t max_dispatch[3]; // maximum dispatch size per dimension + + // Note: At least one of `max_variable_comps` or `max_ubo_size` is + // guaranteed to be nonzero. 
+ + // As a performance hint, the GPU may signal the number of command queues + // it has for fragment and compute shaders, respectively. Users may use + // this information to decide the appropriate type of shader to dispatch. + uint32_t fragment_queues; + uint32_t compute_queues; +}; + +// Backwards compatibility aliases +#define max_xfer_size max_buf_size +#define align_tex_xfer_stride align_tex_xfer_pitch + +// Some `pl_gpu` operations allow sharing GPU resources with external APIs - +// examples include interop with other graphics APIs such as CUDA, and also +// various hardware decoding APIs. This defines the mechanism underpinning the +// communication of such an interoperation. +typedef uint64_t pl_handle_caps; +enum pl_handle_type { + PL_HANDLE_FD = (1 << 0), // `int fd` for POSIX-style APIs + PL_HANDLE_WIN32 = (1 << 1), // `HANDLE` for win32 API + PL_HANDLE_WIN32_KMT = (1 << 2), // `HANDLE` for pre-Windows-8 win32 API + PL_HANDLE_DMA_BUF = (1 << 3), // 'int fd' for a dma_buf fd + PL_HANDLE_HOST_PTR = (1 << 4), // `void *` for a host-allocated pointer + PL_HANDLE_MTL_TEX = (1 << 5), // `MTLTexture*` for Apple platforms + PL_HANDLE_IOSURFACE = (1 << 6), // `IOSurfaceRef` for Apple platforms +}; + +struct pl_gpu_handle_caps { + pl_handle_caps tex; // supported handles for `pl_tex` + `pl_shared_mem` + pl_handle_caps buf; // supported handles for `pl_buf` + `pl_shared_mem` + pl_handle_caps sync; // supported handles for `pl_sync` / semaphores +}; + +// Wrapper for the handle used to communicate a shared resource externally. +// This handle is owned by the `pl_gpu` - if a user wishes to use it in a way +// that takes over ownership (e.g. importing into some APIs), they must clone +// the handle before doing so (e.g. using `dup` for fds). It is important to +// read the external API documentation _very_ carefully as different handle +// types may be managed in different ways. (eg: CUDA takes ownership of an fd, +// but does not take ownership of a win32 handle). +union pl_handle { + int fd; // PL_HANDLE_FD / PL_HANDLE_DMA_BUF + void *handle; // PL_HANDLE_WIN32 / PL_HANDLE_WIN32_KMT / PL_HANDLE_MTL_TEX / PL_HANDLE_IOSURFACE + void *ptr; // PL_HANDLE_HOST_PTR +}; + +// Structure encapsulating memory that is shared between libplacebo and the +// user. This memory can be imported into external APIs using the handle. +// +// If the object a `pl_shared_mem` belongs to is destroyed (e.g. via +// `pl_buf_destroy`), the handle becomes undefined, as do the contents of the +// memory it points to, as well as any external API objects imported from it. +struct pl_shared_mem { + union pl_handle handle; + size_t size; // the total size of the memory referenced by this handle + size_t offset; // the offset of the object within the referenced memory + + // Note: `size` is optional for some APIs and handle types, in particular + // when importing DMABUFs or D3D11 textures. + + // For PL_HANDLE_DMA_BUF, this specifies the DRM format modifier that + // describes this resource. Note that when importing `pl_buf`, this must + // be DRM_FORMAT_MOD_LINEAR. For importing `pl_tex`, it can be any + // format modifier supported by the implementation. + uint64_t drm_format_mod; + + // When importing a `pl_tex` of type PL_HANDLE_DMA_BUF, this can be used to + // set the image stride (AKA pitch) in memory. If left as 0, defaults to + // the image width/height. + size_t stride_w; + size_t stride_h; + + // When importing a `pl_tex` of type PL_HANDLE_MTL_TEX, this determines + // which plane is imported (0 - 2). 
+ unsigned plane; +}; + +// Structure grouping PCI bus address fields for GPU devices +struct pl_gpu_pci_address { + uint32_t domain; + uint32_t bus; + uint32_t device; + uint32_t function; +}; + +typedef const struct pl_fmt_t *pl_fmt; + +// Abstract device context which wraps an underlying graphics context and can +// be used to dispatch rendering commands. +// +// Thread-safety: Depends on `pl_gpu_limits.thread_safe` +typedef const struct pl_gpu_t { + pl_log log; + + struct pl_glsl_version glsl; // GLSL features supported by this GPU + struct pl_gpu_limits limits; // physical device limits and capabilities + + // Fields relevant to external API interop. If the underlying device does + // not support interop with other APIs, these will all be {0}. + struct pl_gpu_handle_caps export_caps; // supported handles for exporting + struct pl_gpu_handle_caps import_caps; // supported handles for importing + uint8_t uuid[16]; // underlying device UUID + + // Supported texture formats, in preference order. (If there are multiple + // similar formats, the "better" ones come first) + pl_fmt *formats; + int num_formats; + + // PCI Bus address of the underlying device, to help with interop. + // This will only be filled in if interop is supported. + struct pl_gpu_pci_address pci; +} *pl_gpu; + +// Attach a pl_cache object to this GPU instance. This cache will be +// used to cache all compiled shaders, as well as several other shader objects +// (e.g. cached 3DLUTs). Calling this with `cache = NULL` disables the cache. +// +// Note: Calling this after shaders have already been compiled will not +// retroactively add those shaders to the cache, so it's recommended to set +// this early, before creating any passes. +PL_API void pl_gpu_set_cache(pl_gpu gpu, pl_cache cache); + +enum pl_fmt_type { + PL_FMT_UNKNOWN = 0, // also used for inconsistent multi-component formats + PL_FMT_UNORM, // unsigned, normalized integer format (sampled as float) + PL_FMT_SNORM, // signed, normalized integer format (sampled as float) + PL_FMT_UINT, // unsigned integer format (sampled as integer) + PL_FMT_SINT, // signed integer format (sampled as integer) + PL_FMT_FLOAT, // (signed) float formats, any bit size + PL_FMT_TYPE_COUNT, +}; + +enum pl_fmt_caps { + PL_FMT_CAP_SAMPLEABLE = 1 << 0, // may be sampled from (PL_DESC_SAMPLED_TEX) + PL_FMT_CAP_STORABLE = 1 << 1, // may be used as storage image (PL_DESC_STORAGE_IMG) + PL_FMT_CAP_LINEAR = 1 << 2, // may be linearly samplied from (PL_TEX_SAMPLE_LINEAR) + PL_FMT_CAP_RENDERABLE = 1 << 3, // may be rendered to (pl_pass_params.target_fmt) + PL_FMT_CAP_BLENDABLE = 1 << 4, // may be blended to (pl_pass_params.enable_blend) + PL_FMT_CAP_BLITTABLE = 1 << 5, // may be blitted from/to (pl_tex_blit) + PL_FMT_CAP_VERTEX = 1 << 6, // may be used as a vertex attribute + PL_FMT_CAP_TEXEL_UNIFORM = 1 << 7, // may be used as a texel uniform buffer + PL_FMT_CAP_TEXEL_STORAGE = 1 << 8, // may be used as a texel storage buffer + PL_FMT_CAP_HOST_READABLE = 1 << 9, // may be used with `host_readable` textures + PL_FMT_CAP_READWRITE = 1 << 10, // may be used with PL_DESC_ACCESS_READWRITE + + // Notes: + // - PL_FMT_CAP_LINEAR also implies PL_FMT_CAP_SAMPLEABLE + // - PL_FMT_CAP_STORABLE also implies `pl_gpu.glsl.compute` + // - PL_FMT_CAP_BLENDABLE implies PL_FMT_CAP_RENDERABLE + // - PL_FMT_CAP_VERTEX implies that the format is non-opaque + // - PL_FMT_CAP_HOST_READABLE implies that the format is non-opaque +}; + +struct pl_fmt_plane { + // Underlying format of this particular sub-plane. 
This describes the + // components, texel size and host representation for the purpose of + // e.g. transfers, blits, and sampling. + pl_fmt format; + + // X/Y subsampling shift factor for this plane. + uint8_t shift_x, shift_y; +}; + +// Structure describing a texel/vertex format. +struct pl_fmt_t { + const char *name; // symbolic name for this format (e.g. rgba32f) + uint64_t signature; // unique but stable signature (for pass reusability) + + enum pl_fmt_type type; // the format's data type and interpretation + enum pl_fmt_caps caps; // the features supported by this format + int num_components; // number of components for this format + int component_depth[4]; // meaningful bits per component, texture precision + size_t internal_size; // internal texel size (for blit compatibility) + + // For planar formats, this provides a description of each sub-plane. + // + // Note on planar formats: Planar formats are always opaque and typically + // support only a limit subset of capabilities (or none at all). Access + // should be done via sub-planes. (See `pl_tex.planes`) + struct pl_fmt_plane planes[4]; + int num_planes; // or 0 for non-planar textures + + // This controls the relationship between the data as seen by the host and + // the way it's interpreted by the texture. The host representation is + // always tightly packed (no padding bits in between each component). + // + // This representation assumes little endian ordering, i.e. components + // being ordered from LSB to MSB in memory. Note that for oddly packed + // formats like rgb10a2 or rgb565, this is inconsistent with the naming. + // (That is to say, rgb565 has sample order {2, 1, 0} under this convention + // - because rgb565 treats the R channel as the *most* significant bits) + // + // If `opaque` is true, then there's no meaningful correspondence between + // the two, and all of the remaining fields in this section are unset. + // + // If `emulated` is true, then this format doesn't actually exist on the + // GPU as an uploadable texture format - and any apparent support is being + // emulated (typically using compute shaders in the upload path). + bool opaque; + bool emulated; + size_t texel_size; // total size in bytes per texel + size_t texel_align; // texel alignment requirements (bytes) + int host_bits[4]; // number of meaningful bits in host memory + int sample_order[4]; // sampled index for each component, e.g. + // {2, 1, 0, 3} for BGRA textures + + // For sampleable formats, this bool indicates whether or not the format + // is compatible with `textureGather()` + bool gatherable; + + // If usable as a vertex or texel buffer format, this gives the GLSL type + // corresponding to the data. (e.g. vec4) + const char *glsl_type; + + // If usable as a storage image or texel storage buffer + // (PL_FMT_CAP_STORABLE / PL_FMT_CAP_TEXEL_STORAGE), this gives the GLSL + // texel format corresponding to the format (e.g. rgba16ui), if any. This + // field may be NULL, in which case the format modifier may be left + // unspecified. + const char *glsl_format; + + // If available, this gives the fourcc associated with the host + // representation. In particular, this is intended for use with + // PL_HANDLE_DMA_BUF, where this field will match the DRM format from + // <drm_fourcc.h>. May be 0, for formats without matching DRM fourcc. + uint32_t fourcc; + + // If `fourcc` is set, this contains the list of supported drm format + // modifiers for this format. 
+ const uint64_t *modifiers; + int num_modifiers; +}; + +// Returns whether or not a pl_fmt's components are ordered sequentially +// in memory in the order RGBA. +PL_API bool pl_fmt_is_ordered(pl_fmt fmt); + +// Returns whether or not a pl_fmt is sampled as a float (e.g. UNORM) +PL_API bool pl_fmt_is_float(pl_fmt fmt); + +// Returns whether or not a pl_fmt supports a given DRM modifier. +PL_API bool pl_fmt_has_modifier(pl_fmt fmt, uint64_t modifier); + +// Helper function to find a format with a given number of components and +// minimum effective precision per component. If `host_bits` is set, then the +// format will always be non-opaque, unpadded, ordered and have exactly this +// bit depth for each component. Finally, all `caps` must be supported. +PL_API pl_fmt pl_find_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components, + int min_depth, int host_bits, enum pl_fmt_caps caps); + +// Finds a vertex format for a given configuration. The resulting vertex will +// have a component depth equivalent to the sizeof() the equivalent host type. +// (e.g. PL_FMT_FLOAT will always have sizeof(float)) +PL_API pl_fmt pl_find_vertex_fmt(pl_gpu gpu, enum pl_fmt_type type, int num_components); + +// Find a format based on its name. +PL_API pl_fmt pl_find_named_fmt(pl_gpu gpu, const char *name); + +// Find a format based on its fourcc. +PL_API pl_fmt pl_find_fourcc(pl_gpu gpu, uint32_t fourcc); + +// A generic 'timer query' object. These can be used to measure an +// approximation of the GPU execution time of a given operation. Due to the +// highly asynchronous nature of GPUs, the actual results of any individual +// timer query may be delayed by quite a bit. As such, users should avoid +// trying to pair any particular GPU command with any particular timer query +// result, and only reuse `pl_timer` objects with identical operations. The +// results of timer queries are guaranteed to be in-order, but individual +// queries may be dropped, and some operations might not record timer results +// at all. (For example, if the underlying hardware does not support timer +// queries for a given operation type) +// +// Thread-safety: Unsafe +typedef struct pl_timer_t *pl_timer; + +// Creates a new timer object. This may return NULL, for example if the +// implementation does not support timers, but since passing NULL to +// `pl_timer_destroy` and `pl_timer_query` is safe, users generally need not +// concern themselves with handling this. +PL_API pl_timer pl_timer_create(pl_gpu gpu); +PL_API void pl_timer_destroy(pl_gpu gpu, pl_timer *); + +// Queries any results that have been measured since the last execution of +// `pl_timer_query`. There may be more than one result, in which case the user +// should simply call the function again to get the subsequent values. This +// function returns a value of 0 in the event that there are no more +// unprocessed results. +// +// The results are reported in nanoseconds, but the actual precision of the +// timestamp queries may be significantly lower. +// +// Note: Results do not queue up indefinitely. Generally, the implementation +// will only keep track of a small, fixed number of results internally. Make +// sure to include this function as part of your main rendering loop to process +// all of its results, or older results will be overwritten by newer ones. 
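+//
+// Illustrative usage sketch (editorial addition, not part of the upstream
+// header), draining all pending results once per frame, assuming `gpu` and
+// `timer` are valid and `timer` was attached to some earlier operation:
+//
+//     uint64_t ns;
+//     while ((ns = pl_timer_query(gpu, timer)))
+//         last_pass_ms = ns * 1e-6; // keep only the most recent result
+//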
+PL_API uint64_t pl_timer_query(pl_gpu gpu, pl_timer); + +enum pl_buf_mem_type { + PL_BUF_MEM_AUTO = 0, // use whatever seems most appropriate + PL_BUF_MEM_HOST, // try allocating from host memory (RAM) + PL_BUF_MEM_DEVICE, // try allocating from device memory (VRAM) + PL_BUF_MEM_TYPE_COUNT, + + // Note: This distinction only matters for discrete GPUs +}; + +// Structure describing a buffer. +struct pl_buf_params { + size_t size; // size in bytes (must be <= `pl_gpu_limits.max_buf_size`) + bool host_writable; // contents may be updated via pl_buf_write() + bool host_readable; // contents may be read back via pl_buf_read() + bool host_mapped; // create a persistent, RW mapping (pl_buf.data) + + // May be used as PL_DESC_BUF_UNIFORM or PL_DESC_BUF_TEXEL_UNIFORM. + // Requires `size <= pl_gpu_limits.max_ubo_size` + bool uniform; + + // May be used as PL_DESC_BUF_STORAGE or PL_DESC_BUF_TEXEL_STORAGE. + // Requires `size <= pl_gpu_limits.max_ssbo_size` + bool storable; + + // May be used as the source of vertex data for `pl_pass_run`. + bool drawable; + + // Provide a hint for the memory type you want to use when allocating + // this buffer's memory. + // + // Note: Restrictions may apply depending on the usage flags. In + // particular, allocating buffers with `uniform` or `storable` enabled from + // non-device memory will almost surely fail. + enum pl_buf_mem_type memory_type; + + // Setting this to a format with the `PL_FMT_CAP_TEXEL_*` capability allows + // this buffer to be used as a `PL_DESC_BUF_TEXEL_*`, when `uniform` and + // `storage` are respectively also enabled. + pl_fmt format; + + // At most one of `export_handle` and `import_handle` can be set for a + // buffer. + + // Setting this indicates that the memory backing this buffer should be + // shared with external APIs, If so, this must be exactly *one* of + // `pl_gpu.export_caps.buf`. + enum pl_handle_type export_handle; + + // Setting this indicates that the memory backing this buffer will be + // imported from an external API. If so, this must be exactly *one* of + // `pl_gpu.import_caps.buf`. + enum pl_handle_type import_handle; + + // If the shared memory is being imported, the import handle must be + // specified here. Otherwise, this is ignored. + struct pl_shared_mem shared_mem; + + // If non-NULL, the buffer will be created with these contents. Otherwise, + // the initial data is undefined. Using this does *not* require setting + // host_writable. + const void *initial_data; + + // Arbitrary user data. libplacebo does not use this at all. + void *user_data; + + // Arbitrary identifying tag. Used only for debugging purposes. + pl_debug_tag debug_tag; +}; + +#define pl_buf_params(...) (&(struct pl_buf_params) { \ + .debug_tag = PL_DEBUG_TAG, \ + __VA_ARGS__ \ + }) + +// A generic buffer, which can be used for multiple purposes (texture transfer, +// storage buffer, uniform buffer, etc.) +// +// Note on efficiency: A pl_buf does not necessarily represent a true "buffer" +// object on the underlying graphics API. It may also refer to a sub-slice of +// a larger buffer, depending on the implementation details of the GPU. The +// bottom line is that users do not need to worry about the efficiency of using +// many small pl_buf objects. Having many small pl_bufs, even lots of few-byte +// vertex buffers, is designed to be completely fine. 
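+//
+// Illustrative creation sketch (editorial addition, not part of the upstream
+// header): a small uniform buffer that is updated from the host, where
+// `data` / `data_size` stand in for application-provided contents:
+//
+//     pl_buf ubo = pl_buf_create(gpu, pl_buf_params(
+//         .size          = 4096, // must be <= pl_gpu_limits.max_ubo_size
+//         .uniform       = true,
+//         .host_writable = true,
+//     ));
+//     if (ubo)
+//         pl_buf_write(gpu, ubo, 0, data, data_size);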
+// +// Thread-safety: Unsafe +typedef const struct pl_buf_t { + struct pl_buf_params params; + uint8_t *data; // for persistently mapped buffers, points to the first byte + + // If `params.handle_type` is set, this structure references the shared + // memory backing this buffer, via the requested handle type. + // + // While this buffer is not in an "exported" state, the contents of the + // memory are undefined. (See: `pl_buf_export`) + struct pl_shared_mem shared_mem; +} *pl_buf; + +// Create a buffer. The type of buffer depends on the parameters. The buffer +// parameters must adhere to the restrictions imposed by the pl_gpu_limits. +// Returns NULL on failure. +// +// For buffers with shared memory, the buffer is considered to be in an +// "exported" state by default, and may be used directly by the external API +// after being created (until the first libplacebo operation on the buffer). +PL_API pl_buf pl_buf_create(pl_gpu gpu, const struct pl_buf_params *params); +PL_API void pl_buf_destroy(pl_gpu gpu, pl_buf *buf); + +// This behaves like `pl_buf_create`, but if the buffer already exists and has +// incompatible parameters, it will get destroyed first. A buffer is considered +// "compatible" if it has the same buffer type and texel format, a size greater +// than or equal to the requested size, and it has a superset of the features +// the user requested. After this operation, the contents of the buffer are +// undefined. +// +// Note: Due to its unpredictability, it's not allowed to use this with +// `params->initial_data` being set. Similarly, it's not allowed on a buffer +// with `params->export_handle`. since this may invalidate the corresponding +// external API's handle. Conversely, it *is* allowed on a buffer with +// `params->host_mapped`, and the corresponding `buf->data` pointer *may* +// change as a result of doing so. +// +// Note: If the `user_data` alone changes, this does not trigger a buffer +// recreation. In theory, this can be used to detect when the buffer ended +// up being recreated. +PL_API bool pl_buf_recreate(pl_gpu gpu, pl_buf *buf, const struct pl_buf_params *params); + +// Update the contents of a buffer, starting at a given offset (must be a +// multiple of 4) and up to a given size, with the contents of *data. +// +// This function will block until the buffer is no longer in use. Use +// `pl_buf_poll` to perform non-blocking queries of buffer availability. +// +// Note: This function can incur synchronization overhead, so it shouldn't be +// used in tight loops. If you do need to loop (e.g. to perform a strided +// write), consider using host-mapped buffers, or fixing the memory in RAM, +// before calling this function. +PL_API void pl_buf_write(pl_gpu gpu, pl_buf buf, size_t buf_offset, + const void *data, size_t size); + +// Read back the contents of a buffer, starting at a given offset, storing the +// data into *dest. Returns whether successful. +// +// This function will block until the buffer is no longer in use. Use +// `pl_buf_poll` to perform non-blocking queries of buffer availability. +PL_API bool pl_buf_read(pl_gpu gpu, pl_buf buf, size_t buf_offset, + void *dest, size_t size); + +// Copy `size` bytes from one buffer to another, reading from and writing to +// the respective offsets. +PL_API void pl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size); + +// Initiates a buffer export operation, allowing a buffer to be accessed by an +// external API. 
This is only valid for buffers with `params.handle_type`. +// Calling this twice in a row is a harmless no-op. Returns whether successful. +// +// There is no corresponding "buffer import" operation, the next libplacebo +// operation that touches the buffer (e.g. pl_tex_upload, but also pl_buf_write +// and pl_buf_read) will implicitly import the buffer back to libplacebo. Users +// must ensure that all pending operations made by the external API are fully +// completed before using it in libplacebo again. (Otherwise, the behaviour +// is undefined) +// +// Please note that this function returning does not mean the memory is +// immediately available as such. In general, it will mark a buffer as "in use" +// in the same way any other buffer operation would, and it is the user's +// responsibility to wait until `pl_buf_poll` returns false before accessing +// the memory from the external API. +// +// In terms of the access performed by this operation, it is not considered a +// "read" or "write" and therefore does not technically conflict with reads or +// writes to the buffer performed by the host (via mapped memory - any use of +// `pl_buf_read` or `pl_buf_write` would defeat the purpose of the export). +// However, restrictions made by the external API may apply that prevent this. +// +// The recommended use pattern is something like this: +// +// while (loop) { +// pl_buf buf = get_free_buffer(); // or block on pl_buf_poll +// // write to the buffer using the external API +// pl_tex_upload(gpu, /* ... buf ... */); // implicitly imports +// pl_buf_export(gpu, buf); +// } +// +// i.e. perform an external API operation, then use and immediately export the +// buffer in libplacebo, and finally wait until `pl_buf_poll` is false before +// re-using it in the external API. (Or get a new buffer in the meantime) +PL_API bool pl_buf_export(pl_gpu gpu, pl_buf buf); + +// Returns whether or not a buffer is currently "in use". This can either be +// because of a pending read operation, a pending write operation or a pending +// buffer export operation. Any access to the buffer by external APIs or via +// the host pointer (for host-mapped buffers) is forbidden while a buffer is +// "in use". The only exception to this rule is multiple reads, for example +// reading from a buffer with `pl_tex_upload` while simultaneously reading from +// it using mapped memory. +// +// The `timeout`, specified in nanoseconds, indicates how long to block for +// before returning. If set to 0, this function will never block, and only +// returns the current status of the buffer. The actual precision of the +// timeout may be significantly longer than one nanosecond, and has no upper +// bound. This function does not provide hard latency guarantees. This function +// may also return at any time, even if the buffer is still in use. If the user +// wishes to block until the buffer is definitely no longer in use, the +// recommended usage is: +// +// while (pl_buf_poll(gpu, buf, UINT64_MAX)) +// ; // do nothing +// +// Note: libplacebo operations on buffers are always internally synchronized, +// so this is only needed for host-mapped or externally exported buffers. +// However, it may be used to do non-blocking queries before calling blocking +// functions such as `pl_buf_read`. +// +// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly +// synchronized, meaning it can safely be called on a `pl_buf` that is in use +// by another thread. 
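+//
+// Illustrative sketch (editorial addition, not part of the upstream header):
+// a non-blocking availability check before a blocking read-back, assuming
+// `buf` was created with `host_readable` set and `dest` / `size` are
+// provided by the caller:
+//
+//     if (!pl_buf_poll(gpu, buf, 0)) // false: buffer is not in use
+//         ok = pl_buf_read(gpu, buf, 0, dest, size);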
+PL_API bool pl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout); + +enum pl_tex_sample_mode { + PL_TEX_SAMPLE_NEAREST, // nearest neighbour sampling + PL_TEX_SAMPLE_LINEAR, // linear filtering, requires PL_FMT_CAP_LINEAR + PL_TEX_SAMPLE_MODE_COUNT, +}; + +enum pl_tex_address_mode { + PL_TEX_ADDRESS_CLAMP, // clamp the nearest edge texel + PL_TEX_ADDRESS_REPEAT, // repeat (tile) the texture + PL_TEX_ADDRESS_MIRROR, // repeat (mirror) the texture + PL_TEX_ADDRESS_MODE_COUNT, +}; + +// Structure describing a texture. +struct pl_tex_params { + int w, h, d; // physical dimension; unused dimensions must be 0 + pl_fmt format; + + // The following bools describe what operations can be performed. The + // corresponding pl_fmt capability must be set for every enabled + // operation type. + // + // Note: For planar formats, it is also possible to set capabilities only + // supported by sub-planes. In this case, the corresponding functionality + // will be available for the sub-plane, but not the planar texture itself. + bool sampleable; // usable as a PL_DESC_SAMPLED_TEX + bool renderable; // usable as a render target (pl_pass_run) + // (must only be used with 2D textures) + bool storable; // usable as a storage image (PL_DESC_IMG_*) + bool blit_src; // usable as a blit source + bool blit_dst; // usable as a blit destination + bool host_writable; // may be updated with pl_tex_upload() + bool host_readable; // may be fetched with pl_tex_download() + + // Note: For `blit_src`, `blit_dst`, the texture must either be + // 2-dimensional or `pl_gpu_limits.blittable_1d_3d` must be set. + + // At most one of `export_handle` and `import_handle` can be set for a + // texture. + + // Setting this indicates that the memory backing this texture should be + // shared with external APIs, If so, this must be exactly *one* of + // `pl_gpu.export_caps.tex`. + enum pl_handle_type export_handle; + + // Setting this indicates that the memory backing this texture will be + // imported from an external API. If so, this must be exactly *one* of + // `pl_gpu.import_caps.tex`. Mutually exclusive with `initial_data`. + enum pl_handle_type import_handle; + + // If the shared memory is being imported, the import handle must be + // specified here. Otherwise, this is ignored. + struct pl_shared_mem shared_mem; + + // If non-NULL, the texture will be created with these contents (tightly + // packed). Using this does *not* require setting host_writable. Otherwise, + // the initial data is undefined. Mutually exclusive with `import_handle`. + const void *initial_data; + + // Arbitrary user data. libplacebo does not use this at all. + void *user_data; + + // Arbitrary identifying tag. Used only for debugging purposes. + pl_debug_tag debug_tag; +}; + +#define pl_tex_params(...) (&(struct pl_tex_params) { \ + .debug_tag = PL_DEBUG_TAG, \ + __VA_ARGS__ \ + }) + +static inline int pl_tex_params_dimension(const struct pl_tex_params params) +{ + return params.d ? 3 : params.h ? 2 : 1; +} + +enum pl_sampler_type { + PL_SAMPLER_NORMAL, // gsampler2D, gsampler3D etc. + PL_SAMPLER_RECT, // gsampler2DRect + PL_SAMPLER_EXTERNAL, // gsamplerExternalOES + PL_SAMPLER_TYPE_COUNT, +}; + +// Conflates the following typical GPU API concepts: +// - texture itself +// - sampler state +// - staging buffers for texture upload +// - framebuffer objects +// - wrappers for swapchain framebuffers +// - synchronization needed for upload/rendering/etc. 
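+//
+// Illustrative creation sketch (editorial addition, not part of the upstream
+// header): a 2D texture that can be sampled from and uploaded to, where
+// `fmt` stands in for a format obtained from e.g. `pl_find_fmt`:
+//
+//     pl_tex tex = pl_tex_create(gpu, pl_tex_params(
+//         .w             = 1920,
+//         .h             = 1080,
+//         .format        = fmt,
+//         .sampleable    = true,
+//         .host_writable = true,
+//     ));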
+// +// Essentially a pl_tex can be anything ranging from a normal texture, a wrapped +// external/real framebuffer, a framebuffer object + texture pair, a mapped +// texture (via pl_hwdec), or other sorts of things that can be sampled from +// and/or rendered to. +// +// Thread-safety: Unsafe +typedef const struct pl_tex_t *pl_tex; +struct pl_tex_t { + struct pl_tex_params params; + + // If `params.format` is a planar format, this contains `pl_tex` handles + // encapsulating individual texture planes. Conversely, if this is a + // sub-plane of a planar texture, `parent` points to the planar texture. + // + // Note: Calling `pl_tex_destroy` on sub-planes is undefined behavior. + pl_tex planes[4]; + pl_tex parent; + + // If `params.export_handle` is set, this structure references the shared + // memory backing this buffer, via the requested handle type. + // + // While this texture is not in an "exported" state, the contents of the + // memory are undefined. (See: `pl_tex_export`) + // + // Note: Due to vulkan driver limitations, `shared_mem.drm_format_mod` will + // currently always be set to DRM_FORMAT_MOD_INVALID. No guarantee can be + // made about the cross-driver compatibility of textures exported this way. + struct pl_shared_mem shared_mem; + + // If `params.sampleable` is true, this indicates the correct sampler type + // to use when sampling from this texture. + enum pl_sampler_type sampler_type; +}; + +// Create a texture (with undefined contents). Returns NULL on failure. This is +// assumed to be an expensive/rare operation, and may need to perform memory +// allocation or framebuffer creation. +PL_API pl_tex pl_tex_create(pl_gpu gpu, const struct pl_tex_params *params); +PL_API void pl_tex_destroy(pl_gpu gpu, pl_tex *tex); + +// This works like `pl_tex_create`, but if the texture already exists and has +// incompatible texture parameters, it will get destroyed first. A texture is +// considered "compatible" if it has the same texture format and sample/address +// mode and it supports a superset of the features the user requested. +// +// Even if the texture is not recreated, calling this function will still +// invalidate the contents of the texture. (Note: Because of this, +// `initial_data` may not be used with `pl_tex_recreate`. Doing so is an error) +// +// Note: If the `user_data` alone changes, this does not trigger a texture +// recreation. In theory, this can be used to detect when the texture ended +// up being recreated. +PL_API bool pl_tex_recreate(pl_gpu gpu, pl_tex *tex, const struct pl_tex_params *params); + +// Invalidates the contents of a texture. After this, the contents are fully +// undefined. +PL_API void pl_tex_invalidate(pl_gpu gpu, pl_tex tex); + +union pl_clear_color { + float f[4]; + int32_t i[4]; + uint32_t u[4]; +}; + +// Clear the dst texture with the given color (rgba). This is functionally +// identical to a blit operation, which means `dst->params.blit_dst` must be +// set. +PL_API void pl_tex_clear_ex(pl_gpu gpu, pl_tex dst, const union pl_clear_color color); + +// Wrapper for `pl_tex_clear_ex` which only works for floating point textures. +PL_API void pl_tex_clear(pl_gpu gpu, pl_tex dst, const float color[4]); + +struct pl_tex_blit_params { + // The texture to blit from. Must have `params.blit_src` enabled. + pl_tex src; + + // The texture to blit to. Must have `params.blit_dst` enabled, and a + // format that is loosely compatible with `src`. This essentially means + // that they must have the same `internal_size`. 
Additionally, UINT + // textures can only be blitted to other UINT textures, and SINT textures + // can only be blitted to other SINT textures. + pl_tex dst; + + // The region of the source texture to blit. Must be within the texture + // bounds of `src`. May be flipped. (Optional) + pl_rect3d src_rc; + + // The region of the destination texture to blit into. Must be within the + // texture bounds of `dst`. May be flipped. Areas outside of `dst_rc` in + // `dst` are preserved. (Optional) + pl_rect3d dst_rc; + + // If `src_rc` and `dst_rc` have different sizes, the texture will be + // scaled using the given texture sampling mode. + enum pl_tex_sample_mode sample_mode; +}; + +#define pl_tex_blit_params(...) (&(struct pl_tex_blit_params) { __VA_ARGS__ }) + +// Copy a sub-rectangle from one texture to another. +PL_API void pl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params); + +// Structure describing a texture transfer operation. +struct pl_tex_transfer_params { + // Texture to transfer to/from. Depending on the type of the operation, + // this must have params.host_writable (uploads) or params.host_readable + // (downloads) set, respectively. + pl_tex tex; + + // Note: Superfluous parameters are ignored, i.e. for a 1D texture, the y + // and z fields of `rc`, as well as the corresponding pitches, are ignored. + // In all other cases, the pitch must be large enough to contain the + // corresponding dimension of `rc`, and the `rc` must be normalized and + // fully contained within the image dimensions. Missing fields in the `rc` + // are inferred from the image size. If unset, the pitch is inferred + // from `rc` (that is, it's assumed that the data is tightly packed in the + // buffer). Otherwise, `row_pitch` *must* be a multiple of + // `tex->params.format->texel_align`, and `depth_pitch` must be a multiple + // of `row_pitch`. + pl_rect3d rc; // region of the texture to transfer + size_t row_pitch; // the number of bytes separating image rows + size_t depth_pitch; // the number of bytes separating image planes + + // An optional timer to report the approximate duration of the texture + // transfer to. Note that this is only an approximation, since the actual + // texture transfer may happen entirely in the background (in particular, + // for implementations with asynchronous transfer capabilities). It's also + // not guaranteed that all GPUs support this. + pl_timer timer; + + // An optional callback to fire after the operation completes. If this is + // specified, then the operation is performed asynchronously. Note that + // transfers to/from buffers are always asynchronous, even without, this + // field, so it's more useful for `ptr` transfers. (Though it can still be + // helpful to avoid having to manually poll buffers all the time) + // + // When this is *not* specified, uploads from `ptr` are still asynchronous + // but require a host memcpy, while downloads from `ptr` are blocking. As + // such, it's recommended to always try using asynchronous texture + // transfers wherever possible. + // + // Note: Requires `pl_gpu_limits.callbacks` + // + // Note: Callbacks are implicitly synchronized, meaning that callbacks are + // guaranteed to never execute concurrently with other callbacks. However, + // they may execute from any thread that the `pl_gpu` is used on. + void (*callback)(void *priv); + void *priv; // arbitrary user data + + // For the data source/target of a transfer operation, there are two valid + // options: + // + // 1. 
Transferring to/from a buffer: (requires `pl_gpu_limits.buf_transfer`) + pl_buf buf; // buffer to use + size_t buf_offset; // offset of data within buffer, should be a + // multiple of `tex->params.format->texel_size` + // 2. Transferring to/from host memory directly: + void *ptr; // address of data + bool no_import; // always use memcpy, bypassing host ptr import + + // Note: The contents of the memory region / buffer must exactly match the + // texture format; i.e. there is no explicit conversion between formats. +}; + +#define pl_tex_transfer_params(...) (&(struct pl_tex_transfer_params) { __VA_ARGS__ }) + +// Upload data to a texture. Returns whether successful. +PL_API bool pl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// Download data from a texture. Returns whether successful. +PL_API bool pl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params); + +// Returns whether or not a texture is currently "in use". This can either be +// because of a pending read operation, a pending write operation or a pending +// texture export operation. Note that this function's usefulness is extremely +// limited under ordinary circumstances. In practically all cases, textures do +// not need to be directly synchronized by the user, except when interfacing +// with external libraries. This function should NOT, however, be used as a +// crutch to avoid having to implement semaphore-based synchronization. Use +// the API-specific functions such as `pl_vulkan_hold/release` for that. +// +// A good example of a use case in which this function is required is when +// interoperating with external memory management that needs to know when an +// imported texture is safe to free / reclaim internally, in which case +// semaphores are insufficient because memory management is a host operation. +// +// The `timeout`, specified in nanoseconds, indicates how long to block for +// before returning. If set to 0, this function will never block, and only +// returns the current status of the texture. The actual precision of the +// timeout may be significantly longer than one nanosecond, and has no upper +// bound. This function does not provide hard latency guarantees. This function +// may also return at any time, even if the texture is still in use. If the +// user wishes to block until the texture is definitely no longer in use, the +// recommended usage is: +// +// while (pl_tex_poll(gpu, buf, UINT64_MAX)) +// ; // do nothing +// +// Note: If `pl_gpu_limits.thread_safe` is set, this function is implicitly +// synchronized, meaning it can safely be called on a `pl_tex` that is in use +// by another thread. +PL_API bool pl_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout); + +// Data type of a shader input variable (e.g. uniform, or UBO member) +enum pl_var_type { + PL_VAR_INVALID = 0, + PL_VAR_SINT, // C: int GLSL: int/ivec + PL_VAR_UINT, // C: unsigned int GLSL: uint/uvec + PL_VAR_FLOAT, // C: float GLSL: float/vec/mat + PL_VAR_TYPE_COUNT +}; + +// Returns the host size (in bytes) of a pl_var_type. +PL_API size_t pl_var_type_size(enum pl_var_type type); + +// Represents a shader input variable (concrete data, e.g. vector, matrix) +struct pl_var { + const char *name; // name as used in the shader + enum pl_var_type type; + // The total number of values is given by dim_v * dim_m. For example, a + // vec2 would have dim_v = 2 and dim_m = 1. A mat3x4 would have dim_v = 4 + // and dim_m = 3. 
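+ //
+ // For example (editorial illustration, not part of the upstream header):
+ // a GLSL `vec3` corresponds to dim_v = 3, dim_m = 1, dim_a = 1, while a
+ // `mat3x4 m[2]` corresponds to dim_v = 4, dim_m = 3, dim_a = 2.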
+ int dim_v; // vector dimension + int dim_m; // matrix dimension (number of columns, see below) + int dim_a; // array dimension +}; + +// Helper functions for constructing the most common pl_vars, with names +// corresponding to their corresponding GLSL built-in types. +PL_API struct pl_var pl_var_float(const char *name); +PL_API struct pl_var pl_var_vec2(const char *name); +PL_API struct pl_var pl_var_vec3(const char *name); +PL_API struct pl_var pl_var_vec4(const char *name); +PL_API struct pl_var pl_var_mat2(const char *name); +PL_API struct pl_var pl_var_mat2x3(const char *name); +PL_API struct pl_var pl_var_mat2x4(const char *name); +PL_API struct pl_var pl_var_mat3(const char *name); +PL_API struct pl_var pl_var_mat3x4(const char *name); +PL_API struct pl_var pl_var_mat4x2(const char *name); +PL_API struct pl_var pl_var_mat4x3(const char *name); +PL_API struct pl_var pl_var_mat4(const char *name); +PL_API struct pl_var pl_var_int(const char *name); +PL_API struct pl_var pl_var_ivec2(const char *name); +PL_API struct pl_var pl_var_ivec3(const char *name); +PL_API struct pl_var pl_var_ivec4(const char *name); +PL_API struct pl_var pl_var_uint(const char *name); +PL_API struct pl_var pl_var_uvec2(const char *name); +PL_API struct pl_var pl_var_uvec3(const char *name); +PL_API struct pl_var pl_var_uvec4(const char *name); + +struct pl_named_var { + const char *glsl_name; + struct pl_var var; +}; + +// The same list as above, tagged by name and terminated with a {0} entry. +PL_API extern const struct pl_named_var pl_var_glsl_types[]; + +// Efficient helper function for performing a lookup in the above array. +// Returns NULL if the variable is not legal. Note that the array dimension is +// ignored, since it's usually part of the variable name and not the type name. +PL_API const char *pl_var_glsl_type_name(struct pl_var var); + +// Converts a pl_fmt to an "equivalent" pl_var. Equivalent in this sense means +// that the pl_var's type will be the same as the vertex's sampled type (e.g. +// PL_FMT_UNORM gets turned into PL_VAR_FLOAT). +PL_API struct pl_var pl_var_from_fmt(pl_fmt fmt, const char *name); + +// Describes the memory layout of a variable, relative to some starting location +// (typically the offset within a uniform/storage/pushconstant buffer) +// +// Note on matrices: All GPUs expect column major matrices, for both buffers and +// input variables. Care needs to be taken to avoid trying to use e.g. a +// pl_matrix3x3 (which is row major) directly as a pl_var_update.data! +// +// In terms of the host layout, a column-major matrix (e.g. matCxR) with C +// columns and R rows is treated like an array vecR[C]. The `stride` here refers +// to the separation between these array elements, i.e. the separation between +// the individual columns. +// +// Visualization of a mat4x3: +// +// 0 1 2 3 <- columns +// 0 [ (A) (D) (G) (J) ] +// 1 [ (B) (E) (H) (K) ] +// 2 [ (C) (F) (I) (L) ] +// ^ rows +// +// Layout in GPU memory: (stride=16, size=60) +// +// [ A B C ] X <- column 0, offset +0 +// [ D E F ] X <- column 1, offset +16 +// [ G H I ] X <- column 2, offset +32 +// [ J K L ] <- column 3, offset +48 +// +// Note the lack of padding on the last column in this example. +// In general: size <= stride * dim_m +// +// C representation: (stride=12, size=48) +// +// { { A, B, C }, +// { D, E, F }, +// { G, H, I }, +// { J, K, L } } +// +// Note on arrays: `stride` represents both the stride between elements of a +// matrix, and the stride between elements of an array. 
That is, there is no +// distinction between the columns of a matrix and the rows of an array. For +// example, a mat2[10] and a vec2[20] share the same pl_var_layout - the stride +// would be sizeof(vec2) and the size would be sizeof(vec2) * 2 * 10. +// +// For non-array/matrix types, `stride` is equal to `size`. + +struct pl_var_layout { + size_t offset; // the starting offset of the first byte + size_t stride; // the delta between two elements of an array/matrix + size_t size; // the total size of the input +}; + +// Returns the host layout of an input variable as required for a +// tightly-packed, byte-aligned C data type, given a starting offset. +PL_API struct pl_var_layout pl_var_host_layout(size_t offset, const struct pl_var *var); + +// Returns the GLSL std140 layout of an input variable given a current buffer +// offset, as required for a buffer descriptor of type PL_DESC_BUF_UNIFORM +// +// The normal way to use this function is when calculating the size and offset +// requirements of a uniform buffer in an incremental fashion, to calculate the +// new offset of the next variable in this buffer. +PL_API struct pl_var_layout pl_std140_layout(size_t offset, const struct pl_var *var); + +// Returns the GLSL std430 layout of an input variable given a current buffer +// offset, as required for a buffer descriptor of type PL_DESC_BUF_STORAGE, and +// for push constants. +PL_API struct pl_var_layout pl_std430_layout(size_t offset, const struct pl_var *var); + +// Convenience definitions / friendly names for these +#define pl_buf_uniform_layout pl_std140_layout +#define pl_buf_storage_layout pl_std430_layout +#define pl_push_constant_layout pl_std430_layout + +// Like memcpy, but copies bytes from `src` to `dst` in a manner governed by +// the stride and size of `dst_layout` as well as `src_layout`. Also takes +// into account the respective `offset`. +PL_API void memcpy_layout(void *dst, struct pl_var_layout dst_layout, + const void *src, struct pl_var_layout src_layout); + +// Represents a compile-time constant. +struct pl_constant { + enum pl_var_type type; // constant data type + uint32_t id; // GLSL `constant_id` + size_t offset; // byte offset in `constant_data` +}; + +// Represents a vertex attribute. +struct pl_vertex_attrib { + const char *name; // name as used in the shader + pl_fmt fmt; // data format (must have PL_FMT_CAP_VERTEX) + size_t offset; // byte offset into the vertex struct + int location; // vertex location (as used in the shader) +}; + +// Returns an abstract namespace index for a given descriptor type. This will +// always be a value >= 0 and < PL_DESC_TYPE_COUNT. Implementations can use +// this to figure out which descriptors may share the same value of `binding`. +// Bindings must only be unique for all descriptors within the same namespace. +PL_API int pl_desc_namespace(pl_gpu gpu, enum pl_desc_type type); + +// Access mode of a shader input descriptor. +enum pl_desc_access { + PL_DESC_ACCESS_READWRITE, + PL_DESC_ACCESS_READONLY, + PL_DESC_ACCESS_WRITEONLY, + PL_DESC_ACCESS_COUNT, +}; + +// Returns the GLSL syntax for a given access mode (e.g. "readonly"). +PL_API const char *pl_desc_access_glsl_name(enum pl_desc_access mode); + +// Represents a shader descriptor (e.g. texture or buffer binding) +struct pl_desc { + const char *name; // name as used in the shader + enum pl_desc_type type; + + // The binding of this descriptor, as used in the shader. All bindings + // within a namespace must be unique. 
(see: pl_desc_namespace) + int binding; + + // For storage images and storage buffers, this can be used to restrict + // the type of access that may be performed on the descriptor. Ignored for + // the other descriptor types (uniform buffers and sampled textures are + // always read-only). + enum pl_desc_access access; +}; + +// Framebuffer blending mode (for raster passes) +enum pl_blend_mode { + PL_BLEND_ZERO, + PL_BLEND_ONE, + PL_BLEND_SRC_ALPHA, + PL_BLEND_ONE_MINUS_SRC_ALPHA, + PL_BLEND_MODE_COUNT, +}; + +struct pl_blend_params { + enum pl_blend_mode src_rgb; + enum pl_blend_mode dst_rgb; + enum pl_blend_mode src_alpha; + enum pl_blend_mode dst_alpha; +}; + +#define pl_blend_params(...) (&(struct pl_blend_params) { __VA_ARGS__ }) + +// Typical alpha compositing +PL_API extern const struct pl_blend_params pl_alpha_overlay; + +enum pl_prim_type { + PL_PRIM_TRIANGLE_LIST, + PL_PRIM_TRIANGLE_STRIP, + PL_PRIM_TYPE_COUNT, +}; + +enum pl_index_format { + PL_INDEX_UINT16 = 0, + PL_INDEX_UINT32, + PL_INDEX_FORMAT_COUNT, +}; + +enum pl_pass_type { + PL_PASS_INVALID = 0, + PL_PASS_RASTER, // vertex+fragment shader + PL_PASS_COMPUTE, // compute shader (requires `pl_gpu.glsl.compute`) + PL_PASS_TYPE_COUNT, +}; + +// Description of a rendering pass. It conflates the following: +// - GLSL shader(s) and its list of inputs +// - target parameters (for raster passes) +struct pl_pass_params { + enum pl_pass_type type; + + // Input variables. + struct pl_var *variables; + int num_variables; + + // Input descriptors. + struct pl_desc *descriptors; + int num_descriptors; + + // Compile-time specialization constants. + struct pl_constant *constants; + int num_constants; + + // Initial data for the specialization constants. Optional. If NULL, + // specialization constants receive the values from the shader text. + void *constant_data; + + // Push constant region. Must be be a multiple of 4 <= limits.max_pushc_size + size_t push_constants_size; + + // The shader text in GLSL. For PL_PASS_RASTER, this is interpreted + // as a fragment shader. For PL_PASS_COMPUTE, this is interpreted as + // a compute shader. + const char *glsl_shader; + + // --- type==PL_PASS_RASTER only + + // Describes the interpretation and layout of the vertex data. + enum pl_prim_type vertex_type; + struct pl_vertex_attrib *vertex_attribs; + int num_vertex_attribs; + size_t vertex_stride; // must be a multiple of limits.align_vertex_stride + + // The vertex shader itself. + const char *vertex_shader; + + // Target format. The format must support PL_FMT_CAP_RENDERABLE. The + // resulting pass may only be used on textures that have a format with a + // `pl_fmt.signature` compatible to this format. + pl_fmt target_format; + + // Target blending mode. If this is NULL, blending is disabled. Otherwise, + // the `target_format` must also support PL_FMT_CAP_BLENDABLE. + const struct pl_blend_params *blend_params; + + // If false, the target's existing contents will be discarded before the + // pass is run. (Semantically equivalent to calling pl_tex_invalidate + // before every pl_pass_run, but slightly more efficient) + // + // Specifying `blend_params` requires `load_target` to be true. + bool load_target; + + // --- Deprecated / removed fields. + PL_DEPRECATED const uint8_t *cached_program; // Non-functional + PL_DEPRECATED size_t cached_program_len; +}; + +#define pl_pass_params(...) 
(&(struct pl_pass_params) { __VA_ARGS__ }) + +// Conflates the following typical GPU API concepts: +// - various kinds of shaders +// - rendering pipelines +// - descriptor sets, uniforms, other bindings +// - all synchronization necessary +// - the current values of all inputs +// +// Thread-safety: Unsafe +typedef const struct pl_pass_t { + struct pl_pass_params params; +} *pl_pass; + +// Compile a shader and create a render pass. This is a rare/expensive +// operation and may take a significant amount of time, even if a cached +// program is used. Returns NULL on failure. +PL_API pl_pass pl_pass_create(pl_gpu gpu, const struct pl_pass_params *params); +PL_API void pl_pass_destroy(pl_gpu gpu, pl_pass *pass); + +struct pl_desc_binding { + const void *object; // pl_* object with type corresponding to pl_desc_type + + // For PL_DESC_SAMPLED_TEX, this can be used to configure the sampler. + enum pl_tex_address_mode address_mode; + enum pl_tex_sample_mode sample_mode; +}; + +struct pl_var_update { + int index; // index into params.variables[] + const void *data; // pointer to raw byte data corresponding to pl_var_host_layout() +}; + +struct pl_pass_run_params { + pl_pass pass; + + // If present, the shader will be re-specialized with the new constants + // provided. This is a significantly cheaper operation than recompiling a + // brand new shader, but should still be avoided if possible. + // + // Leaving it as NULL re-uses the existing specialization values. Ignored + // if the shader has no specialization constants. Guaranteed to be a no-op + // if the values have not changed since the last invocation. + void *constant_data; + + // This list only contains descriptors/variables which have changed + // since the previous invocation. All non-mentioned variables implicitly + // preserve their state from the last invocation. + struct pl_var_update *var_updates; + int num_var_updates; + + // This list contains all descriptors used by this pass. It must + // always be filled, even if the descriptors haven't changed. The order + // must match that of pass->params.descriptors + struct pl_desc_binding *desc_bindings; + + // The push constants for this invocation. This must always be set and + // fully defined for every invocation if params.push_constants_size > 0. + void *push_constants; + + // An optional timer to report the approximate runtime of this shader pass + // invocation to. Note that this is only an approximation, since shaders + // may overlap their execution times and contend for GPU time. + pl_timer timer; + + // --- pass->params.type==PL_PASS_RASTER only + + // Target must be a 2D texture, `target->params.renderable` must be true, + // and `target->params.format->signature` must match the signature provided + // in `pass->params.target_format`. + // + // If the viewport or scissors are left blank, they are inferred from + // target->params. + // + // WARNING: Rendering to a *target that is being read from by the same + // shader is undefined behavior. In general, trying to bind the same + // resource multiple times to the same shader is undefined behavior. + pl_tex target; + pl_rect2d viewport; // screen space viewport (must be normalized) + pl_rect2d scissors; // target render scissors (must be normalized) + + // Number of vertices to render + int vertex_count; + + // Vertex data may be provided in one of two forms: + // + // 1. Drawing from host memory directly + const void *vertex_data; + // 2. 
Drawing from a vertex buffer (requires `vertex_buf->params.drawable`)
+ pl_buf vertex_buf;
+ size_t buf_offset;
+
+ // (Optional) Index data may be provided in the form given by `index_fmt`.
+ // These will be used for indexed rendering. Similar to vertex data, this
+ // can be provided in two forms:
+ // 1. From host memory
+ const void *index_data;
+ enum pl_index_format index_fmt;
+ // 2. From an index buffer (requires `index_buf->params.drawable`)
+ pl_buf index_buf;
+ size_t index_offset;
+ // Note: Drawing from an index buffer requires vertex data to also be
+ // present in buffer form, i.e. it's forbidden to mix `index_buf` with
+ // `vertex_data` (though vice versa is allowed).
+
+ // --- pass->params.type==PL_PASS_COMPUTE only
+
+ // Number of work groups to dispatch per dimension (X/Y/Z). Must be <= the
+ // corresponding index of limits.max_dispatch
+ int compute_groups[3];
+};
+
+#define pl_pass_run_params(...) (&(struct pl_pass_run_params) { __VA_ARGS__ })
+
+// Execute a render pass.
+PL_API void pl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params);
+
+// This is semantically a no-op, but it provides a hint that you want to flush
+// any partially queued up commands and begin execution. There is normally no
+// need to call this, because queued commands will always be implicitly flushed
+// whenever necessary to make forward progress on commands like `pl_buf_poll`,
+// or when submitting a frame to a swapchain for display. In fact, calling this
+// function can negatively impact performance, because some GPUs rely on being
+// able to re-order and modify queued commands in order to enable optimizations
+// retroactively.
+//
+// The only time this might be beneficial to call explicitly is if you're doing
+// lots of offline processing, i.e. you aren't rendering to a swapchain but to
+// textures that you download from again. In that case you should call this
+// function after each "work item" to ensure good parallelism between them.
+//
+// It's worth noting that this function may block if you're over-feeding the
+// GPU without waiting for existing results to finish.
+PL_API void pl_gpu_flush(pl_gpu gpu);
+
+// This is like `pl_gpu_flush` but also blocks until the GPU is fully idle
+// before returning. Using this in your rendering loop is strongly discouraged,
+// and almost never the right solution. The intended use case is for deinit
+// logic, where users may want to force all pending GPU operations to finish
+// so they can clean up their state more easily.
+//
+// After this operation is called, it's guaranteed that all pending buffer
+// operations are complete - i.e. `pl_buf_poll` is guaranteed to return false.
+// It's also guaranteed that any outstanding timer query results are available.
+//
+// Note: If you only care about buffer operations, you can accomplish this more
+// easily by using `pl_buf_poll` with the timeout set to `UINT64_MAX`. But if
+// you have many buffers it may be more convenient to call this function
+// instead. The difference is that this function will also affect e.g. renders
+// to a `pl_swapchain`.
+PL_API void pl_gpu_finish(pl_gpu gpu);
+
+// Returns true if the GPU is considered to be in a "failed" state, which
+// during normal operation is typically the result of things like the device
+// being lost (due to e.g. power management).
+//
+// If this returns true, users *should* destroy and recreate the `pl_gpu`,
+// including all associated resources, via the appropriate mechanism.
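+//
+// Illustrative recovery sketch (editorial addition, not part of the upstream
+// header), where `destroy_gpu_resources()` and `recreate_gpu()` stand in for
+// whatever API-specific teardown and re-initialization the application uses:
+//
+//     if (pl_gpu_is_failed(gpu)) {
+//         destroy_gpu_resources(); // pl_tex_destroy, pl_buf_destroy, ...
+//         recreate_gpu();          // e.g. re-create the underlying device
+//     }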
+PL_API bool pl_gpu_is_failed(pl_gpu gpu); + + +// Deprecated objects and functions: + +// A generic synchronization object intended for use with an external API. This +// is not required when solely using libplacebo API functions, as all required +// synchronisation is done internally. This comes in the form of a pair of +// semaphores - one to synchronize access in each direction. +// +// Thread-safety: Unsafe +typedef const struct pl_sync_t { + enum pl_handle_type handle_type; + + // This handle is signalled by the `pl_gpu`, and waited on by the user. It + // fires when it is safe for the user to access the shared resource. + union pl_handle wait_handle; + + // This handle is signalled by the user, and waited on by the `pl_gpu`. It + // must fire when the user has finished accessing the shared resource. + union pl_handle signal_handle; +} *pl_sync; + +// Create a synchronization object. Returns NULL on failure. +// +// `handle_type` must be exactly *one* of `pl_gpu.export_caps.sync`, and +// indicates which type of handle to generate for sharing this sync object. +// +// Deprecated in favor of API-specific semaphore creation operations such as +// `pl_vulkan_sem_create`. +PL_DEPRECATED PL_API pl_sync pl_sync_create(pl_gpu gpu, enum pl_handle_type handle_type); + +// Destroy a `pl_sync`. Note that this invalidates the externally imported +// semaphores. Users should therefore make sure that all operations that +// wait on or signal any of the semaphore have been fully submitted and +// processed by the external API before destroying the `pl_sync`. +// +// Despite this, it's safe to destroy a `pl_sync` if the only pending +// operations that involve it are internal to libplacebo. +PL_DEPRECATED PL_API void pl_sync_destroy(pl_gpu gpu, pl_sync *sync); + +// Initiates a texture export operation, allowing a texture to be accessed by +// an external API. Returns whether successful. After this operation +// successfully returns, it is guaranteed that `sync->wait_handle` will +// eventually be signalled. For APIs where this is relevant, the image layout +// should be specified as "general", e.g. `GL_LAYOUT_GENERAL_EXT` for OpenGL. +// +// There is no corresponding "import" operation - the next operation that uses +// a texture will implicitly import the texture. Valid API usage requires that +// the user *must* submit a semaphore signal operation on `sync->signal_handle` +// before doing so. Not doing so is undefined behavior and may very well +// deadlock the calling process and/or the graphics card! +// +// Note that despite this restriction, it is always valid to call +// `pl_tex_destroy`, even if the texture is in an exported state, without +// having to signal the corresponding sync object first. +// +// Deprecated in favor of API-specific synchronization mechanisms such as +// `pl_vulkan_hold/release_ex`. +PL_DEPRECATED PL_API bool pl_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync); + + +PL_API_END + +#endif // LIBPLACEBO_GPU_H_ diff --git a/src/include/libplacebo/log.h b/src/include/libplacebo/log.h new file mode 100644 index 0000000..b24c931 --- /dev/null +++ b/src/include/libplacebo/log.h @@ -0,0 +1,113 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_LOG_H_ +#define LIBPLACEBO_LOG_H_ + +#include <libplacebo/config.h> +#include <libplacebo/common.h> + +PL_API_BEGIN + +// The log level associated with a given log message. +enum pl_log_level { + PL_LOG_NONE = 0, + PL_LOG_FATAL, // results in total loss of function of a major component + PL_LOG_ERR, // serious error; may result in degraded function + PL_LOG_WARN, // warning; potentially bad, probably user-relevant + PL_LOG_INFO, // informational message, also potentially harmless errors + PL_LOG_DEBUG, // verbose debug message, informational + PL_LOG_TRACE, // very noisy trace of activity,, usually benign + PL_LOG_ALL = PL_LOG_TRACE, +}; + +struct pl_log_params { + // Logging callback. All messages, informational or otherwise, will get + // redirected to this callback. The logged messages do not include trailing + // newlines. Optional. + void (*log_cb)(void *log_priv, enum pl_log_level level, const char *msg); + void *log_priv; + + // The current log level. Controls the level of message that will be + // redirected to the log callback. Setting this to PL_LOG_ALL means all + // messages will be forwarded, but doing so indiscriminately can result + // in increased CPU usage as it may enable extra debug paths based on the + // configured log level. + enum pl_log_level log_level; +}; + +#define pl_log_params(...) (&(struct pl_log_params) { __VA_ARGS__ }) +PL_API extern const struct pl_log_params pl_log_default_params; + +// Thread-safety: Safe +// +// Note: In any context in which `pl_log` is used, users may also pass NULL +// to disable logging. In other words, NULL is a valid `pl_log`. +typedef const struct pl_log_t { + struct pl_log_params params; +} *pl_log; + +#define pl_log_glue1(x, y) x##y +#define pl_log_glue2(x, y) pl_log_glue1(x, y) +// Force a link error in the case of linking against an incompatible API +// version. +#define pl_log_create pl_log_glue2(pl_log_create_, PL_API_VER) +// Creates a pl_log. `api_ver` is for historical reasons and ignored currently. +// `params` defaults to `&pl_log_default_params` if left as NULL. +// +// Note: As a general rule, any `params` struct used as an argument to a +// function need only live until the corresponding function returns. +PL_API pl_log pl_log_create(int api_ver, const struct pl_log_params *params); + +// Destroy a `pl_log` object. +// +// Note: As a general rule, all `_destroy` functions take the pointer to the +// object to free as their parameter. This pointer is overwritten by NULL +// afterwards. Calling a _destroy function on &{NULL} is valid, but calling it +// on NULL itself is invalid. +PL_API void pl_log_destroy(pl_log *log); + +// Update the parameters of a `pl_log` without destroying it. This can be +// used to change the log function, log context or log level retroactively. +// `params` defaults to `&pl_log_default_params` if left as NULL. +// +// Returns the previous params, atomically. +PL_API struct pl_log_params pl_log_update(pl_log log, const struct pl_log_params *params); + +// Like `pl_log_update` but only updates the log level, leaving the log +// callback intact. 
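+//
+// For example (editorial sketch, not part of the upstream header),
+// temporarily raising verbosity around a problematic call, where
+// `do_something_noisy()` is a stand-in for application code:
+//
+//     enum pl_log_level old = pl_log_level_update(log, PL_LOG_TRACE);
+//     do_something_noisy();
+//     pl_log_level_update(log, old);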
+// +// Returns the previous log level, atomically. +PL_API enum pl_log_level pl_log_level_update(pl_log log, enum pl_log_level level); + +// Two simple, stream-based loggers. You can use these as the log_cb. If you +// also set log_priv to a FILE* (e.g. stdout or stderr) it will be printed +// there; otherwise, it will be printed to stdout or stderr depending on the +// log level. +// +// The version with colors will use ANSI escape sequences to indicate the log +// level. The version without will use explicit prefixes. +PL_API void pl_log_simple(void *stream, enum pl_log_level level, const char *msg); +PL_API void pl_log_color(void *stream, enum pl_log_level level, const char *msg); + +// Backwards compatibility with older versions of libplacebo +#define pl_context pl_log +#define pl_context_params pl_log_params + +PL_API_END + +#endif // LIBPLACEBO_LOG_H_ diff --git a/src/include/libplacebo/meson.build b/src/include/libplacebo/meson.build new file mode 100644 index 0000000..2f4631e --- /dev/null +++ b/src/include/libplacebo/meson.build @@ -0,0 +1,6 @@ +sources += configure_file( + input: 'config.h.in', + output: 'config.h', + install_dir: get_option('includedir') / meson.project_name(), + configuration: conf_public, +) diff --git a/src/include/libplacebo/opengl.h b/src/include/libplacebo/opengl.h new file mode 100644 index 0000000..46597b2 --- /dev/null +++ b/src/include/libplacebo/opengl.h @@ -0,0 +1,230 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_OPENGL_H_ +#define LIBPLACEBO_OPENGL_H_ + +#include <string.h> + +#include <libplacebo/gpu.h> +#include <libplacebo/swapchain.h> + +PL_API_BEGIN + +// Note on thread safety: The thread safety of `pl_opengl` and any associated +// GPU objects follows the same thread safety rules as the underlying OpenGL +// context. In other words, they must only be called from the thread the OpenGL +// context is current on. + +typedef const struct pl_opengl_t { + pl_gpu gpu; + + // Detected GL version + int major, minor; + + // List of GL/EGL extensions, provided for convenience + const char * const *extensions; + int num_extensions; +} *pl_opengl; + +static inline bool pl_opengl_has_ext(pl_opengl gl, const char *ext) +{ + for (int i = 0; i < gl->num_extensions; i++) + if (!strcmp(ext, gl->extensions[i])) + return true; + return false; +} + +typedef void (*pl_voidfunc_t)(void); + +struct pl_opengl_params { + // Main gl*GetProcAddr function. This will be used to load all GL/EGL + // functions. Optional - if unspecified, libplacebo will default to an + // internal loading logic which should work on most platforms. + pl_voidfunc_t (*get_proc_addr_ex)(void *proc_ctx, const char *procname); + void *proc_ctx; + + // Simpler API for backwards compatibility / convenience. 
(This one + // directly matches the signature of most gl*GetProcAddr library functions) + pl_voidfunc_t (*get_proc_addr)(const char *procname); + + // Enable OpenGL debug report callbacks. May have little effect depending + // on whether or not the GL context was initialized with appropriate + // debugging enabled. + bool debug; + + // Allow the use of (suspected) software rasterizers and renderers. These + // can be useful for debugging purposes, but normally, their use is + // undesirable when GPU-accelerated processing is expected. + bool allow_software; + + // Restrict the maximum allowed GLSL version. (Mainly for testing) + int max_glsl_version; + + // Optional. Required when importing/exporting dmabufs as textures. + void *egl_display; + void *egl_context; + + // Optional callbacks to bind/release the OpenGL context on the current + // thread. If these are specified, then the resulting `pl_gpu` will have + // `pl_gpu_limits.thread_safe` enabled, and may therefore be used from any + // thread without first needing to bind the OpenGL context. + // + // If the user is re-using the same OpenGL context in non-libplacebo code, + // then these callbacks should include whatever synchronization is + // necessary to prevent simultaneous use between libplacebo and the user. + bool (*make_current)(void *priv); + void (*release_current)(void *priv); + void *priv; +}; + +// Default/recommended parameters +#define pl_opengl_params(...) (&(struct pl_opengl_params) { __VA_ARGS__ }) +PL_API extern const struct pl_opengl_params pl_opengl_default_params; + +// Creates a new OpenGL renderer based on the given parameters. This will +// internally use whatever platform-defined mechanism (WGL, X11, EGL) is +// appropriate for loading the OpenGL function calls, so the user doesn't need +// to pass in a `getProcAddress` callback. If `params` is left as NULL, it +// defaults to `&pl_opengl_default_params`. The context must be active when +// calling this function, and must remain active whenever calling any +// libplacebo function on the resulting `pl_opengl` or `pl_gpu`. +// +// Note that creating multiple `pl_opengl` instances from the same OpenGL +// context is undefined behavior. +PL_API pl_opengl pl_opengl_create(pl_log log, const struct pl_opengl_params *params); + +// All resources allocated from the `pl_gpu` contained by this `pl_opengl` must +// be explicitly destroyed by the user before calling `pl_opengl_destroy`. +PL_API void pl_opengl_destroy(pl_opengl *gl); + +// For a `pl_gpu` backed by `pl_opengl`, this function can be used to retrieve +// the underlying `pl_opengl`. Returns NULL for any other type of `gpu`. +PL_API pl_opengl pl_opengl_get(pl_gpu gpu); + +struct pl_opengl_framebuffer { + // ID of the framebuffer, or 0 to use the context's default framebuffer. + int id; + + // If true, then the framebuffer is assumed to be "flipped" relative to + // normal GL semantics, i.e. set this to `true` if the first pixel is the + // top left corner. + bool flipped; +}; + +struct pl_opengl_swapchain_params { + // Set this to the platform-specific function to swap buffers, e.g. + // glXSwapBuffers, eglSwapBuffers etc. This will be called internally by + // `pl_swapchain_swap_buffers`. Required, unless you never call that + // function. + void (*swap_buffers)(void *priv); + + // Initial framebuffer description. This can be changed later on using + // `pl_opengl_swapchain_update_fb`. + struct pl_opengl_framebuffer framebuffer; + + // Attempt forcing a specific latency. 
If this is nonzero, then + // `pl_swapchain_swap_buffers` will wait until fewer than N frames are "in + // flight" before returning. Setting this to a high number generally + // accomplished nothing, because the OpenGL driver typically limits the + // number of buffers on its own. But setting it to a low number like 2 or + // even 1 can reduce latency (at the cost of throughput). + int max_swapchain_depth; + + // Arbitrary user pointer that gets passed to `swap_buffers` etc. + void *priv; +}; + +#define pl_opengl_swapchain_params(...) (&(struct pl_opengl_swapchain_params) { __VA_ARGS__ }) + +// Creates an instance of `pl_swapchain` tied to the active context. +// Note: Due to OpenGL semantics, users *must* call `pl_swapchain_resize` +// before attempting to use this swapchain, otherwise calls to +// `pl_swapchain_start_frame` will fail. +PL_API pl_swapchain pl_opengl_create_swapchain(pl_opengl gl, + const struct pl_opengl_swapchain_params *params); + +// Update the framebuffer description. After calling this function, users +// *must* call `pl_swapchain_resize` before attempting to use the swapchain +// again, otherwise calls to `pl_swapchain_start_frame` will fail. +PL_API void pl_opengl_swapchain_update_fb(pl_swapchain sw, + const struct pl_opengl_framebuffer *fb); + +struct pl_opengl_wrap_params { + // The GLuint texture object itself. Optional. If no texture is provided, + // then only the opaque framebuffer `fbo` will be wrapped, leaving the + // resulting `pl_tex` object with some operations (such as sampling) being + // unsupported. + unsigned int texture; + + // The GLuint associated framebuffer. Optional. If this is not specified, + // then libplacebo will attempt creating a framebuffer from the provided + // texture object (if possible). + // + // Note: As a special case, if neither a texture nor an FBO are provided, + // this is equivalent to wrapping the OpenGL default framebuffer (id 0). + unsigned int framebuffer; + + // The image's dimensions (unused dimensions must be 0) + int width; + int height; + int depth; + + // Texture-specific fields: + // + // Note: These are only relevant if `texture` is provided. + + // The GLenum for the texture target to use, e.g. GL_TEXTURE_2D. Optional. + // If this is left as 0, the target is inferred from the number of + // dimensions. Users may want to set this to something specific like + // GL_TEXTURE_EXTERNAL_OES depending on the nature of the texture. + unsigned int target; + + // The texture's GLint sized internal format (e.g. GL_RGBA16F). Required. + int iformat; +}; + +#define pl_opengl_wrap_params(...) (&(struct pl_opengl_wrap_params) { __VA_ARGS__ }) + +// Wraps an external OpenGL object into a `pl_tex` abstraction. Due to the +// internally synchronized nature of OpenGL, no explicit synchronization +// is needed between libplacebo `pl_tex_` operations, and host accesses to +// the texture. Wrapping the same OpenGL texture multiple times is permitted. +// Note that this function transfers no ownership. +// +// This wrapper can be destroyed by simply calling `pl_tex_destroy` on it, +// which will *not* destroy the user-provided OpenGL texture or framebuffer. +// +// This function may fail, in which case it returns NULL. +PL_API pl_tex pl_opengl_wrap(pl_gpu gpu, const struct pl_opengl_wrap_params *params); + +// Analogous to `pl_opengl_wrap`, this function takes any `pl_tex` (including +// ones created by `pl_tex_create`) and unwraps it to expose the underlying +// OpenGL texture to the user. 
Note that this function transfers no ownership, +// i.e. the texture object and framebuffer shall not be destroyed by the user. +// +// Returns the OpenGL texture. `out_target` and `out_iformat` will be updated +// to hold the target type and internal format, respectively. (Optional) +// +// For renderable/blittable textures, `out_fbo` will be updated to the ID of +// the framebuffer attached to this texture, or 0 if there is none. (Optional) +PL_API unsigned int pl_opengl_unwrap(pl_gpu gpu, pl_tex tex, unsigned int *out_target, + int *out_iformat, unsigned int *out_fbo); + +PL_API_END + +#endif // LIBPLACEBO_OPENGL_H_ diff --git a/src/include/libplacebo/options.h b/src/include/libplacebo/options.h new file mode 100644 index 0000000..e40f5e7 --- /dev/null +++ b/src/include/libplacebo/options.h @@ -0,0 +1,201 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_OPTIONS_H_ +#define LIBPLACEBO_OPTIONS_H_ + +#include <libplacebo/renderer.h> + +PL_API_BEGIN + +// High-level heap-managed struct containing storage for all options implied by +// pl_render_params, including a high-level interface for serializing, +// deserializing and interfacing with them in a programmatic way. + +typedef const struct pl_opt_t *pl_opt; +typedef struct pl_options_t { + // Non-NULL `params.*_params` pointers must always point into this struct + struct pl_render_params params; + + // Backing storage for all of the various rendering parameters. Whether + // or not these params are active is determined by whether or not + // `params.*_params` is set to this address or NULL. + struct pl_deband_params deband_params; + struct pl_sigmoid_params sigmoid_params; + struct pl_color_adjustment color_adjustment; + struct pl_peak_detect_params peak_detect_params; + struct pl_color_map_params color_map_params; + struct pl_dither_params dither_params; + struct pl_icc_params icc_params PL_DEPRECATED; + struct pl_cone_params cone_params; + struct pl_blend_params blend_params; + struct pl_deinterlace_params deinterlace_params; + struct pl_distort_params distort_params; + + // Backing storage for "custom" scalers. `params.upscaler` etc. will + // always be a pointer either to a built-in pl_filter_config, or one of + // these structs. `name`, `description` and `allowed` will always be + // valid for the respective type of filter config. + struct pl_filter_config upscaler; + struct pl_filter_config downscaler; + struct pl_filter_config plane_upscaler; + struct pl_filter_config plane_downscaler; + struct pl_filter_config frame_mixer; +} *pl_options; + +// Allocate a new set of render params, with internally backed storage for +// all parameters. Initialized to an "empty" config (PL_RENDER_DEFAULTS), +// equivalent to `&pl_render_fast_params`. 
To initialize the struct instead to +// the recommended default parameters, use `pl_options_reset` with +// `pl_render_default_params`. +// +// If `log` is provided, errors related to parsing etc. will be logged there. +PL_API pl_options pl_options_alloc(pl_log log); +PL_API void pl_options_free(pl_options *opts); + +// Resets all options to their default values from a given struct. If `preset` +// is NULL, `opts` is instead reset back to the initial "empty" configuration, +// with all options disabled, as if it was freshly allocated. +// +// Note: This function will also reset structs which were not included in +// `preset`, such as any custom upscalers. +PL_API void pl_options_reset(pl_options opts, const struct pl_render_params *preset); + +typedef const struct pl_opt_data_t { + // Original options struct. + pl_options opts; + + // Triggering option for this callback invocation. + pl_opt opt; + + // The raw data associated with this option. Always some pointer into + // `opts`. Note that only PL_OPT_BOOL, PL_OPT_INT and PL_OPT_FLOAT have + // a fixed representation, for other fields its usefulness is dubious. + const void *value; + + // The underlying data, as a formatted, locale-invariant string. Lifetime + // is limited until the return of this callback. + const char *text; +} *pl_opt_data; + +// Query a single option from `opts` by key, or NULL if none was found. +// The resulting pointer is only valid until the next pl_options_* call. +PL_API pl_opt_data pl_options_get(pl_options opts, const char *key); + +// Update an option from a formatted value string (see `pl_opt_data.text`). +// This can be used for all type of options, even non-string ones. In this case, +// `value` will be parsed according to the option type. +// +// Returns whether successful. +PL_API bool pl_options_set_str(pl_options opts, const char *key, const char *value); + +// Programmatically iterate over options set in a `pl_options`, running the +// provided callback on each entry. +PL_API void pl_options_iterate(pl_options opts, + void (*cb)(void *priv, pl_opt_data data), + void *priv); + +// Serialize a `pl_options` structs to a comma-separated key/value string. The +// returned string has a lifetime valid until either the next call to +// `pl_options_save`, or until the `pl_options` is freed. +PL_API const char *pl_options_save(pl_options opts); + +// Parse a `pl_options` struct from a key/value string, in standard syntax +// "key1=value1,key2=value2,...", and updates `opts` with the new values. +// Valid separators include whitespace, commas (,) and (semi)colons (:;). +// +// Returns true if no errors occurred. +PL_API bool pl_options_load(pl_options opts, const char *str); + +// Helpers for interfacing with `opts->params.hooks`. Note that using any of +// these helpers will overwrite the array by an internally managed pointer, +// so care must be taken when combining them with external management of +// this memory. Negative indices are possible and are counted relative to the +// end of the list. +// +// Note: These hooks are *not* included in pl_options_save() and related. +PL_API void pl_options_add_hook(pl_options opts, const struct pl_hook *hook); +PL_API void pl_options_insert_hook(pl_options opts, const struct pl_hook *hook, int idx); +PL_API void pl_options_remove_hook_at(pl_options opts, int idx); + +// Underlying options system and list +// +// Note: By necessity, this option list does not cover every single field +// present in `pl_render_params`. 
In particular, fields like `info_callback`, +// `lut` and `hooks` cannot be configured through the options system, as doing +// so would require interop with C code or I/O. (However, see +// `pl_options_add_hook` and related) + +enum pl_option_type { + // Accepts `yes/no`, `on/off`, `true/false` and variants + PL_OPT_BOOL, + + // Parsed as human-readable locale-invariant (C) numbers, scientific + // notation accepted for floats + PL_OPT_INT, + PL_OPT_FLOAT, + + // Parsed as a short string containing only alphanumerics and _-, + // corresponding to some name/identifier. Catch-all bucket for several + // other types of options, such as presets, struct pointers, and functions + // + // Note: These options do not correspond to actual strings in C, the + // underlying type of option will determine the values of `size` and + // corresponding interpretation of pointers. + PL_OPT_STRING, + + PL_OPT_TYPE_COUNT, +}; + +struct pl_opt_t { + // Programmatic key uniquely identifying this option. + const char *key; + + // Longer, human readable friendly name + const char *name; + + // Data type of option, affects how it is parsed. This field is purely + // informative for the user, the actual implementation may vary. + enum pl_option_type type; + + // Minimum/maximum value ranges for numeric options (int / float) + // If both are 0.0, these limits are disabled/ignored. + float min, max; + + // If true, this option is considered deprecated and may be removed + // in the future. + bool deprecated; + + // If true, this option is considered a 'preset' (read-only), which can + // be loaded but not saved. (The equivalent underlying options this preset + // corresponds to will be saved instead) + bool preset; + + // Internal implementation details (for parsing/saving), opaque to user + const void *priv; +}; + +// A list of options, terminated by {0} for convenience +PL_API extern const struct pl_opt_t pl_option_list[]; +PL_API extern const int pl_option_count; // excluding terminating {0} + +// Returns the `pl_option` associated with a given key, or NULL +PL_API pl_opt pl_find_option(const char *key); + +PL_API_END + +#endif // LIBPLACEBO_OPTIONS_H_ diff --git a/src/include/libplacebo/renderer.h b/src/include/libplacebo/renderer.h new file mode 100644 index 0000000..d2e01e4 --- /dev/null +++ b/src/include/libplacebo/renderer.h @@ -0,0 +1,847 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */
+
+#ifndef LIBPLACEBO_RENDERER_H_
+#define LIBPLACEBO_RENDERER_H_
+
+#include <libplacebo/config.h>
+#include <libplacebo/colorspace.h>
+#include <libplacebo/filters.h>
+#include <libplacebo/gpu.h>
+#include <libplacebo/shaders/colorspace.h>
+#include <libplacebo/shaders/deinterlacing.h>
+#include <libplacebo/shaders/dithering.h>
+#include <libplacebo/shaders/film_grain.h>
+#include <libplacebo/shaders/icc.h>
+#include <libplacebo/shaders/lut.h>
+#include <libplacebo/shaders/sampling.h>
+#include <libplacebo/shaders/custom.h>
+#include <libplacebo/swapchain.h>
+
+PL_API_BEGIN
+
+// Thread-safety: Unsafe
+typedef struct pl_renderer_t *pl_renderer;
+
+// Enum values used in `pl_render_errors` as bit positions for error flags
+enum pl_render_error {
+    PL_RENDER_ERR_NONE              = 0,
+    PL_RENDER_ERR_FBO               = 1 << 0,
+    PL_RENDER_ERR_SAMPLING          = 1 << 1,
+    PL_RENDER_ERR_DEBANDING         = 1 << 2,
+    PL_RENDER_ERR_BLENDING          = 1 << 3,
+    PL_RENDER_ERR_OVERLAY           = 1 << 4,
+    PL_RENDER_ERR_PEAK_DETECT       = 1 << 5,
+    PL_RENDER_ERR_FILM_GRAIN        = 1 << 6,
+    PL_RENDER_ERR_FRAME_MIXING      = 1 << 7,
+    PL_RENDER_ERR_DEINTERLACING     = 1 << 8,
+    PL_RENDER_ERR_ERROR_DIFFUSION   = 1 << 9,
+    PL_RENDER_ERR_HOOKS             = 1 << 10,
+    PL_RENDER_ERR_CONTRAST_RECOVERY = 1 << 11,
+};
+
+// Struct describing the current renderer state, including internal processing
+// errors, as well as a list of signatures of disabled hooks.
+struct pl_render_errors {
+    enum pl_render_error errors;
+    // List containing signatures of disabled hooks
+    const uint64_t *disabled_hooks;
+    int num_disabled_hooks;
+};
+
+// Creates a new renderer object, which is backed by a GPU context. This is a
+// high-level object that takes care of the rendering chain as a whole, from
+// the source textures to the finished frame.
+PL_API pl_renderer pl_renderer_create(pl_log log, pl_gpu gpu);
+PL_API void pl_renderer_destroy(pl_renderer *rr);
+
+// Returns the current renderer state, see pl_render_errors.
+PL_API struct pl_render_errors pl_renderer_get_errors(pl_renderer rr);
+
+// Clears the error state of the renderer. If `errors` is NULL, all render
+// errors will be cleared. Otherwise only the selected errors/hooks will be
+// cleared. If `PL_RENDER_ERR_HOOKS` is set and `num_disabled_hooks` is 0, all
+// hooks are cleared. Otherwise only the selected hooks will be cleared, based
+// on the `disabled_hooks` array.
+PL_API void pl_renderer_reset_errors(pl_renderer rr,
+                                     const struct pl_render_errors *errors);
+
+enum pl_lut_type {
+    PL_LUT_UNKNOWN = 0,
+    PL_LUT_NATIVE,      // applied to raw image contents (after fixing bit depth)
+    PL_LUT_NORMALIZED,  // applied to normalized (HDR) RGB values
+    PL_LUT_CONVERSION,  // LUT fully replaces color conversion
+
+    // Note: When using a PL_LUT_CONVERSION to replace the YUV->RGB conversion,
+    // `pl_render_params.color_adjustment` is no longer applied. Similarly,
+    // when using a PL_LUT_CONVERSION to replace the image->target color space
+    // conversion, `pl_render_params.color_map_params` are ignored.
+    //
+    // Note: For LUTs attached to the output frame, PL_LUT_CONVERSION should
+    // instead perform the inverse (RGB->native) conversion.
+    //
+    // Note: PL_LUT_UNKNOWN tries inferring the meaning of the LUT from the
+    // LUT's tagged metadata, and otherwise falls back to PL_LUT_NATIVE.
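+    //
+    // Example (illustrative): attaching a custom LUT (e.g. one parsed via
+    // <libplacebo/shaders/lut.h>) to the render params as a linear-light LUT,
+    // with `params` being a hypothetical `struct pl_render_params`:
+    //
+    //   params.lut      = my_lut;  // hypothetical `const struct pl_custom_lut *`
+    //   params.lut_type = PL_LUT_NORMALIZED;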
+}; + +enum pl_render_stage { + PL_RENDER_STAGE_FRAME, // full frame redraws, for fresh/uncached frames + PL_RENDER_STAGE_BLEND, // the output blend pass (only for pl_render_image_mix) + PL_RENDER_STAGE_COUNT, +}; + +struct pl_render_info { + const struct pl_dispatch_info *pass; // information about the shader + enum pl_render_stage stage; // the associated render stage + + // This specifies the chronological index of this pass within the frame and + // stage (starting at `index == 0`). + int index; + + // For PL_RENDER_STAGE_BLEND, this specifies the number of frames + // being blended (since that results in a different shader). + int count; +}; + +// Represents the options used for rendering. These affect the quality of +// the result. +struct pl_render_params { + // Configures the algorithms used for upscaling and downscaling, + // respectively. If left as NULL, then libplacebo will only use inexpensive + // sampling (bilinear or nearest neighbour depending on the capabilities + // of the hardware / texture). + // + // Note: Setting `downscaler` to NULL also implies `skip_anti_aliasing`, + // since the built-in GPU sampling algorithms can't anti-alias. + // + // Note: If set to the same address as the built-in `pl_filter_bicubic`, + // `pl_filter_nearest` etc.; libplacebo will also use the more efficient + // direct sampling algorithm where possible without quality loss. + const struct pl_filter_config *upscaler; + const struct pl_filter_config *downscaler; + + // If set, this overrides the value of `upscaler`/`downscaling` for + // subsampled (chroma) planes. These scalers are used whenever the size of + // multiple different `pl_plane`s in a single `pl_frame` differ, requiring + // adaptation when converting to/from RGB. Note that a value of NULL simply + // means "no override". To force built-in scaling explicitly, set this to + // `&pl_filter_bilinear`. + const struct pl_filter_config *plane_upscaler; + const struct pl_filter_config *plane_downscaler; + + // The anti-ringing strength to apply to filters. See the equivalent option + // in `pl_sample_filter_params` for more information. + float antiringing_strength; + + // Configures the algorithm used for frame mixing (when using + // `pl_render_image_mix`). Ignored otherwise. As a special requirement, + // this must be a filter config with `polar` set to false, since it's only + // used for 1D mixing and thus only 1D filters are compatible. + // + // If set to NULL, frame mixing is disabled, in which case + // `pl_render_image_mix` will use nearest-neighbour semantics. (Note that + // this still goes through the redraw cache, unless you also enable + // `skip_caching_single_frame`) + const struct pl_filter_config *frame_mixer; + + // Configures the settings used to deband source textures. Leaving this as + // NULL disables debanding. + // + // Note: The `deband_params.grain` setting is automatically adjusted to + // prevent blowing up on HDR sources. The user need not account for this. + const struct pl_deband_params *deband_params; + + // Configures the settings used to sigmoidize the image before upscaling. + // This is not always used. If NULL, disables sigmoidization. + const struct pl_sigmoid_params *sigmoid_params; + + // Configures the color adjustment parameters used to decode the color. + // This can be used to apply additional artistic settings such as + // desaturation, etc. If NULL, defaults to &pl_color_adjustment_neutral. 
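+    //
+    // Example (illustrative): a mild saturation boost, leaving all other
+    // adjustments at their neutral values (field names assumed from
+    // <libplacebo/colorspace.h>):
+    //
+    //   struct pl_color_adjustment adj = pl_color_adjustment_neutral;
+    //   adj.saturation = 1.2f;
+    //   params.color_adjustment = &adj;  // `params` being this struct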
+ const struct pl_color_adjustment *color_adjustment; + + // Configures the settings used to detect the peak of the source content, + // for HDR sources. Has no effect on SDR content. If NULL, peak detection + // is disabled. + const struct pl_peak_detect_params *peak_detect_params; + + // Configures the settings used to tone map from HDR to SDR, or from higher + // gamut to standard gamut content. If NULL, defaults to + // `&pl_color_map_default_params`. + const struct pl_color_map_params *color_map_params; + + // Configures the settings used to dither to the output depth. Leaving this + // as NULL disables dithering. + const struct pl_dither_params *dither_params; + + // Configures the error diffusion kernel to use for error diffusion + // dithering. If set, this will be used instead of `dither_params` whenever + // possible. Leaving this as NULL disables error diffusion. + const struct pl_error_diffusion_kernel *error_diffusion; + + // Configures the settings used to simulate color blindness, if desired. + // If NULL, this feature is disabled. + const struct pl_cone_params *cone_params; + + // Configures output blending. When rendering to the final target, the + // framebuffer contents will be blended using this blend mode. Requires + // that the target format has PL_FMT_CAP_BLENDABLE. NULL disables blending. + const struct pl_blend_params *blend_params; + + // Configures the settings used to deinterlace frames (see + // `pl_frame.field`), if required.. If NULL, deinterlacing is "disabled", + // meaning interlaced frames are rendered as weaved frames instead. + // + // Note: As a consequence of how `pl_frame` represents individual fields, + // and especially when using the `pl_queue`, this will still result in + // frames being redundantly rendered twice. As such, it's highly + // recommended to, instead, fully disable deinterlacing by not marking + // source frames as interlaced in the first place. + const struct pl_deinterlace_params *deinterlace_params; + + // If set, applies an extra distortion matrix to the image, after + // scaling and before presenting it to the screen. Can be used for e.g. + // fractional rotation. + // + // Note: The distortion canvas will be set to the size of `target->crop`, + // so this cannot effectively draw outside the specified target area, + // nor change the aspect ratio of the image. + const struct pl_distort_params *distort_params; + + // List of custom user shaders / hooks. + // See <libplacebo/shaders/custom.h> for more information. + const struct pl_hook * const *hooks; + int num_hooks; + + // Color mapping LUT. If present, this will be applied as part of the + // image being rendered, in normalized RGB space. + // + // Note: In this context, PL_LUT_NATIVE means "gamma light" and + // PL_LUT_NORMALIZED means "linear light". For HDR signals, normalized LUTs + // are scaled so 1.0 corresponds to the `pl_color_transfer_nominal_peak`. + // + // Note: A PL_LUT_CONVERSION fully replaces the color adaptation from + // `image` to `target`, including any tone-mapping (if necessary) and ICC + // profiles. It has the same representation as PL_LUT_NATIVE, so in this + // case the input and output are (respectively) non-linear light RGB. + const struct pl_custom_lut *lut; + enum pl_lut_type lut_type; + + // If the image being rendered does not span the entire size of the target, + // it will be cleared explicitly using this background color (RGB). To + // disable this logic, set `skip_target_clearing`. 
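+    //
+    // For example (illustrative), as part of a `pl_render_params(...)`
+    // initializer (defined further below), clearing to dark gray instead of
+    // black:
+    //
+    //   .background_color = {0.05f, 0.05f, 0.05f},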
+    float background_color[3];
+    float background_transparency; // 0.0 for opaque, 1.0 for fully transparent
+    bool skip_target_clearing;
+
+    // If set to a value above 0.0, the output will be rendered with rounded
+    // corners, as if an alpha transparency mask had been applied. The value
+    // indicates the relative fraction of the side length to round - a value
+    // of 1.0 rounds the corners as much as possible.
+    float corner_rounding;
+
+    // If true, then transparent images will be made opaque by painting them
+    // against a checkerboard pattern consisting of alternating colors. If both
+    // colors are left as {0}, they default respectively to 93% and 87% gray.
+    bool blend_against_tiles;
+    float tile_colors[2][3];
+    int tile_size;
+
+    // --- Performance / quality trade-off options:
+    // These should generally be left off where quality is desired, as they can
+    // degrade the result quite noticeably; but may be useful for older or
+    // slower hardware. Note that libplacebo will automatically disable
+    // advanced features on hardware where they are unsupported, regardless of
+    // these settings. So only enable them if you need a performance bump.
+
+    // Disables anti-aliasing on downscaling. This will result in moiré
+    // artifacts and nasty, jagged pixels when downscaling, except for some
+    // very limited special cases (e.g. bilinear downsampling to exactly 0.5x).
+    //
+    // Significantly speeds up downscaling with high downscaling ratios.
+    bool skip_anti_aliasing;
+
+    // Normally, when the size of the `target` used with `pl_render_image_mix`
+    // changes, or the render parameters are updated, the internal cache of
+    // mixed frames must be discarded in order to re-render all required
+    // frames. Setting this option to `true` will skip the cache invalidation
+    // and instead re-use the existing frames (with bilinear scaling to the new
+    // size if necessary), which comes at a quality loss shortly after a
+    // resize, but should make it much smoother.
+    bool preserve_mixing_cache;
+
+    // --- Performance tuning / debugging options
+    // These may affect performance or may make debugging problems easier,
+    // but shouldn't have any effect on the quality.
+
+    // Normally, `pl_render_image_mix` will also push single frames through the
+    // mixer cache, in order to speed up re-draws. Enabling this option
+    // disables that logic, causing single frames to bypass the cache. (Though
+    // it will still read from it, if they happen to already be cached)
+    bool skip_caching_single_frame;
+
+    // Disables linearization / sigmoidization before scaling. This might be
+    // useful when tracking down unexpected image artifacts or excessive
+    // ringing, but it shouldn't normally be necessary.
+    bool disable_linear_scaling;
+
+    // Forces the use of the "general" scaling algorithms even when using the
+    // special-cased built-in presets like `pl_filter_bicubic`. Basically, this
+    // disables the more efficient implementations in favor of the slower,
+    // general-purpose ones.
+    bool disable_builtin_scalers;
+
+    // Forces correction of subpixel offsets (using the configured `upscaler`).
+    bool correct_subpixel_offsets;
+
+    // Forces the use of dithering, even when rendering to 16-bit FBOs. This is
+    // generally pretty pointless because most 16-bit FBOs have high enough
+    // depth that rounding errors are below the human perception threshold,
+    // but this can be used to test the dither code.
+ bool force_dither; + + // Disables the gamma-correct dithering logic which normally applies when + // dithering to low bit depths. No real use, outside of testing. + bool disable_dither_gamma_correction; + + // Completely overrides the use of FBOs, as if there were no renderable + // texture format available. This disables most features. + bool disable_fbos; + + // Use only low-bit-depth FBOs (8 bits). Note that this also implies + // disabling linear scaling and sigmoidization. + bool force_low_bit_depth_fbos; + + // If this is true, all shaders will be generated as "dynamic" shaders, + // with any compile-time constants being replaced by runtime-adjustable + // values. This is generally a performance loss, but has the advantage of + // being able to freely change parameters without triggering shader + // recompilations. + // + // It's a good idea to enable while presenting configurable settings to the + // user, but it should be set to false once those values are "dialed in". + bool dynamic_constants; + + // This callback is invoked for every pass successfully executed in the + // process of rendering a frame. Optional. + // + // Note: `info` is only valid until this function returns. + void (*info_callback)(void *priv, const struct pl_render_info *info); + void *info_priv; + + // --- Deprecated/removed fields + bool allow_delayed_peak_detect PL_DEPRECATED; // moved to pl_peak_detect_params + const struct pl_icc_params *icc_params PL_DEPRECATED; // use pl_frame.icc + bool ignore_icc_profiles PL_DEPRECATED; // non-functional, just set pl_frame.icc to NULL + int lut_entries PL_DEPRECATED; // hard-coded as 256 + float polar_cutoff PL_DEPRECATED; // hard-coded as 1e-3 +}; + +// Bare minimum parameters, with no features enabled. This is the fastest +// possible configuration, and should therefore be fine on any system. +#define PL_RENDER_DEFAULTS \ + .color_map_params = &pl_color_map_default_params, \ + .color_adjustment = &pl_color_adjustment_neutral, \ + .tile_colors = {{0.93, 0.93, 0.93}, \ + {0.87, 0.87, 0.87}}, \ + .tile_size = 32, + +#define pl_render_params(...) (&(struct pl_render_params) { PL_RENDER_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_render_params pl_render_fast_params; + +// This contains the default/recommended options for reasonable image quality, +// while also not being too terribly slow. All of the *_params structs are +// defaulted to the corresponding *_default_params, except for deband_params, +// which is disabled by default. +// +// This should be fine on most integrated GPUs, but if it's too slow, +// consider using `pl_render_fast_params` instead. +PL_API extern const struct pl_render_params pl_render_default_params; + +// This contains a higher quality preset for better image quality at the cost +// of quite a bit of performance. In addition to the settings implied by +// `pl_render_default_params`, it enables debanding, sets the upscaler to +// `pl_filter_ewa_lanczossharp`, and uses pl_*_high_quality_params structs where +// available. This should only really be used with a discrete GPU and where +// maximum image quality is desired. +PL_API extern const struct pl_render_params pl_render_high_quality_params; + +#define PL_MAX_PLANES 4 + +// High level description of a single slice of an image. This basically +// represents a single 2D plane, with any number of components +struct pl_plane { + // The texture underlying this plane. 
The texture must be 2D, and must + // have specific parameters set depending on what the plane is being used + // for (see `pl_render_image`). + pl_tex texture; + + // The preferred behaviour when sampling outside of this texture. Optional, + // since the default (PL_TEX_ADDRESS_CLAMP) is very reasonable. + enum pl_tex_address_mode address_mode; + + // Controls whether or not the `texture` will be considered flipped + // vertically with respect to the overall image dimensions. It's generally + // preferable to flip planes using this setting instead of the crop in + // cases where the flipping is the result of e.g. negative plane strides or + // flipped framebuffers (OpenGL). + // + // Note that any planar padding (due to e.g. size mismatch or misalignment + // of subsampled planes) is always at the physical end of the texture + // (highest y coordinate) - even if this bool is true. However, any + // subsampling shift (`shift_y`) is applied with respect to the flipped + // direction. This ensures the correct interpretation when e.g. vertically + // flipping 4:2:0 sources by flipping all planes. + bool flipped; + + // Describes the number and interpretation of the components in this plane. + // This defines the mapping from component index to the canonical component + // order (RGBA, YCbCrA or XYZA). It's worth pointing out that this is + // completely separate from `texture->format.sample_order`. The latter is + // essentially irrelevant/transparent for the API user, since it just + // determines which order the texture data shows up as inside the GLSL + // shader; whereas this field controls the actual meaning of the component. + // + // Example; if the user has a plane with just {Y} and a plane with just + // {Cb Cr}, and a GPU that only supports bgra formats, you would still + // specify the component mapping as {0} and {1 2} respectively, even though + // the GPU is sampling the data in the order BGRA. Use -1 for "ignored" + // components. + int components; // number of relevant components + int component_mapping[4]; // semantic index of each component + + // Controls the sample offset, relative to the "reference" dimensions. For + // an example of what to set here, see `pl_chroma_location_offset`. Note + // that this is given in unit of reference pixels. For a graphical example, + // imagine you have a 2x2 image with a 1x1 (subsampled) plane. Without any + // shift (0.0), the situation looks like this: + // + // X-------X X = reference pixel + // | | P = plane pixel + // | P | + // | | + // X-------X + // + // For 4:2:0 subsampling, this corresponds to PL_CHROMA_CENTER. If the + // shift_x was instead set to -0.5, the `P` pixel would be offset to the + // left by half the separation between the reference (`X` pixels), resulting + // in the following: + // + // X-------X X = reference pixel + // | | P = plane pixel + // P | + // | | + // X-------X + // + // For 4:2:0 subsampling, this corresponds to PL_CHROMA_LEFT. + // + // Note: It's recommended to fill this using `pl_chroma_location_offset` on + // the chroma planes. 
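+    //
+    // Example (illustrative): for typical 4:2:0 content with left-sited
+    // chroma, these shifts can be filled in for all subsampled planes at once
+    // using the helper declared later in this header:
+    //
+    //   pl_frame_set_chroma_location(&frame, PL_CHROMA_LEFT);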
+ float shift_x, shift_y; +}; + +enum pl_overlay_mode { + PL_OVERLAY_NORMAL = 0, // treat the texture as a normal, full-color texture + PL_OVERLAY_MONOCHROME, // treat the texture as a single-component alpha map + PL_OVERLAY_MODE_COUNT, +}; + +enum pl_overlay_coords { + PL_OVERLAY_COORDS_AUTO = 0, // equal to SRC/DST_FRAME, respectively + PL_OVERLAY_COORDS_SRC_FRAME, // relative to the raw src frame + PL_OVERLAY_COORDS_SRC_CROP, // relative to the src frame crop + PL_OVERLAY_COORDS_DST_FRAME, // relative to the raw dst frame + PL_OVERLAY_COORDS_DST_CROP, // relative to the dst frame crop + PL_OVERLAY_COORDS_COUNT, + + // Note on rotations: If there is an end-to-end rotation between `src` and + // `dst`, then any overlays relative to SRC_FRAME or SRC_CROP will be + // rotated alongside the image, while overlays relative to DST_FRAME or + // DST_CROP will not. +}; + +struct pl_overlay_part { + pl_rect2df src; // source coordinate with respect to `pl_overlay.tex` + pl_rect2df dst; // target coordinates with respect to `pl_overlay.coords` + + // If `mode` is PL_OVERLAY_MONOCHROME, then this specifies the color of + // this overlay part. The color is multiplied into the sampled texture's + // first channel. + float color[4]; +}; + +// A struct representing an image overlay (e.g. for subtitles or on-screen +// status messages, controls, ...) +struct pl_overlay { + // The texture containing the backing data for overlay parts. Must have + // `params.sampleable` set. + pl_tex tex; + + // This controls the coloring mode of this overlay. + enum pl_overlay_mode mode; + + // Controls which coordinates this overlay is addressed relative to. + enum pl_overlay_coords coords; + + // This controls the colorspace information for this overlay. The contents + // of the texture / the value of `color` are interpreted according to this. + struct pl_color_repr repr; + struct pl_color_space color; + + // The number of parts for this overlay. + const struct pl_overlay_part *parts; + int num_parts; +}; + +// High-level description of a complete frame, including metadata and planes +struct pl_frame { + // Each frame is split up into some number of planes, each of which may + // carry several components and be of any size / offset. + int num_planes; + struct pl_plane planes[PL_MAX_PLANES]; + + // For interlaced frames. If set, this `pl_frame` corresponds to a single + // field of the underlying source textures. `first_field` indicates which + // of these fields is ordered first in time. `prev` and `next` should point + // to the previous/next frames in the file, or NULL if there are none. + // + // Note: Setting these fields on the render target has no meaning and will + // be ignored. + enum pl_field field; + enum pl_field first_field; + const struct pl_frame *prev, *next; + + // If set, will be called immediately before GPU access to this frame. This + // function *may* be used to, for example, perform synchronization with + // external APIs (e.g. `pl_vulkan_hold/release`). If your mapping requires + // a memcpy of some sort (e.g. pl_tex_transfer), users *should* instead do + // the memcpy up-front and avoid the use of these callbacks - because they + // might be called multiple times on the same frame. + // + // This function *may* arbitrarily mutate the `pl_frame`, but it *should* + // ideally only update `planes` - in particular, color metadata and so + // forth should be provided up-front as best as possible. 
Note that changes
+    // here will not be reflected back to the structs provided in the original
+    // `pl_render_*` call (e.g. via `pl_frame_mix`).
+    //
+    // Note: Unless dealing with interlaced frames, only one frame will ever be
+    // acquired at a time per `pl_render_*` call. So users *can* safely use
+    // this with, for example, hwdec mappers that can only map a single frame
+    // at a time. When using this with, for example, `pl_render_image_mix`,
+    // each frame to be blended is acquired and released in succession, before
+    // moving on to the next frame. For interlaced frames, the previous and
+    // next frames must also be acquired simultaneously.
+    bool (*acquire)(pl_gpu gpu, struct pl_frame *frame);
+
+    // If set, will be called after a plane is done being used by the GPU,
+    // *including* after any errors (e.g. `acquire` returning false).
+    void (*release)(pl_gpu gpu, struct pl_frame *frame);
+
+    // Color representation / encoding / semantics of this frame.
+    struct pl_color_repr repr;
+    struct pl_color_space color;
+
+    // Optional ICC profile associated with this frame.
+    pl_icc_object icc;
+
+    // Alternative to `icc`, this can be used in cases where allocating and
+    // tracking a pl_icc_object externally may be inconvenient. The resulting
+    // profile will be managed internally by the pl_renderer.
+    struct pl_icc_profile profile;
+
+    // Optional LUT associated with this frame.
+    const struct pl_custom_lut *lut;
+    enum pl_lut_type lut_type;
+
+    // The logical crop / rectangle containing the valid information, relative
+    // to the reference plane's dimensions (e.g. luma). Pixels outside of this
+    // rectangle will ostensibly be ignored, but note that this is not a hard
+    // guarantee. In particular, scaler filters may end up sampling outside of
+    // this crop. This rect may be flipped, and may be partially or wholly
+    // outside the bounds of the underlying textures. (Optional)
+    //
+    // Note that `pl_render_image` will map the input crop directly to the
+    // output crop, stretching and scaling as needed. If you wish to preserve
+    // the aspect ratio, use a dedicated function like pl_rect2df_aspect_copy.
+    pl_rect2df crop;
+
+    // Logical rotation of the image, with respect to the underlying planes.
+    // For example, if this is PL_ROTATION_90, then the image will be rotated
+    // to the right by 90° when mapping to `crop`. The actual position on-screen
+    // is unaffected, so users should ensure that the (rotated) aspect ratio
+    // matches the source. (Or use a helper like `pl_rect2df_aspect_set_rot`)
+    //
+    // Note: For `target` frames, this corresponds to a rotation of the
+    // display, for `image` frames, this corresponds to a rotation of the
+    // camera.
+    //
+    // So, as an example, target->rotation = PL_ROTATE_90 means the end user
+    // has rotated the display to the right by 90° (meaning rendering will be
+    // rotated 90° to the *left* to compensate), and image->rotation =
+    // PL_ROTATE_90 means the video provider has rotated the camera to the
+    // right by 90° (so rendering will be rotated 90° to the *right* to
+    // compensate).
+    pl_rotation rotation;
+
+    // A list of additional overlays associated with this frame. Note that
+    // these will be rendered directly onto intermediate/cache frames, so
+    // changing any of these overlays may require flushing the renderer cache.
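+    //
+    // Example (illustrative sketch): a single full-color OSD bitmap addressed
+    // relative to the target crop, where `osd_tex`, `osd_w` and `osd_h` are
+    // hypothetical:
+    //
+    //   struct pl_overlay_part part = {
+    //       .src = { 0, 0, osd_w, osd_h },
+    //       .dst = { 0, 0, osd_w, osd_h },
+    //   };
+    //   struct pl_overlay osd = {
+    //       .tex       = osd_tex,  // must have `params.sampleable` set
+    //       .mode      = PL_OVERLAY_NORMAL,
+    //       .coords    = PL_OVERLAY_COORDS_DST_CROP,
+    //       .repr      = pl_color_repr_rgb,
+    //       .color     = pl_color_space_srgb,
+    //       .parts     = &part,
+    //       .num_parts = 1,
+    //   };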
+ const struct pl_overlay *overlays; + int num_overlays; + + // Note on subsampling and plane correspondence: All planes belonging to + // the same frame will only be stretched by an integer multiple (or inverse + // thereof) in order to match the reference dimensions of this image. For + // example, suppose you have an 8x4 image. A valid plane scaling would be + // 4x2 -> 8x4 or 4x4 -> 4x4, but not 6x4 -> 8x4. So if a 6x4 plane is + // given, then it would be treated like a cropped 8x4 plane (since 1.0 is + // the closest scaling ratio to the actual ratio of 1.3). + // + // For an explanation of why this makes sense, consider the relatively + // common example of a subsampled, oddly sized (e.g. jpeg) image. In such + // cases, for example a 35x23 image, the 4:2:0 subsampled chroma plane + // would have to end up as 17.5x11.5, which gets rounded up to 18x12 by + // implementations. So in this example, the 18x12 chroma plane would get + // treated by libplacebo as an oversized chroma plane - i.e. the plane + // would get sampled as if it was 17.5 pixels wide and 11.5 pixels large. + + // Associated film grain data (see <libplacebo/shaders/film_grain.h>). + // + // Note: This is ignored for the `target` of `pl_render_image`, since + // un-applying grain makes little sense. + struct pl_film_grain_data film_grain; + + // Ignored by libplacebo. May be useful for users. + void *user_data; +}; + +// Helper function to infer the chroma location offset for each plane in a +// frame. This is equivalent to calling `pl_chroma_location_offset` on all +// subsampled planes' shift_x/shift_y variables. +PL_API void pl_frame_set_chroma_location(struct pl_frame *frame, + enum pl_chroma_location chroma_loc); + +// Fills in a `pl_frame` based on a swapchain frame's FBO and metadata. +PL_API void pl_frame_from_swapchain(struct pl_frame *out_frame, + const struct pl_swapchain_frame *frame); + +// Helper function to determine if a frame is logically cropped or not. In +// particular, this is useful in determining whether or not an output frame +// needs to be cleared before rendering or not. +PL_API bool pl_frame_is_cropped(const struct pl_frame *frame); + +// Helper function to reset a frame to a given RGB color. If the frame's +// color representation is something other than RGB, the clear color will +// be adjusted accordingly. `clear_color` should be non-premultiplied. +PL_API void pl_frame_clear_rgba(pl_gpu gpu, const struct pl_frame *frame, + const float clear_color[4]); + +// Like `pl_frame_clear_rgba` but without an alpha channel. +static inline void pl_frame_clear(pl_gpu gpu, const struct pl_frame *frame, + const float clear_color[3]) +{ + const float clear_color_rgba[4] = { clear_color[0], clear_color[1], clear_color[2], 1.0 }; + pl_frame_clear_rgba(gpu, frame, clear_color_rgba); +} + +// Helper functions to return the fixed/inferred pl_frame parameters used +// for rendering internally. Mutates `image` and `target` in-place to hold +// the modified values, which are what will actually be used for rendering. +// +// This currently includes: +// - Defaulting all missing pl_color_space/repr parameters +// - Coalescing all rotation to the target +// - Rounding and clamping the target crop to pixel boundaries and adjusting the +// image crop correspondingly +// +// Note: This is idempotent and does not generally alter the effects of a +// subsequent `pl_render_image` on the same pl_frame pair. 
(But see the +// following warning) +// +// Warning: This does *not* call pl_frame.acquire/release, and so the returned +// metadata *may* be incorrect if the acquire callback mutates the pl_frame in +// nontrivial ways, in particular the crop and color space fields. +PL_API void pl_frames_infer(pl_renderer rr, struct pl_frame *image, + struct pl_frame *target); + + +// Render a single image to a target using the given parameters. This is +// fully dynamic, i.e. the params can change at any time. libplacebo will +// internally detect and flush whatever caches are invalidated as a result of +// changing colorspace, size etc. +// +// Required plane capabilities: +// - Planes in `image` must be `sampleable` +// - Planes in `target` must be `renderable` +// +// Recommended plane capabilities: (Optional, but good for performance) +// - Planes in `image` should have `sample_mode` PL_TEX_SAMPLE_LINEAR +// - Planes in `target` should be `storable` +// - Planes in `target` should have `blit_dst` +// +// Note on lifetime: Once this call returns, the passed structures may be +// freely overwritten or discarded by the caller, even the referenced +// `pl_tex` objects may be freely reused. +// +// Note: `image` may be NULL, in which case `target.overlays` will still be +// rendered, but nothing else. +PL_API bool pl_render_image(pl_renderer rr, const struct pl_frame *image, + const struct pl_frame *target, + const struct pl_render_params *params); + +// Flushes the internal state of this renderer. This is normally not needed, +// even if the image parameters, colorspace or target configuration change, +// since libplacebo will internally detect such circumstances and recreate +// outdated resources automatically. Doing this explicitly *may* be useful to +// purge some state related to things like HDR peak detection or frame mixing, +// so calling it is a good idea if the content source is expected to change +// dramatically (e.g. when switching to a different file). +PL_API void pl_renderer_flush_cache(pl_renderer rr); + +// Mirrors `pl_get_detected_hdr_metadata`, giving you the current internal peak +// detection HDR metadata (when peak detection is active). Returns false if no +// information is available (e.g. not HDR source, peak detection disabled). +PL_API bool pl_renderer_get_hdr_metadata(pl_renderer rr, + struct pl_hdr_metadata *metadata); + +// Represents a mixture of input frames, distributed temporally. +// +// NOTE: Frames must be sorted by timestamp, i.e. `timestamps` must be +// monotonically increasing. +struct pl_frame_mix { + // The number of frames in this mixture. The number of frames should be + // sufficient to meet the needs of the configured frame mixer. See the + // section below for more information. + // + // If the number of frames is 0, this call will be equivalent to + // `pl_render_image` with `image == NULL`. + int num_frames; + + // A list of the frames themselves. The frames can have different + // colorspaces, configurations of planes, or even sizes. + // + // Note: This is a list of pointers, to avoid users having to copy + // around `pl_frame` structs when re-organizing this array. + const struct pl_frame **frames; + + // A list of unique signatures, one for each frame. These are used to + // identify frames across calls to this function, so it's crucial that they + // be both unique per-frame but also stable across invocations of + // `pl_render_frame_mix`. + const uint64_t *signatures; + + // A list of relative timestamps for each frame. 
These are relative to the + // time of the vsync being drawn, i.e. this function will render the frame + // that will be made visible at timestamp 0.0. The values are expected to + // be normalized such that a separation of 1.0 corresponds to roughly one + // nominal source frame duration. So a constant framerate video file will + // always have timestamps like e.g. {-2.3, -1.3, -0.3, 0.7, 1.7, 2.7}, + // using an example radius of 3. + // + // In cases where the framerate is variable (e.g. VFR video), the choice of + // what to scale to use can be difficult to answer. A typical choice would + // be either to use the canonical (container-tagged) framerate, or the + // highest momentary framerate, as a reference. If all else fails, you + // could also use the display's framerate. + // + // Note: This function assumes zero-order-hold semantics, i.e. the frame at + // timestamp 0.7 is intended to remain visible until timestamp 1.7, when + // the next frame replaces it. + const float *timestamps; + + // The duration for which the vsync being drawn will be held, using the + // same scale as `timestamps`. If the display has an unknown or variable + // frame-rate (e.g. Adaptive Sync), then you're probably better off not + // using this function and instead just painting the frames directly using + // `pl_render_frame` at the correct PTS. + // + // As an example, if `vsync_duration` is 0.4, then it's assumed that the + // vsync being painted is visible for the period [0.0, 0.4]. + float vsync_duration; + + // Explanation of the frame mixing radius: The algorithm chosen in + // `pl_render_params.frame_mixer` has a canonical radius equal to + // `pl_filter_config.kernel->radius`. This means that the frame mixing + // algorithm will (only) need to consult all of the frames that have a + // distance within the interval [-radius, radius]. As such, the user should + // include all such frames in `frames`, but may prune or omit frames that + // lie outside it. + // + // The built-in frame mixing (`pl_render_params.frame_mixer == NULL`) has + // no concept of radius, it just always needs access to the "current" and + // "next" frames. +}; + +// Helper function to calculate the base frame mixing radius. +// +// Note: When the source FPS exceeds the display FPS, this radius must be +// increased by the corresponding ratio. +static inline float pl_frame_mix_radius(const struct pl_render_params *params) +{ + // For backwards compatibility, allow !frame_mixer->kernel + if (!params->frame_mixer || !params->frame_mixer->kernel) + return 0.0; + + return params->frame_mixer->kernel->radius; +} + +// Find closest frame to current PTS by zero-order hold semantics, or NULL. +PL_API const struct pl_frame *pl_frame_mix_current(const struct pl_frame_mix *mix); + +// Find closest frame to current PTS by nearest neighbour semantics, or NULL. +PL_API const struct pl_frame *pl_frame_mix_nearest(const struct pl_frame_mix *mix); + +// Render a mixture of images to the target using the given parameters. This +// functions much like a generalization of `pl_render_image`, for when the API +// user has more control over the frame queue / vsync loop, and can provide a +// few frames from the past and future + timestamp information. +// +// This allows libplacebo to perform rudimentary frame mixing / interpolation, +// in order to eliminate judder artifacts typically associated with +// source/display frame rate mismatch. 
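+//
+// Example (illustrative sketch) of a single vsync iteration, with `mix` having
+// been filled in from some frame queue (e.g. a `pl_queue`):
+//
+//   struct pl_swapchain_frame swframe;
+//   if (pl_swapchain_start_frame(sw, &swframe)) {
+//       struct pl_frame target;
+//       pl_frame_from_swapchain(&target, &swframe);
+//       pl_render_image_mix(rr, &mix, &target, &pl_render_default_params);
+//       pl_swapchain_submit_frame(sw);
+//       pl_swapchain_swap_buffers(sw);
+//   }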
+PL_API bool pl_render_image_mix(pl_renderer rr, const struct pl_frame_mix *images, + const struct pl_frame *target, + const struct pl_render_params *params); + +// Analog of `pl_frame_infer` corresponding to `pl_render_image_mix`. This +// function will *not* mutate the frames contained in `mix`, and instead +// return an adjusted copy of the "reference" frame for that image mix in +// `out_refimage`, or {0} if the mix is empty. +PL_API void pl_frames_infer_mix(pl_renderer rr, const struct pl_frame_mix *mix, + struct pl_frame *target, struct pl_frame *out_ref); + +// Backwards compatibility with old filters API, may be deprecated. +// Redundant with pl_filter_configs and masking `allowed` for +// PL_FILTER_SCALING and PL_FILTER_FRAME_MIXING respectively. + +// A list of recommended frame mixer presets, terminated by {0} +PL_API extern const struct pl_filter_preset pl_frame_mixers[]; +PL_API extern const int pl_num_frame_mixers; // excluding trailing {0} + +// A list of recommended scaler presets, terminated by {0}. This is almost +// equivalent to `pl_filter_presets` with the exception of including extra +// built-in filters that don't map to the `pl_filter` architecture. +PL_API extern const struct pl_filter_preset pl_scale_filters[]; +PL_API extern const int pl_num_scale_filters; // excluding trailing {0} + +// Deprecated in favor of `pl_cache_save/pl_cache_load` on the `pl_cache` +// associated with the `pl_gpu` this renderer is using. +PL_DEPRECATED PL_API size_t pl_renderer_save(pl_renderer rr, uint8_t *out_cache); +PL_DEPRECATED PL_API void pl_renderer_load(pl_renderer rr, const uint8_t *cache); + +PL_API_END + +#endif // LIBPLACEBO_RENDERER_H_ diff --git a/src/include/libplacebo/shaders.h b/src/include/libplacebo/shaders.h new file mode 100644 index 0000000..b8046be --- /dev/null +++ b/src/include/libplacebo/shaders.h @@ -0,0 +1,273 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_H_ +#define LIBPLACEBO_SHADERS_H_ + +// This function defines the "direct" interface to libplacebo's GLSL shaders, +// suitable for use in contexts where the user controls GLSL shader compilation +// but wishes to include functions generated by libplacebo as part of their +// own rendering process. This API is normally not used for operation with +// libplacebo's higher-level constructs such as `pl_dispatch` or `pl_renderer`. + +#include <libplacebo/gpu.h> + +PL_API_BEGIN + +// Thread-safety: Unsafe +typedef struct pl_shader_t *pl_shader; + +struct pl_shader_params { + // The `id` represents an abstract identifier for the shader, to avoid + // collisions with other shaders being used as part of the same larger, + // overarching shader. This is relevant for users which want to combine + // multiple `pl_shader` objects together, in which case all `pl_shader` + // objects should have a unique `id`. 
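+    //
+    // Example (illustrative): two shader objects combined by the same caller,
+    // given distinct identifiers via the `pl_shader_params` helper defined
+    // below:
+    //
+    //   pl_shader sh1 = pl_shader_alloc(log, pl_shader_params( .gpu = gpu, .id = 1 ));
+    //   pl_shader sh2 = pl_shader_alloc(log, pl_shader_params( .gpu = gpu, .id = 2 ));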
+ uint8_t id; + + // If `gpu` is non-NULL, then this `gpu` will be used to create objects + // such as textures and buffers, or check for required capabilities, for + // operations which depend on either of those. This is fully optional, i.e. + // these GLSL primitives are designed to be used without a dependency on + // `gpu` wherever possible - however, some features may not work, and will + // be disabled even if requested. + pl_gpu gpu; + + // The `index` represents an abstract frame index, which shaders may use + // internally to do things like temporal dithering or seeding PRNGs. If the + // user does not care about temporal dithering/debanding, or wants + // deterministic rendering, this may safely be left as 0. Otherwise, it + // should be incremented by 1 on successive frames. + uint8_t index; + + // If `glsl.version` is nonzero, then this structure will be used to + // determine the effective GLSL mode and capabilities. If `gpu` is also + // set, then this overrides `gpu->glsl`. + struct pl_glsl_version glsl; + + // If this is true, all constants in the shader will be replaced by + // dynamic variables. This is mainly useful to avoid recompilation for + // shaders which expect to have their values change constantly. + bool dynamic_constants; +}; + +#define pl_shader_params(...) (&(struct pl_shader_params) { __VA_ARGS__ }) + +// Creates a new, blank, mutable pl_shader object. +// +// Note: Rather than allocating and destroying many shaders, users are +// encouraged to reuse them (using `pl_shader_reset`) for efficiency. +PL_API pl_shader pl_shader_alloc(pl_log log, const struct pl_shader_params *params); + +// Frees a pl_shader and all resources associated with it. +PL_API void pl_shader_free(pl_shader *sh); + +// Resets a pl_shader to a blank slate, without releasing internal memory. +// If you're going to be re-generating shaders often, this function will let +// you skip the re-allocation overhead. +PL_API void pl_shader_reset(pl_shader sh, const struct pl_shader_params *params); + +// Returns whether or not a shader is in a "failed" state. Trying to modify a +// shader in illegal ways (e.g. signature mismatch) will result in the shader +// being marked as "failed". Since most pl_shader_ operations have a void +// return type, the user can use this function to figure out whether a specific +// shader operation has failed or not. This function is somewhat redundant +// since `pl_shader_finalize` will also return NULL in this case. +PL_API bool pl_shader_is_failed(const pl_shader sh); + +// Returns whether or not a pl_shader needs to be run as a compute shader. This +// will never be the case unless the `pl_glsl_version` this `pl_shader` was +// created using has `compute` support enabled. +PL_API bool pl_shader_is_compute(const pl_shader sh); + +// Returns whether or not the shader has any particular output size +// requirements. Some shaders, in particular those that sample from other +// textures, have specific output size requirements which need to be respected +// by the caller. If this is false, then the shader is compatible with every +// output size. If true, the size requirements are stored into *w and *h. +PL_API bool pl_shader_output_size(const pl_shader sh, int *w, int *h); + +// Indicates the type of signature that is associated with a shader result. +// Every shader result defines a function that may be called by the user, and +// this enum indicates the type of value that this function takes and/or +// returns. 
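// For illustration only (a rough sketch, not an exhaustive description): a
// fragment whose input is PL_SHADER_SIG_NONE and whose output is
// PL_SHADER_SIG_COLOR roughly corresponds to a generated GLSL function of
// the form
//
//   vec4 some_generated_name();            // produces a color from nothing
//
// whereas PL_SHADER_SIG_COLOR as the input signature corresponds to
// something like
//
//   vec4 some_generated_name(vec4 color);  // transforms an existing color
//
// The exact name and parameter list must always be taken from the finalized
// `pl_shader_res` (`name`, `input`, `output`), described further below.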
+// +// Which signature a shader ends up with depends on the type of operation being +// performed by a shader fragment, as determined by the user's calls. See below +// for more information. +enum pl_shader_sig { + PL_SHADER_SIG_NONE = 0, // no input / void output + PL_SHADER_SIG_COLOR, // vec4 color (normalized so that 1.0 is the ref white) + + // The following are only valid as input signatures: + PL_SHADER_SIG_SAMPLER, // (gsampler* src_tex, vecN tex_coord) pair, + // specifics depend on how the shader was generated +}; + +// Structure encapsulating information about a shader. This is internally +// refcounted, to allow moving it around without having to create deep copies. +typedef const struct pl_shader_info_t { + // A copy of the parameters used to create the shader. + struct pl_shader_params params; + + // A list of friendly names for the semantic operations being performed by + // this shader, e.g. "color decoding" or "debanding". + const char **steps; + int num_steps; + + // As a convenience, this contains a pretty-printed version of the + // above list, with entries tallied and separated by commas + const char *description; +} *pl_shader_info; + +PL_API pl_shader_info pl_shader_info_ref(pl_shader_info info); +PL_API void pl_shader_info_deref(pl_shader_info *info); + +// Represents a finalized shader fragment. This is not a complete shader, but a +// collection of raw shader text together with description of the input +// attributes, variables and vertices it expects to be available. +struct pl_shader_res { + // Descriptive information about the shader. Note that this reference is + // attached to the shader itself - the user does not need to manually ref + // or deref `info` unless they wish to move it elsewhere. + pl_shader_info info; + + // The shader text, as literal GLSL. This will always be a function + // definition, such that the the function with the indicated name and + // signature may be called by the user. + const char *glsl; + const char *name; + enum pl_shader_sig input; // what the function expects + enum pl_shader_sig output; // what the function returns + + // For compute shaders (pl_shader_is_compute), this indicates the requested + // work group size. Otherwise, both fields are 0. The interpretation of + // these work groups is that they're tiled across the output image. + int compute_group_size[2]; + + // If this pass is a compute shader, this field indicates the shared memory + // size requirements for this shader pass. + size_t compute_shmem; + + // A set of input vertex attributes needed by this shader fragment. + const struct pl_shader_va *vertex_attribs; + int num_vertex_attribs; + + // A set of input variables needed by this shader fragment. + const struct pl_shader_var *variables; + int num_variables; + + // A list of input descriptors needed by this shader fragment, + const struct pl_shader_desc *descriptors; + int num_descriptors; + + // A list of compile-time constants used by this shader fragment. + const struct pl_shader_const *constants; + int num_constants; + + // --- Deprecated fields (see `info`) + struct pl_shader_params params PL_DEPRECATED; + const char **steps PL_DEPRECATED; + int num_steps PL_DEPRECATED; + const char *description PL_DEPRECATED; +}; + +// Represents a vertex attribute. 
The four values will be bound to the four +// corner vertices respectively, in row-wise order starting from the top left: +// data[0] data[1] +// data[2] data[3] +struct pl_shader_va { + struct pl_vertex_attrib attr; // VA type, excluding `offset` and `location` + const void *data[4]; +}; + +// Represents a bound shared variable / descriptor +struct pl_shader_var { + struct pl_var var; // the underlying variable description + const void *data; // the raw data (as per `pl_var_host_layout`) + bool dynamic; // if true, the value is expected to change frequently +}; + +struct pl_buffer_var { + struct pl_var var; + struct pl_var_layout layout; +}; + +typedef uint16_t pl_memory_qualifiers; +enum { + PL_MEMORY_COHERENT = 1 << 0, // supports synchronization across shader invocations + PL_MEMORY_VOLATILE = 1 << 1, // all writes are synchronized automatically + + // Note: All descriptors are also implicitly assumed to have the 'restrict' + // memory qualifier. There is currently no way to override this behavior. +}; + +struct pl_shader_desc { + struct pl_desc desc; // descriptor type, excluding `int binding` + struct pl_desc_binding binding; // contents of the descriptor binding + + // For PL_DESC_BUF_UNIFORM/STORAGE, this specifies the layout of the + // variables contained by a buffer. Ignored for the other descriptor types + struct pl_buffer_var *buffer_vars; + int num_buffer_vars; + + // For storage images and buffers, this specifies additional memory + // qualifiers on the descriptor. It's highly recommended to always use + // at least PL_MEMORY_RESTRICT. Ignored for other descriptor types. + pl_memory_qualifiers memory; +}; + +// Represents a compile-time constant. This can be lowered to a specialization +// constant to support cheaper recompilations. +struct pl_shader_const { + enum pl_var_type type; + const char *name; + const void *data; + + // If true, this constant *must* be a compile-time constant, which + // basically just overrides `pl_shader_params.dynamic_constants`. Useful + // for constants which will serve as inputs to e.g. array sizes. + bool compile_time; +}; + +// Finalize a pl_shader. It is no longer mutable at this point, and any further +// attempts to modify it result in an error. (Functions which take a `const +// pl_shader` argument do not modify the shader and may be freely +// called on an already-finalized shader) +// +// The returned pl_shader_res is bound to the lifetime of the pl_shader - and +// will only remain valid until the pl_shader is freed or reset. This function +// may be called multiple times, and will produce the same result each time. +// +// This function will return NULL if the shader is considered to be in a +// "failed" state (see pl_shader_is_failed). +PL_API const struct pl_shader_res *pl_shader_finalize(pl_shader sh); + +// Shader objects represent abstract resources that shaders need to manage in +// order to ensure their operation. This could include shader storage buffers, +// generated lookup textures, or other sorts of configured state. The body +// of a shader object is fully opaque; but the user is in charge of cleaning up +// after them and passing them to the right shader passes. +// +// Note: pl_shader_obj objects must be initialized to NULL by the caller. 
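// A minimal sketch of the finalization flow described above. `log` and `gpu`
// are assumed to already exist; the shader-generating step stands in for any
// of the pl_shader_* entry points declared in the other shader headers.
//
//   pl_shader sh = pl_shader_alloc(log, pl_shader_params( .gpu = gpu ));
//   pl_shader_obj state = NULL;    // for operations that need a state object
//
//   // ... record one or more shader operations into `sh` here ...
//
//   const struct pl_shader_res *res = pl_shader_finalize(sh);
//   if (!res) {
//       // shader ended up in a "failed" state, see pl_shader_is_failed()
//   } else {
//       // res->glsl defines the function `res->name`, with the signature
//       // given by res->input / res->output; the caller must bind
//       // res->variables, res->descriptors, res->vertex_attribs, etc.
//   }
//
//   pl_shader_reset(sh, pl_shader_params( .gpu = gpu ));   // reuse, or:
//   pl_shader_free(&sh);
//   pl_shader_obj_destroy(&state);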
+typedef struct pl_shader_obj_t *pl_shader_obj; + +PL_API void pl_shader_obj_destroy(pl_shader_obj *obj); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_H_ diff --git a/src/include/libplacebo/shaders/colorspace.h b/src/include/libplacebo/shaders/colorspace.h new file mode 100644 index 0000000..ead0958 --- /dev/null +++ b/src/include/libplacebo/shaders/colorspace.h @@ -0,0 +1,381 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_COLORSPACE_H_ +#define LIBPLACEBO_SHADERS_COLORSPACE_H_ + +// Color space transformation shaders. These all input and output a color +// value (PL_SHADER_SIG_COLOR). + +#include <libplacebo/colorspace.h> +#include <libplacebo/gamut_mapping.h> +#include <libplacebo/tone_mapping.h> +#include <libplacebo/shaders.h> + +// For backwards compatibility +#include <libplacebo/shaders/dithering.h> + +PL_API_BEGIN + +// Transform the input color, in its given representation, to ensure +// compatibility with the indicated alpha mode. Mutates `repr` to reflect the +// change. Note that this is a no-op if the input is PL_ALPHA_UNKNOWN. +PL_API void pl_shader_set_alpha(pl_shader sh, struct pl_color_repr *repr, + enum pl_alpha_mode mode); + +// Colorspace reshaping for PL_COLOR_SYSTEM_DOLBYVISION. Note that this is done +// automatically by `pl_shader_decode_color` for PL_COLOR_SYSTEM_DOLBYVISION. +PL_API void pl_shader_dovi_reshape(pl_shader sh, const struct pl_dovi_metadata *data); + +// Decode the color into normalized RGB, given a specified color_repr. This +// also takes care of additional pre- and post-conversions requires for the +// "special" color systems (XYZ, BT.2020-C, etc.). If `params` is left as NULL, +// it defaults to &pl_color_adjustment_neutral. +// +// Note: This function always returns PC-range RGB with independent alpha. +// It mutates the pl_color_repr to reflect the change. +// +// Note: For DCDM XYZ decoding output is linear +PL_API void pl_shader_decode_color(pl_shader sh, struct pl_color_repr *repr, + const struct pl_color_adjustment *params); + +// Encodes a color from normalized, PC-range, independent alpha RGB into a +// given representation. That is, this performs the inverse operation of +// `pl_shader_decode_color` (sans color adjustments). +// +// Note: For DCDM XYZ encoding input is expected to be linear +PL_API void pl_shader_encode_color(pl_shader sh, const struct pl_color_repr *repr); + +// Linearize (expand) `vec4 color`, given a specified color space. In essence, +// this corresponds to the ITU-R EOTF. +// +// Note: Unlike the ITU-R EOTF, it never includes the OOTF - even for systems +// where the EOTF includes the OOTF (such as HLG). +PL_API void pl_shader_linearize(pl_shader sh, const struct pl_color_space *csp); + +// Delinearize (compress), given a color space as output. 
This loosely +// corresponds to the inverse EOTF (not the OETF) in ITU-R terminology, again +// assuming a reference monitor. +PL_API void pl_shader_delinearize(pl_shader sh, const struct pl_color_space *csp); + +struct pl_sigmoid_params { + // The center (bias) of the sigmoid curve. Must be between 0.0 and 1.0. + // If left as 0.0, defaults to 0.75. + float center; + + // The slope (steepness) of the sigmoid curve. Must be between 1.0 and 20.0. + // If left as 0.0, defaults to 6.5. + float slope; +}; + +#define PL_SIGMOID_DEFAULTS \ + .center = 0.75, \ + .slope = 6.50, + +#define pl_sigmoid_params(...) (&(struct pl_sigmoid_params) { PL_SIGMOID_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_sigmoid_params pl_sigmoid_default_params; + +// Applies a sigmoidal color transform to all channels. This helps avoid +// ringing artifacts during upscaling by bringing the color information closer +// to neutral and away from the extremes. If `params` is NULL, it defaults to +// &pl_sigmoid_default_params. +// +// Warning: This function clamps the input to the interval [0,1]; as such +// it should *NOT* be used on already-decoded high-dynamic range content. +PL_API void pl_shader_sigmoidize(pl_shader sh, const struct pl_sigmoid_params *params); + +// This performs the inverse operation to `pl_shader_sigmoidize`. +PL_API void pl_shader_unsigmoidize(pl_shader sh, const struct pl_sigmoid_params *params); + +struct pl_peak_detect_params { + // Smoothing coefficient for the detected values. This controls the time + // parameter (tau) of an IIR low pass filter. In other words, it represents + // the cutoff period (= 1 / cutoff frequency) in frames. Fluctuations with a + // period shorter than this will be suppressed. This helps block out annoying + // "sparkling" or "flickering" due to small variations in frame-to-frame + // brightness. If left as 0.0, this smoothing is completely disabled. + float smoothing_period; + + // In order to avoid reacting sluggishly to scene changes as a result of + // the low-pass filter, we disable it when the difference between the + // current frame brightness and the average frame brightness exceeds a + // given threshold difference. But rather than a single hard cutoff, which + // would lead to weird discontinuities on fades, we gradually disable it + // over a small window of brightness ranges. These parameters control the + // lower and upper bounds of this window, in units of 1% PQ. + // + // Setting either one of these to 0.0 disables this logic. + float scene_threshold_low; + float scene_threshold_high; + + // Which percentile of the input image brightness histogram to consider as + // the true peak of the scene. If this is set to 100 (or 0), the brightest + // pixel is measured. Otherwise, the top of the frequency distribution is + // progressively cut off. Setting this too low will cause clipping of very + // bright details, but can improve the dynamic brightness range of scenes + // with very bright isolated highlights. + // + // A recommended value is 99.995%, which is very conservative and should + // cause no major issues in typical content. + float percentile; + + // Allows the peak detection result to be delayed by up to a single frame, + // which can sometimes improve throughput, at the cost of introducing the + // possibility of 1-frame flickers on transitions. Disabled by default.
+ bool allow_delayed; + + // --- Deprecated / removed fields + float overshoot_margin PL_DEPRECATED; + float minimum_peak PL_DEPRECATED; +}; + +#define PL_PEAK_DETECT_DEFAULTS \ + .smoothing_period = 20.0f, \ + .scene_threshold_low = 1.0f, \ + .scene_threshold_high = 3.0f, \ + .percentile = 100.0f, + +#define PL_PEAK_DETECT_HQ_DEFAULTS \ + PL_PEAK_DETECT_DEFAULTS \ + .percentile = 99.995f, + +#define pl_peak_detect_params(...) (&(struct pl_peak_detect_params) { PL_PEAK_DETECT_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_peak_detect_params pl_peak_detect_default_params; +PL_API extern const struct pl_peak_detect_params pl_peak_detect_high_quality_params; + +// This function can be used to measure the CLL and FALL of a video +// source automatically, using a compute shader. The measured values are +// smoothed automatically (depending on the parameters), so to keep track of +// the measured results over time, a tone mapping shader state object is used +// to hold the state. Returns false on failure initializing the tone mapping +// object, or if compute shaders are not supported. +// +// It's important that the same shader object is used for successive frames +// belonging to the same source. If the source changes (e.g. due to a file +// change or seek), the user should reset it with `pl_reset_detected_peak` (or +// destroy it and use a new state object). +// +// The parameter `csp` holds the representation of the color values that are +// the input to this function. (They must already be in decoded RGB form, i.e. +// alternate color representations are not supported) +PL_API bool pl_shader_detect_peak(pl_shader sh, struct pl_color_space csp, + pl_shader_obj *state, + const struct pl_peak_detect_params *params); + +// After dispatching the above shader, this function can be used to retrieve +// the detected dynamic HDR10+ metadata parameters. The other fields of +// `metadata` are not written to. Returns whether or not any values were +// written. If not, the values are left untouched, so this can be used to +// safely update `pl_hdr_metadata` values in-place. This function may or may +// not block, depending on the previous setting of `allow_delayed`. +PL_API bool pl_get_detected_hdr_metadata(const pl_shader_obj state, + struct pl_hdr_metadata *metadata); + +// After dispatching the above shader, this function *may* be used to read out +// the detected CLL and FALL directly (in PL_HDR_NORM units). If the shader +// has never been dispatched yet, i.e. no information is available, this will +// return false. +// +// Deprecated in favor of `pl_get_detected_hdr_metadata` +PL_DEPRECATED PL_API bool pl_get_detected_peak(const pl_shader_obj state, + float *out_cll, float *out_fall); + +// Resets the peak detection state in a given tone mapping state object. This +// is not equal to `pl_shader_obj_destroy`, because it does not destroy any +// state used by `pl_shader_tone_map`. +PL_API void pl_reset_detected_peak(pl_shader_obj state); + +// Feature map extraction (for pl_color_map_args.feature_map). The result +// of this shader should be downscaled / low-passed to the indicated kernel +// size before use. (This does not happen automatically) +PL_API void pl_shader_extract_features(pl_shader sh, struct pl_color_space csp); + +// Deprecated and unused. Libplacebo now always performs a variant of the old +// hybrid tone-mapping, mixing together the intensity (I) and per-channel (LMS) +// results. 
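// A minimal sketch of the peak detection flow described above, assuming `sh`
// samples already-decoded RGB in the HDR color space `csp` and that compute
// shaders are available.
//
//   static pl_shader_obj peak_state = NULL;   // kept across frames
//
//   if (pl_shader_detect_peak(sh, csp, &peak_state,
//                             &pl_peak_detect_high_quality_params))
//   {
//       // ... dispatch the shader, then, on a later frame: ...
//       struct pl_hdr_metadata hdr = {0};
//       if (pl_get_detected_hdr_metadata(peak_state, &hdr)) {
//           // feed the measured brightness into tone mapping
//       }
//   }
//
//   pl_reset_detected_peak(peak_state);       // on seeks / source changes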
+enum pl_tone_map_mode { + PL_TONE_MAP_AUTO PL_DEPRECATED_ENUMERATOR, + PL_TONE_MAP_RGB PL_DEPRECATED_ENUMERATOR, + PL_TONE_MAP_MAX PL_DEPRECATED_ENUMERATOR, + PL_TONE_MAP_HYBRID PL_DEPRECATED_ENUMERATOR, + PL_TONE_MAP_LUMA PL_DEPRECATED_ENUMERATOR, + PL_TONE_MAP_MODE_COUNT, +}; + +// Deprecated by <libplacebo/gamut_mapping.h> +enum pl_gamut_mode { + PL_GAMUT_CLIP PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_clip + PL_GAMUT_WARN PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_highlight + PL_GAMUT_DARKEN PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_darken + PL_GAMUT_DESATURATE PL_DEPRECATED_ENUMERATOR, // pl_gamut_map_desaturate + PL_GAMUT_MODE_COUNT, +}; + +struct pl_color_map_params { + // --- Gamut mapping options + + // Gamut mapping function to use to handle out-of-gamut colors, including + // colors which are out-of-gamut as a consequence of tone mapping. + const struct pl_gamut_map_function *gamut_mapping; + + // Gamut mapping constants, for expert tuning. Leave as default otherwise. + struct pl_gamut_map_constants gamut_constants; + + // Gamut mapping 3DLUT size, for channels ICh. Defaults to {48, 32, 256} + int lut3d_size[3]; + + // Use higher quality, but slower, tricubic interpolation for gamut mapping + // 3DLUTs. May substantially improve the 3DLUT gamut mapping accuracy, in + // particular at smaller 3DLUT sizes. Shouldn't have much effect at the + // default size. + bool lut3d_tricubic; + + // If true, allows the gamut mapping function to expand the gamut, in + // cases where the target gamut exceeds that of the source. If false, + // the source gamut will never be enlarged, even when using a gamut + // mapping function capable of bidirectional mapping. + bool gamut_expansion; + + // --- Tone mapping options + + // Tone mapping function to use to handle out-of-range colors. + const struct pl_tone_map_function *tone_mapping_function; + + // Tone mapping constants, for expert tuning. Leave as default otherwise. + struct pl_tone_map_constants tone_constants; + + // If true, and supported by the given tone mapping function, libplacebo + // will perform inverse tone mapping to expand the dynamic range of a + // signal. libplacebo is not liable for any HDR-induced eye damage. + bool inverse_tone_mapping; + + // Data source to use when tone-mapping. Setting this to a specific + // value allows overriding the default metadata preference logic. + enum pl_hdr_metadata_type metadata; + + // Tone mapping LUT size. Defaults to 256. + int lut_size; + + // HDR contrast recovery strength. If set to a value above 0.0, the source + // image will be divided into high-frequency and low-frequency components, + // and a portion of the high-frequency image is added back onto the + // tone-mapped output. May cause excessive ringing artifacts for some HDR + // sources, but can improve the subjective sharpness and detail left over + // in the image after tone-mapping. + float contrast_recovery; + + // Contrast recovery lowpass kernel size. Defaults to 3.5. Increasing + // or decreasing this will affect the visual appearance substantially. + float contrast_smoothness; + + // --- Debugging options + + // Force the use of a full tone-mapping LUT even for functions that have + // faster pure GLSL replacements (e.g. clip, linear, saturation). + bool force_tone_mapping_lut; + + // Visualize the tone-mapping LUT and gamut mapping 3DLUT, in IPT space. + bool visualize_lut; + + // Controls where to draw the visualization, relative to the rendered + // video (dimensions 0-1). Optional, defaults to the full picture. 
+ pl_rect2df visualize_rect; + + // Controls the rotation of the 3DLUT visualization. + float visualize_hue; // useful range [-pi, pi] + float visualize_theta; // useful range [0, pi/2] + + // Graphically highlight hard-clipped pixels during tone-mapping (i.e. + // pixels that exceed the claimed source luminance range). + bool show_clipping; + + // --- Deprecated fields + enum pl_tone_map_mode tone_mapping_mode PL_DEPRECATED; // removed + float tone_mapping_param PL_DEPRECATED; // see `tone_constants` + float tone_mapping_crosstalk PL_DEPRECATED; // now hard-coded as 0.04 + enum pl_rendering_intent intent PL_DEPRECATED; // see `gamut_mapping` + enum pl_gamut_mode gamut_mode PL_DEPRECATED; // see `gamut_mapping` + float hybrid_mix PL_DEPRECATED; // removed +}; + +#define PL_COLOR_MAP_DEFAULTS \ + .gamut_mapping = &pl_gamut_map_perceptual, \ + .tone_mapping_function = &pl_tone_map_spline, \ + .gamut_constants = { PL_GAMUT_MAP_CONSTANTS }, \ + .tone_constants = { PL_TONE_MAP_CONSTANTS }, \ + .metadata = PL_HDR_METADATA_ANY, \ + .lut3d_size = {48, 32, 256}, \ + .lut_size = 256, \ + .visualize_rect = {0, 0, 1, 1}, \ + .contrast_smoothness = 3.5f, + +#define PL_COLOR_MAP_HQ_DEFAULTS \ + PL_COLOR_MAP_DEFAULTS \ + .contrast_recovery = 0.30f, + +#define pl_color_map_params(...) (&(struct pl_color_map_params) { PL_COLOR_MAP_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_color_map_params pl_color_map_default_params; +PL_API extern const struct pl_color_map_params pl_color_map_high_quality_params; + +// Execution arguments for the `pl_shader_color_map_ex` call. Distinct from +// `pl_color_map_params` because it is filled by internally-provided execution +// metadata, instead of user-tunable aesthetic parameters. +struct pl_color_map_args { + // Input/output color space for the mapping. + struct pl_color_space src; + struct pl_color_space dst; + + // If true, the logic will assume the input has already been linearized by + // the caller (e.g. as part of a previous linear light scaling operation). + bool prelinearized; + + // Object to be used to store generated LUTs. Note that this is the same + // state object used by `pl_shader_detect_peak`, and if that function has + // been called on `state` prior to `pl_shader_color_map`, the detected + // values will be used to guide the tone mapping algorithm. If this is not + // provided, tone/gamut mapping are disabled. + pl_shader_obj *state; + + // Low-resolution intensity feature map, as generated by + // `pl_shader_extract_features`. Optional. No effect if + // `params->contrast_recovery` is disabled. + pl_tex feature_map; +}; + +#define pl_color_map_args(...) (&(struct pl_color_map_args) { __VA_ARGS__ }) + +// Maps `vec4 color` from one color space to another color space according +// to the parameters (described in greater depth above). If `params` is left +// as NULL, it defaults to `&pl_color_map_default_params` +PL_API void pl_shader_color_map_ex(pl_shader sh, + const struct pl_color_map_params *params, + const struct pl_color_map_args *args); + +// Backwards compatibility wrapper around `pl_shader_color_map_ex` +PL_API void pl_shader_color_map(pl_shader sh, const struct pl_color_map_params *params, + struct pl_color_space src, struct pl_color_space dst, + pl_shader_obj *state, bool prelinearized); + +// Applies a set of cone distortion parameters to `vec4 color` in a given color +// space. This can be used to simulate color blindness. See `pl_cone_params` +// for more information. 
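// A minimal sketch of the `pl_shader_color_map_ex` call described above,
// tone-mapping HDR input to an SDR target. `src_csp`, `dst_csp` and
// `tone_map_state` are assumed to be managed by the caller.
//
//   struct pl_color_map_params cparams = pl_color_map_default_params;
//   cparams.show_clipping = true;            // e.g. for debugging
//
//   pl_shader_color_map_ex(sh, &cparams, pl_color_map_args(
//       .src   = src_csp,                    // e.g. PQ / BT.2020 source
//       .dst   = dst_csp,                    // e.g. the display's SDR space
//       .state = &tone_map_state,            // pl_shader_obj, reused per frame
//   ));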
+PL_API void pl_shader_cone_distort(pl_shader sh, struct pl_color_space csp, + const struct pl_cone_params *params); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_COLORSPACE_H_ diff --git a/src/include/libplacebo/shaders/custom.h b/src/include/libplacebo/shaders/custom.h new file mode 100644 index 0000000..a4eec69 --- /dev/null +++ b/src/include/libplacebo/shaders/custom.h @@ -0,0 +1,341 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_CUSTOM_H_ +#define LIBPLACEBO_SHADERS_CUSTOM_H_ + +#include <stdlib.h> + +// Functions for writing custom shaders and hooking them into the `pl_renderer` +// pipeline, as well as compatibility functions for parsing shaders in mpv +// format. + +#include <libplacebo/shaders.h> +#include <libplacebo/dispatch.h> +#include <libplacebo/colorspace.h> + +PL_API_BEGIN + +// Parameters describing custom shader text to be embedded into a `pl_shader` +// object. All of the strings are optional and can be left as NULL, but without +// a `body` in particular, the shader will do nothing useful on its own. +struct pl_custom_shader { + // The prelude contains text such as extra #defines, #extension pragmas, + // or other parts of the shader that must be placed at the very + // beginning (before input layout declarations etc.) + // + // Note: #extension pragmas do not need to be emitted to enable support for + // resource types already attached to the shader (e.g. SSBOs), compute + // shaders, or GPU capabilities known to libplacebo (e.g. subgroups). + const char *prelude; + + // The header contains text such as helper function definitions, extra + // uniforms, shared memory variables or buffer descriptions. + const char *header; + + // A friendly name for the shader. (Optional) + const char *description; + + // The "primary" GLSL code. This will be effectively appended to the "main" + // function. It lives in an environment given by the `input` signature, and + // is expected to return results in a way given by the `output` signature. + // + // Note: In the case of PL_SHADER_SIG_COLOR, the output `vec4 color` is + // allocated by `pl_shader_custom`, the user merely needs to assign to it. + // + // Note: For ease of development it can be useful to have the main logic + // live inside a helper function defined as part of `header`, and specify + // the `body` as a single line that simply calls the helper function. + const char *body; + enum pl_shader_sig input; + enum pl_shader_sig output; + + // Extra descriptors, variables and vertex attributes to attach to the + // resulting `pl_shader_res`. + // + // Note: The names inside these will possibly be replaced by fresh + // identifiers internally, so users should avoid looking for exact string + // matches for the given names inside the `pl_shader_res`. 
+ const struct pl_shader_desc *descriptors; + int num_descriptors; + const struct pl_shader_var *variables; + int num_variables; + const struct pl_shader_va *vertex_attribs; + int num_vertex_attribs; + const struct pl_shader_const *constants; + int num_constants; + + // If true, this shader must be a compute shader. The desired workgroup + // size and shared memory usage can be optionally specified, or 0 if no + // specific work group size or shared memory size restrictions apply. + // + // See also: `pl_shader_res.compute_group_size` + bool compute; + size_t compute_shmem; + int compute_group_size[2]; + + // Fixes the output size requirements of the shader to exact dimensions. + // Optional, if left as 0, means the shader can be dispatched at any size. + int output_w; + int output_h; +}; + +// Append custom shader code, including extra descriptors and variables, to an +// existing `pl_shader` object. Returns whether successful. This function may +// fail in the event that e.g. the custom shader requires compute shaders on +// an unsupported GPU, or exceeds the GPU's shared memory capabilities. +PL_API bool pl_shader_custom(pl_shader sh, const struct pl_custom_shader *params); + +// Which "rendering stages" are available for user shader hooking purposes. +// Except where otherwise noted, all stages are "non-resizable", i.e. the +// shaders already have specific output size requirements. +enum pl_hook_stage { + // Hook stages for the untouched planes, as made available by the source. + // These are all resizable, i.e. there are no specific output stage + // requirements. + PL_HOOK_RGB_INPUT = 1 << 0, + PL_HOOK_LUMA_INPUT = 1 << 1, + PL_HOOK_CHROMA_INPUT = 1 << 2, + PL_HOOK_ALPHA_INPUT = 1 << 3, + PL_HOOK_XYZ_INPUT = 1 << 4, + + // Hook stages for the scaled/aligned planes + PL_HOOK_CHROMA_SCALED = 1 << 5, + PL_HOOK_ALPHA_SCALED = 1 << 6, + + PL_HOOK_NATIVE = 1 << 7, // Combined image in its native color space + PL_HOOK_RGB = 1 << 8, // After conversion to RGB (resizable) + PL_HOOK_LINEAR = 1 << 9, // After linearization but before scaling + PL_HOOK_SIGMOID = 1 << 10, // After sigmoidization + PL_HOOK_PRE_KERNEL = 1 << 11, // Immediately before the main scaler kernel + PL_HOOK_POST_KERNEL = 1 << 12, // Immediately after the main scaler kernel + PL_HOOK_SCALED = 1 << 13, // After scaling, before color management + PL_HOOK_PRE_OUTPUT = 1 << 14, // After color management, before blending/rotation + PL_HOOK_OUTPUT = 1 << 15, // After blending/rotation, before dithering +}; + +// Returns true if a given hook stage is resizable +static inline bool pl_hook_stage_resizable(enum pl_hook_stage stage) { + switch (stage) { + case PL_HOOK_RGB_INPUT: + case PL_HOOK_LUMA_INPUT: + case PL_HOOK_CHROMA_INPUT: + case PL_HOOK_ALPHA_INPUT: + case PL_HOOK_XYZ_INPUT: + case PL_HOOK_NATIVE: + case PL_HOOK_RGB: + return true; + + case PL_HOOK_CHROMA_SCALED: + case PL_HOOK_ALPHA_SCALED: + case PL_HOOK_LINEAR: + case PL_HOOK_SIGMOID: + case PL_HOOK_PRE_KERNEL: + case PL_HOOK_POST_KERNEL: + case PL_HOOK_SCALED: + case PL_HOOK_PRE_OUTPUT: + case PL_HOOK_OUTPUT: + return false; + } + + abort(); +} + +// The different forms of communicating image data between the renderer and +// the hooks +enum pl_hook_sig { + PL_HOOK_SIG_NONE, // No data is passed, no data is received/returned + PL_HOOK_SIG_COLOR, // `vec4 color` already pre-sampled in a `pl_shader` + PL_HOOK_SIG_TEX, // `pl_tex` containing the image data + PL_HOOK_SIG_COUNT, +}; + +struct pl_hook_params { + // GPU objects associated with the `pl_renderer`, which the 
user may + // use for their own purposes. + pl_gpu gpu; + pl_dispatch dispatch; + + // Helper function to fetch a new temporary texture, using renderer-backed + // storage. This is guaranteed to have sane image usage requirements and a + // 16-bit or floating point format. The user does not need to free/destroy + // this texture in any way. May return NULL. + pl_tex (*get_tex)(void *priv, int width, int height); + void *priv; + + // Which stage triggered the hook to run. + enum pl_hook_stage stage; + + // For `PL_HOOK_SIG_COLOR`, this contains the existing shader object with + // the color already pre-sampled into `vec4 color`. The user may modify + // this as much as they want, as long as they don't dispatch/finalize/reset + // it. + // + // Note that this shader might have specific output size requirements, + // depending on the exact shader stage hooked by the user, and may already + // be a compute shader. + pl_shader sh; + + // For `PL_HOOK_SIG_TEX`, this contains the texture that the user should + // sample from. + // + // Note: This texture object is owned by the renderer, and users must not + // modify its contents. It will not be touched for the duration of a frame, + // but the contents are lost in between frames. + pl_tex tex; + + // The effective current rectangle of the image we're rendering in this + // shader, i.e. the effective rect of the content we're interested in, + // as a crop of either `sh` or `tex` (depending on the signature). + // + // Note: This is still set even for `PL_HOOK_SIG_NONE`! + pl_rect2df rect; + + // The current effective colorspace and representation, of either the + // pre-sampled color (in `sh`), or the contents of `tex`, respectively. + // + // Note: This is still set even for `PL_HOOK_SIG_NONE`! + struct pl_color_repr repr; + struct pl_color_space color; + int components; + + // The representation and colorspace of the original image, for reference. + const struct pl_color_repr *orig_repr; + const struct pl_color_space *orig_color; + + // The (cropped) source and destination rectangles of the overall + // rendering. These are functionallty equivalent to `image.crop` and + // `target.crop`, respectively, but `src_rect` in particular may change as + // a result of previous hooks being executed. (e.g. prescalers) + pl_rect2df src_rect; + pl_rect2d dst_rect; +}; + +struct pl_hook_res { + // If true, the hook is assumed to have "failed" or errored in some way, + // and all other fields are ignored. + bool failed; + + // What type of output this hook is returning. + // Note: If this is `PL_HOOK_SIG_NONE`, all other fields are ignored. + enum pl_hook_sig output; + + // For `PL_HOOK_SIG_COLOR`, this *must* be set to a valid `pl_shader` + // object containing the sampled color value (i.e. with an output signature + // of `PL_SHADER_SIG_COLOR`), and *should* be allocated from the given + // `pl_dispatch` object. Ignored otherwise. + pl_shader sh; + + // For `PL_HOOK_SIG_TEX`, this *must* contain the texture object containing + // the result of rendering the hook. This *should* be a texture allocated + // using the given `get_tex` callback, to ensure the format and texture + // usage flags are compatible with what the renderer expects. + pl_tex tex; + + // For shaders that return some sort of output, this contains the + // new/altered versions of the existing "current texture" metadata. + struct pl_color_repr repr; + struct pl_color_space color; + int components; + + // This contains the new effective rect of the contents. 
This may be + // different from the original `rect` for resizable passes. Ignored for + // non-resizable passes. + pl_rect2df rect; +}; + +enum pl_hook_par_mode { + PL_HOOK_PAR_VARIABLE, // normal shader variable + PL_HOOK_PAR_DYNAMIC, // dynamic shader variable, e.g. per-frame changing + PL_HOOK_PAR_CONSTANT, // fixed at compile time (e.g. for array sizes), + // must be scalar (non-vector/matrix) + PL_HOOK_PAR_DEFINE, // defined in the preprocessor, must be `int` + PL_HOOK_PAR_MODE_COUNT, +}; + +typedef union pl_var_data { + int i; + unsigned u; + float f; +} pl_var_data; + +struct pl_hook_par { + // Name as used in the shader. + const char *name; + + // Type of this shader parameter, and how it's manifested in the shader. + enum pl_var_type type; + enum pl_hook_par_mode mode; + + // Human-readable explanation of this parameter. (Optional) + const char *description; + + // Mutable data pointer to current value of variable. + pl_var_data *data; + + // Default/initial value, and lower/upper bounds. + pl_var_data initial; + pl_var_data minimum; + pl_var_data maximum; + + // Human-readable names for the variants of an integer option. This array + // can be indexed directly by integer values, ranging from `minimum.i` to + // `maximum.i`. May be NULL, in which case options are unnamed. + const char * const *names; +}; + +// Struct describing a hook. +// +// Note: Users may freely create their own instances of this struct, there is +// nothing particularly special about `pl_mpv_user_shader_parse`. +struct pl_hook { + enum pl_hook_stage stages; // Which stages to hook on + enum pl_hook_sig input; // Which input signature this hook expects + void *priv; // Arbitrary user context + + // Custom tunable shader parameters exported by this hook. These may be + // updated at any time by the user, to influence the behavior of the hook. + // Contents are arbitrary and subject to the method of hook construction. + const struct pl_hook_par *parameters; + int num_parameters; + + // Called at the beginning of passes, to reset/initialize the hook. (Optional) + void (*reset)(void *priv); + + // The hook function itself. Called by the renderer at any of the indicated + // hook stages. See `pl_hook_res` for more info on the return values. + struct pl_hook_res (*hook)(void *priv, const struct pl_hook_params *params); + + // Unique signature identifying this hook, used to disable misbehaving hooks. + // All hooks with the same signature will be disabled, should they fail to + // execute during run-time. + uint64_t signature; +}; + +// Compatibility layer with `mpv` user shaders. See the mpv man page for more +// information on the format. Will return `NULL` if the shader fails parsing. +// +// The resulting `pl_hook` objects should be destroyed with the corresponding +// destructor when no longer needed. +PL_API const struct pl_hook * +pl_mpv_user_shader_parse(pl_gpu gpu, const char *shader_text, size_t shader_len); + +PL_API void pl_mpv_user_shader_destroy(const struct pl_hook **hook); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_CUSTOM_H_ diff --git a/src/include/libplacebo/shaders/deinterlacing.h b/src/include/libplacebo/shaders/deinterlacing.h new file mode 100644 index 0000000..40e74e8 --- /dev/null +++ b/src/include/libplacebo/shaders/deinterlacing.h @@ -0,0 +1,137 @@ + +/* + * This file is part of libplacebo, which is normally licensed under the terms + * of the LGPL v2.1+. 
However, this file (film_grain.h) is also available under + * the terms of the more permissive MIT license: + * + * Copyright (c) 2018-2019 Niklas Haas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef LIBPLACEBO_SHADERS_DEINTERLACING_H_ +#define LIBPLACEBO_SHADERS_DEINTERLACING_H_ + +#include <libplacebo/shaders.h> + +PL_API_BEGIN + +enum pl_field { + PL_FIELD_NONE = 0, // no deinterlacing + PL_FIELD_EVEN, // "top" fields, with even y coordinates + PL_FIELD_ODD, // "bottom" fields, with odd y coordinates + + // Convenience aliases + PL_FIELD_TOP = PL_FIELD_EVEN, + PL_FIELD_BOTTOM = PL_FIELD_ODD, +}; + +static inline enum pl_field pl_field_other(enum pl_field field) +{ + switch (field) { + case PL_FIELD_EVEN: return PL_FIELD_ODD; + case PL_FIELD_ODD: return PL_FIELD_EVEN; + default: return field; + } +} + +struct pl_field_pair { + // Top texture. If only this is specified, it's assumed to contain both + // fields in an interleaved fashion (MBAFF). + // + // Note: Support for separate fields (PAFF), is currently pending, so this + // is the only way to provide interlaced frames at the moment. + pl_tex top; +}; + +#define pl_field_pair(...) ((struct pl_field_pair) { __VA_ARGS__ }) + +struct pl_deinterlace_source { + // Previous, current and next source (interlaced) frames. `prev` and `next` + // may be NULL, but `cur` is required. If present, they must all have the + // exact same texture dimensions. + // + // Note: `prev` and `next` are only required for PL_DEINTERLACE_YADIF. + struct pl_field_pair prev, cur, next; + + // The parity of the current field to output. This field will be unmodified + // from `cur`, with the corresponding other field interpolated. + // + // If this is `PL_FIELD_NONE`, no deinterlacing is performed, and the + // texture is merely sampled as-is. + enum pl_field field; + + // The parity of the first frame in a stream. Set this the field that is + // (conceptually) ordered first in time. + // + // If this is `PL_FIELD_NONE`, it will instead default to `PL_FIELD_TOP`. + enum pl_field first_field; + + // Components to deinterlace. Components not specified will be ignored. + // Optional, if left as 0, all components will be deinterlaced. + uint8_t component_mask; +}; + +#define pl_deinterlace_source(...) (&(struct pl_deinterlace_source) { __VA_ARGS__ }) + +enum pl_deinterlace_algorithm { + // No-op deinterlacing, just sample the weaved frame un-touched. + PL_DEINTERLACE_WEAVE = 0, + + // Naive bob deinterlacing. 
Doubles the field lines vertically. + PL_DEINTERLACE_BOB, + + // "Yet another deinterlacing filter". Deinterlacer with temporal and + // spatial information. Based on FFmpeg's Yadif filter algorithm, but + // adapted slightly for the GPU. + PL_DEINTERLACE_YADIF, + + PL_DEINTERLACE_ALGORITHM_COUNT, +}; + +// Returns whether or not an algorithm requires `prev`/`next` refs to be set. +static inline bool pl_deinterlace_needs_refs(enum pl_deinterlace_algorithm algo) +{ + return algo == PL_DEINTERLACE_YADIF; +} + +struct pl_deinterlace_params { + // Algorithm to use. The recommended default is PL_DEINTERLACE_YADIF, which + // provides a good trade-off of quality and speed. + enum pl_deinterlace_algorithm algo; + + // Skip the spatial interlacing check. (PL_DEINTERLACE_YADIF only) + bool skip_spatial_check; +}; + +#define PL_DEINTERLACE_DEFAULTS \ + .algo = PL_DEINTERLACE_YADIF, + +#define pl_deinterlace_params(...) (&(struct pl_deinterlace_params) { PL_DEINTERLACE_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_deinterlace_params pl_deinterlace_default_params; + +// Deinterlaces a set of interleaved source frames and outputs the result into +// `vec4 color`. If `params` is left as NULL, it defaults to +// `&pl_deinterlace_default_params`. +PL_API void pl_shader_deinterlace(pl_shader sh, const struct pl_deinterlace_source *src, + const struct pl_deinterlace_params *params); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_DEINTERLACING_H_ diff --git a/src/include/libplacebo/shaders/dithering.h b/src/include/libplacebo/shaders/dithering.h new file mode 100644 index 0000000..9146c81 --- /dev/null +++ b/src/include/libplacebo/shaders/dithering.h @@ -0,0 +1,140 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_DITHERING_H_ +#define LIBPLACEBO_SHADERS_DITHERING_H_ + +// Dithering shaders + +#include <libplacebo/colorspace.h> +#include <libplacebo/dither.h> +#include <libplacebo/shaders.h> + +PL_API_BEGIN + +enum pl_dither_method { + // Dither with blue noise. Very high quality, but requires the use of a + // LUT. Warning: Computing a blue noise texture with a large size can be + // very slow, however this only needs to be performed once. Even so, using + // this with a `lut_size` greater than 6 is generally ill-advised. This is + // the preferred/default dither method. + PL_DITHER_BLUE_NOISE, + + // Dither with an ordered (bayer) dither matrix, using a LUT. Low quality, + // and since this also uses a LUT, there's generally no advantage to picking + // this instead of `PL_DITHER_BLUE_NOISE`. It's mainly there for testing. + PL_DITHER_ORDERED_LUT, + + // The same as `PL_DITHER_ORDERED_LUT`, but uses fixed function math instead + // of a LUT. This is faster, but only supports a fixed dither matrix size + // of 16x16 (equal to a `lut_size` of 4). + PL_DITHER_ORDERED_FIXED, + + // Dither with white noise. 
This does not require a LUT and is fairly cheap + // to compute. Unlike the other modes it doesn't show any repeating + // patterns either spatially or temporally, but the downside is that this + // is visually fairly jarring due to the presence of low frequencies in the + // noise spectrum. + PL_DITHER_WHITE_NOISE, + + PL_DITHER_METHOD_COUNT, +}; + +struct pl_dither_params { + // The source of the dither noise to use. + enum pl_dither_method method; + + // For the dither methods which require the use of a LUT, this controls + // the size of the LUT (base 2). If left as NULL, this defaults to 6, which + // is equivalent to a 64x64 dither matrix. Must not be larger than 8. + int lut_size; + + // Enables temporal dithering. This reduces the persistence of dithering + // artifacts by perturbing the dithering matrix per frame. + // Warning: This can cause nasty aliasing artifacts on some LCD screens. + bool temporal; + + // Gamma function to use for dither gamma correction. This will only have + // an effect when dithering to low bit depths (<= 4). + enum pl_color_transfer transfer; +}; + +#define PL_DITHER_DEFAULTS \ + .method = PL_DITHER_BLUE_NOISE, \ + .lut_size = 6, \ + /* temporal dithering commonly flickers on LCDs */ \ + .temporal = false, + +#define pl_dither_params(...) (&(struct pl_dither_params) { PL_DITHER_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_dither_params pl_dither_default_params; + +// Dither the colors to a lower depth, given in bits. This can be used on input +// colors of any precision. Basically, this rounds the colors to only linear +// multiples of the stated bit depth. The average intensity of the result +// will not change (i.e., the dither noise is balanced in both directions). +// If `params` is NULL, it defaults to &pl_dither_default_params. +// +// For the dither methods which require the use of a LUT, `dither_state` must +// be set to a valid pointer. To avoid thrashing the resource, users should +// avoid trying to re-use the same LUT for different dither configurations. If +// passed as NULL, libplacebo will automatically fall back to dither algorithms +// that don't require the use of a LUT. +// +// Warning: This dithering algorithm is not gamma-invariant; so using it for +// very low bit depths (below 4 or so) will noticeably increase the brightness +// of the resulting image. When doing low bit depth dithering for aesthetic +// purposes, it's recommended that the user explicitly (de)linearize the colors +// before and after this algorithm. +PL_API void pl_shader_dither(pl_shader sh, int new_depth, + pl_shader_obj *dither_state, + const struct pl_dither_params *params); + +struct pl_error_diffusion_params { + // Both the input and output texture must be provided up-front, with the + // same size. The output texture must be storable, and the input texture + // must be sampleable. + pl_tex input_tex; + pl_tex output_tex; + + // Depth to dither to. Required. + int new_depth; + + // Error diffusion kernel to use. Optional. If unspecified, defaults to + // `&pl_error_diffusion_sierra_lite`. + const struct pl_error_diffusion_kernel *kernel; +}; + +#define pl_error_diffusion_params(...) (&(struct pl_error_diffusion_params) { __VA_ARGS__ }) + +// Computes the shared memory requirements for a given error diffusion kernel. +// This can be used to test up-front whether or not error diffusion would be +// supported or not, before having to initialize textures. 
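// A minimal sketch of the `pl_shader_dither` call described above, quantizing
// to the target framebuffer depth with the default blue noise method. `depth`
// is the output bit depth; the LUT state object should be reused across
// frames.
//
//   static pl_shader_obj dither_state = NULL;
//
//   pl_shader_dither(sh, depth, &dither_state, pl_dither_params(
//       .method   = PL_DITHER_BLUE_NOISE,
//       .lut_size = 6,                 // 64x64 matrix, the default
//       .temporal = false,             // avoids flicker on some LCDs
//   ));
//
//   // ... and during teardown:
//   pl_shader_obj_destroy(&dither_state);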
+PL_API size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel, + int height); + +// Apply an error diffusion dithering kernel. This is a much more expensive and +// heavy dithering method, and is not generally recommended for realtime usage +// where performance is critical. +// +// Requires compute shader support. Returns false if dithering fail e.g. as a +// result of shader memory limits being exceeded. The resulting shader must be +// dispatched with a work group count of exactly 1. +PL_API bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_DITHERING_H_ diff --git a/src/include/libplacebo/shaders/film_grain.h b/src/include/libplacebo/shaders/film_grain.h new file mode 100644 index 0000000..8a9c78b --- /dev/null +++ b/src/include/libplacebo/shaders/film_grain.h @@ -0,0 +1,137 @@ +/* + * This file is part of libplacebo, which is normally licensed under the terms + * of the LGPL v2.1+. However, this file (film_grain.h) is also available under + * the terms of the more permissive MIT license: + * + * Copyright (c) 2018-2019 Niklas Haas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef LIBPLACEBO_SHADERS_FILM_GRAIN_H_ +#define LIBPLACEBO_SHADERS_FILM_GRAIN_H_ + +// Film grain synthesis shaders for AV1 / H.274. + +#include <stdint.h> +#include <stdbool.h> + +#include <libplacebo/colorspace.h> +#include <libplacebo/shaders.h> + +PL_API_BEGIN + +enum pl_film_grain_type { + PL_FILM_GRAIN_NONE = 0, + PL_FILM_GRAIN_AV1, + PL_FILM_GRAIN_H274, + PL_FILM_GRAIN_COUNT, +}; + +// AV1 film grain parameters. For the exact meaning of these, see the AV1 +// specification (section 6.8.20). +struct pl_av1_grain_data { + int num_points_y; + uint8_t points_y[14][2]; // [n][0] = value, [n][1] = scaling + bool chroma_scaling_from_luma; + int num_points_uv[2]; // should be {0} for grayscale images + uint8_t points_uv[2][10][2]; // like points_y for points_uv[0, 1] = u, v + int scaling_shift; + int ar_coeff_lag; + int8_t ar_coeffs_y[24]; + int8_t ar_coeffs_uv[2][25]; + int ar_coeff_shift; + int grain_scale_shift; + int8_t uv_mult[2]; + int8_t uv_mult_luma[2]; + int16_t uv_offset[2]; // 9-bit value, range [-256, 255] + bool overlap; +}; + +// H.274 film grain parameters. For the exact meaning of these, see the H.274 +// specification (section 8.5). 
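// A minimal sketch of applying AV1 film grain using the parameters above and
// the entry points declared further below in this header. The grain data and
// seed would normally be copied from the bitstream; the variable names here
// are placeholders.
//
//   struct pl_film_grain_params fg = {
//       .data = {
//           .type       = PL_FILM_GRAIN_AV1,
//           .seed       = frame_grain_seed,
//           .params.av1 = av1_grain_data,    // struct pl_av1_grain_data
//       },
//       .tex               = plane_tex,
//       .repr              = &plane_repr,    // normalized in-place
//       .components        = 3,
//       .component_mapping = {0, 1, 2},
//   };
//
//   static pl_shader_obj grain_state = NULL; // one per plane configuration
//   if (pl_needs_film_grain(&fg))
//       pl_shader_film_grain(sh, &grain_state, &fg);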
+struct pl_h274_grain_data { + int model_id; + int blending_mode_id; + int log2_scale_factor; + bool component_model_present[3]; + uint16_t num_intensity_intervals[3]; + uint8_t num_model_values[3]; + const uint8_t *intensity_interval_lower_bound[3]; + const uint8_t *intensity_interval_upper_bound[3]; + const int16_t (*comp_model_value[3])[6]; +}; + +// Tagged union for film grain data +struct pl_film_grain_data { + enum pl_film_grain_type type; // film grain type + uint64_t seed; // shared seed value + + union { + // Warning: These values are not sanity-checked at all, Invalid grain + // data results in undefined behavior! + struct pl_av1_grain_data av1; + struct pl_h274_grain_data h274; + } params; +}; + +// Options for the `pl_shader_film_grain` call. +struct pl_film_grain_params { + // Required for all film grain types: + struct pl_film_grain_data data; // film grain data + pl_tex tex; // texture to sample from + struct pl_color_repr *repr; // underlying color representation (see notes) + int components; + int component_mapping[4]; // same as `struct pl_plane` + + // Notes for `repr`: + // - repr->bits affects the rounding for grain generation + // - repr->levels affects whether or not we clip to full range or not + // - repr->sys affects the interpretation of channels + // - *repr gets normalized by this shader, which is why it's a pointer + + // Required for PL_FILM_GRAIN_AV1 only: + pl_tex luma_tex; // "luma" texture (see notes) + int luma_comp; // index of luma in `luma_tex` + + // Notes for `luma_tex`: + // - `luma_tex` must be specified if the `tex` does not itself contain the + // "luma-like" component. For XYZ systems, the Y channel is the luma + // component. For RGB systems, the G channel is. +}; + +#define pl_film_grain_params(...) (&(struct pl_film_grain_params) { __VA_ARGS__ }) + +// Test if film grain needs to be applied. This is a helper function that users +// can use to decide whether or not `pl_shader_film_grain` needs to be called, +// based on the given grain metadata. +PL_API bool pl_needs_film_grain(const struct pl_film_grain_params *params); + +// Sample from a texture while applying film grain at the same time. +// `grain_state` must be unique for every plane configuration, as it may +// contain plane-dependent state. +// +// Returns false on any error, or if film grain generation is not supported +// due to GLSL limitations. +PL_API bool pl_shader_film_grain(pl_shader sh, pl_shader_obj *grain_state, + const struct pl_film_grain_params *params); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_FILM_GRAIN_H_ diff --git a/src/include/libplacebo/shaders/icc.h b/src/include/libplacebo/shaders/icc.h new file mode 100644 index 0000000..a4003f4 --- /dev/null +++ b/src/include/libplacebo/shaders/icc.h @@ -0,0 +1,135 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef LIBPLACEBO_SHADERS_ICC_H_ +#define LIBPLACEBO_SHADERS_ICC_H_ + +// Functions for generating and applying ICC-derived (3D)LUTs + +#include <libplacebo/colorspace.h> +#include <libplacebo/shaders.h> + +PL_API_BEGIN + +struct pl_icc_params { + // The rendering intent to use, for profiles with multiple intents. A + // recommended value is PL_INTENT_RELATIVE_COLORIMETRIC for color-accurate + // video reproduction, or PL_INTENT_PERCEPTUAL for profiles containing + // meaningful perceptual mapping tables for some more suitable color space + // like BT.709. + // + // If this is set to the special value PL_INTENT_AUTO, will use the + // preferred intent provided by the profile header. + enum pl_rendering_intent intent; + + // The size of the 3DLUT to generate. If left as NULL, these individually + // default to values appropriate for the profile. (Based on internal + // precision heuristics) + // + // Note: Setting this manually is strongly discouraged, as it can result + // in excessively high 3DLUT sizes where a much smaller LUT would have + // sufficed. + int size_r, size_g, size_b; + + // This field can be used to override the detected brightness level of the + // ICC profile. If you set this to the special value 0 (or a negative + // number), libplacebo will attempt reading the brightness value from the + // ICC profile's tagging (if available), falling back to PL_COLOR_SDR_WHITE + // if unavailable. + float max_luma; + + // Force black point compensation. May help avoid crushed or raised black + // points on "improper" profiles containing e.g. colorimetric tables that + // do not round-trip. Should not be required on well-behaved profiles, + // or when using PL_INTENT_PERCEPTUAL, but YMMV. + bool force_bpc; + + // If provided, this pl_cache instance will be used, instead of the + // GPU-internal cache, to cache the generated 3DLUTs. Note that these can + // get large, especially for large values of size_{r,g,b}, so the user may + // wish to split this cache off from the main shader cache. (Optional) + pl_cache cache; + + // Deprecated legacy caching API. Replaced by `cache`. + PL_DEPRECATED void *cache_priv; + PL_DEPRECATED void (*cache_save)(void *priv, uint64_t sig, const uint8_t *cache, size_t size); + PL_DEPRECATED bool (*cache_load)(void *priv, uint64_t sig, uint8_t *cache, size_t size); +}; + +#define PL_ICC_DEFAULTS \ + .intent = PL_INTENT_RELATIVE_COLORIMETRIC, \ + .max_luma = PL_COLOR_SDR_WHITE, + +#define pl_icc_params(...) (&(struct pl_icc_params) { PL_ICC_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_icc_params pl_icc_default_params; + +// This object represents a "parsed" ICC profile. +typedef const struct pl_icc_object_t { + // Provided params, with the `intent` and `size` fields set (as described) + struct pl_icc_params params; + + // Signature of the corresponding ICC profile. + uint64_t signature; + + // Detected color space (or UNKNOWN for profiles which don't contain an + // exact match), with HDR metedata set to the detected gamut and + // white/black value ranges. + struct pl_color_space csp; + + // Best estimate of profile gamma. This only serves as a rough guideline. + float gamma; + + // Smallest containing primary set, always set. + enum pl_color_primaries containing_primaries; +} *pl_icc_object; + +// Attempts opening/parsing the contents of an ICC profile. The resulting +// object is memory managed and may outlive the original profile - access +// to the underlying profile is no longer needed once this returns. 
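+//
+// As a rough usage sketch (not part of the upstream header), assuming a raw
+// profile wrapped in a `struct pl_icc_profile profile` (see colorspace.h), a
+// valid `pl_log log`, a shader `sh` and a NULL-initialized `pl_shader_obj lut`:
+//
+//   pl_icc_object icc = pl_icc_open(log, &profile, pl_icc_params(
+//       .intent = PL_INTENT_PERCEPTUAL,
+//   ));
+//   if (icc) {
+//       struct pl_color_space csp;
+//       pl_icc_decode(sh, icc, &lut, &csp);
+//       /* ... shader now operates on linear light, described by `csp` ... */
+//   }
+//   ...
+//   pl_icc_close(&icc);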
+PL_API pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile, + const struct pl_icc_params *params); +PL_API void pl_icc_close(pl_icc_object *icc); + +// Update an existing pl_icc_object, which may be NULL, replacing it by the +// new profile and parameters (if incompatible). +// +// Returns success. `obj` is set to the created profile, or NULL on error. +// +// Note: If `profile->signature` matches `(*obj)->signature`, or if `profile` is +// NULL, then the existing profile is directly reused, with only the effective +// parameters changing. In this case, `profile->data` is also *not* read from, +// and may safely be NULL. +PL_API bool pl_icc_update(pl_log log, pl_icc_object *obj, + const struct pl_icc_profile *profile, + const struct pl_icc_params *params); + +// Decode the input from the colorspace determined by the attached ICC profile +// to linear light RGB (in the profile's containing primary set). `lut` must be +// set to a shader object that will store the GPU resources associated with the +// generated LUT. The resulting color space will be written to `out_csp`. +PL_API void pl_icc_decode(pl_shader sh, pl_icc_object profile, pl_shader_obj *lut, + struct pl_color_space *out_csp); + +// Encode the input from linear light RGB (in the profile's containing primary +// set) into the colorspace determined by the attached ICC profile. `lut` must +// be set to a shader object that will store the GPU resources associated with +// the generated LUT. +PL_API void pl_icc_encode(pl_shader sh, pl_icc_object profile, pl_shader_obj *lut); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_ICC_H_ diff --git a/src/include/libplacebo/shaders/lut.h b/src/include/libplacebo/shaders/lut.h new file mode 100644 index 0000000..6e30ddc --- /dev/null +++ b/src/include/libplacebo/shaders/lut.h @@ -0,0 +1,78 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_LUT_H_ +#define LIBPLACEBO_SHADERS_LUT_H_ + +// Shaders for loading and applying arbitrary custom 1D/3DLUTs + +#include <libplacebo/colorspace.h> +#include <libplacebo/shaders.h> + +PL_API_BEGIN + +// Struct defining custom LUTs +// +// Note: Users may freely create their own instances of this struct, there is +// nothing particularly special about `pl_lut_parse_cube`. +struct pl_custom_lut { + // Some unique signature identifying this LUT, needed to detect state + // changes (for cache invalidation). This should ideally be a hash of the + // file contents. (Which is what `pl_lut_parse_*` will set it to.) + uint64_t signature; + + // Size of each dimension, in the order R, G, B. For 1D LUTs, only the R + // dimension should be specified (the others left as 0). + int size[3]; + + // Raw LUT data itself, in properly scaled floating point format. 
For 3D + // LUTs, the innermost dimension is the first dimension (R), and the + // outermost dimension is the last dimension (B). Individual color samples + // are in the order R, G, B. + const float *data; + + // Extra input/output shaper matrices. Ignored if equal to {0}. This is + // mostly useful for 1D LUTs, since 3D LUTs can bake the shaper matrix into + // the LUT itself - but it can still help optimize LUT precision. + pl_matrix3x3 shaper_in, shaper_out; + + // Nominal metadata for the input/output of a LUT. Left as {0} if unknown. + // Note: This is purely informative, `pl_shader_custom_lut` ignores it. + struct pl_color_repr repr_in, repr_out; + struct pl_color_space color_in, color_out; +}; + +// Parse a 3DLUT in .cube format. Returns NULL if the file fails parsing. +PL_API struct pl_custom_lut *pl_lut_parse_cube(pl_log log, const char *str, size_t str_len); + +// Frees a LUT created by `pl_lut_parse_*`. +PL_API void pl_lut_free(struct pl_custom_lut **lut); + +// Apply a `pl_custom_lut`. The user is responsible for ensuring colors going +// into the LUT are in the expected format as informed by the LUT metadata. +// +// `lut_state` must be a pointer to a NULL-initialized shader state object that +// will be used to encapsulate any required GPU state. +// +// Note: `lut` does not have to be allocated by `pl_lut_parse_*`. It can be a +// struct filled out by the user. +PL_API void pl_shader_custom_lut(pl_shader sh, const struct pl_custom_lut *lut, + pl_shader_obj *lut_state); + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_LUT_H_ diff --git a/src/include/libplacebo/shaders/sampling.h b/src/include/libplacebo/shaders/sampling.h new file mode 100644 index 0000000..5221e44 --- /dev/null +++ b/src/include/libplacebo/shaders/sampling.h @@ -0,0 +1,257 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SHADERS_SAMPLING_H_ +#define LIBPLACEBO_SHADERS_SAMPLING_H_ + +// Sampling operations. These shaders perform some form of sampling operation +// from a given pl_tex. In order to use these, the pl_shader *must* have been +// created using the same `gpu` as the originating `pl_tex`. Otherwise, this +// is undefined behavior. They require nothing (PL_SHADER_SIG_NONE) and return +// a color (PL_SHADER_SIG_COLOR). + +#include <libplacebo/colorspace.h> +#include <libplacebo/filters.h> +#include <libplacebo/shaders.h> + +PL_API_BEGIN + +// Common parameters for sampling operations +struct pl_sample_src { + // There are two mutually exclusive ways of providing the source to sample + // from: + // + // 1. Provide the texture and sampled region directly. 
This generates + // a shader with input signature `PL_SHADER_SIG_NONE`, which binds the + // texture as a descriptor (and the coordinates as a vertex attribute) + pl_tex tex; // texture to sample + pl_rect2df rect; // sub-rect to sample from (optional) + enum pl_tex_address_mode address_mode; // preferred texture address mode + + // 2. Have the shader take it as an argument. Doing this requires + // specifying the missing metadata of the texture backing the sampler, so + // that the shader generation can generate the correct code. + int tex_w, tex_h; // dimensions of the actual texture + enum pl_fmt_type format; // format of the sampler being accepted + enum pl_sampler_type sampler; // type of the sampler being accepted + enum pl_tex_sample_mode mode; // sample mode of the sampler being accepted + float sampled_w, sampled_h; // dimensions of the sampled region (optional) + + // Common metadata for both sampler input types: + int components; // number of components to sample (optional) + uint8_t component_mask; // bitmask of components to sample (optional) + int new_w, new_h; // dimensions of the resulting output (optional) + float scale; // factor to multiply into sampled signal (optional) + + // Note: `component_mask` and `components` are mutually exclusive, the + // former is preferred if both are specified. +}; + +#define pl_sample_src(...) (&(struct pl_sample_src) { __VA_ARGS__ }) + +struct pl_deband_params { + // The number of debanding steps to perform per sample. Each step reduces a + // bit more banding, but takes time to compute. Note that the strength of + // each step falls off very quickly, so high numbers (>4) are practically + // useless. Defaults to 1. + int iterations; + + // The debanding filter's cut-off threshold. Higher numbers increase the + // debanding strength dramatically, but progressively diminish image + // details. Defaults to 3.0. + float threshold; + + // The debanding filter's initial radius. The radius increases linearly + // for each iteration. A higher radius will find more gradients, but a + // lower radius will smooth more aggressively. Defaults to 16.0. + float radius; + + // Add some extra noise to the image. This significantly helps cover up + // remaining quantization artifacts. Higher numbers add more noise. + // Note: When debanding HDR sources, even a small amount of grain can + // result in a very big change to the brightness level. It's recommended to + // either scale this value down or disable it entirely for HDR. + // + // Defaults to 4.0, which is very mild. + float grain; + + // 'Neutral' grain value for each channel being debanded (sorted in order + // from low to high index). Grain application will be modulated to avoid + // disturbing colors close to this value. Set this to a value corresponding + // to black in the relevant colorspace. + float grain_neutral[3]; +}; + +#define PL_DEBAND_DEFAULTS \ + .iterations = 1, \ + .threshold = 3.0, \ + .radius = 16.0, \ + .grain = 4.0, + +#define pl_deband_params(...) (&(struct pl_deband_params) {PL_DEBAND_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_deband_params pl_deband_default_params; + +// Debands a given texture and returns the sampled color in `vec4 color`. If +// `params` is left as NULL, it defaults to &pl_deband_default_params. Note +// that `tex->params.format` must have PL_FMT_CAP_LINEAR. When the given +// `pl_sample_src` implies scaling, this effectively performs bilinear +// sampling on the input (but not the output). 
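+//
+// As a rough sketch (not part of the upstream header), debanding a texture
+// `tex` on an existing shader `sh`, with slightly stronger-than-default
+// settings and grain disabled (e.g. for HDR content), might look like:
+//
+//   pl_shader_deband(sh, pl_sample_src( .tex = tex ),
+//                    pl_deband_params( .iterations = 2, .grain = 0.0 ));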
+// +// Note: This can also be used as a pure grain function, by setting the number +// of iterations to 0. +PL_API void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src, + const struct pl_deband_params *params); + +// Performs direct / native texture sampling, using whatever texture filter is +// available (linear for linearly sampleable sources, nearest otherwise). +// +// Note: This is generally very low quality and should be avoided if possible, +// for both upscaling and downscaling. +PL_API bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src); + +// Performs hardware-accelerated nearest neighbour sampling. This is similar to +// `pl_shader_sample_direct`, but forces nearest neighbour interpolation. +PL_API bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src); + +// Performs hardware-accelerated bilinear sampling. This is similar to +// `pl_shader_sample_direct`, but forces bilinear interpolation. +PL_API bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src); + +// Optimized versions of specific, strictly positive scaler kernels that take +// adantage of linear texture sampling to reduce the number of fetches needed +// by a factor of four. This family of functions performs radius-2 scaling +// with only four texture fetches, which is far more efficient than using +// the generalized 1D scaling method. Only works well for upscaling. +PL_API bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src); +PL_API bool pl_shader_sample_hermite(pl_shader sh, const struct pl_sample_src *src); +PL_API bool pl_shader_sample_gaussian(pl_shader sh, const struct pl_sample_src *src); + +// A sampler that is similar to nearest neighbour sampling, but tries to +// preserve pixel aspect ratios. This is mathematically equivalent to taking an +// idealized image with square pixels, sampling it at an infinite resolution, +// and then downscaling that to the desired resolution. (Hence it being called +// "oversample"). Good for pixel art. +// +// The threshold provides a cutoff threshold below which the contribution of +// pixels should be ignored, trading some amount of aspect ratio distortion for +// a slightly crisper image. A value of `threshold == 0.5` makes this filter +// equivalent to regular nearest neighbour sampling. +PL_API bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src, + float threshold); + +struct pl_sample_filter_params { + // The filter to use for sampling. + struct pl_filter_config filter; + + // Antiringing strength. A value of 0.0 disables antiringing, and a value + // of 1.0 enables full-strength antiringing. Defaults to 0.0 if + // unspecified. + // + // Note: Ignored if `filter.antiring` is already set to something nonzero. + float antiring; + + // Disable the use of compute shaders (e.g. if rendering to non-storable tex) + bool no_compute; + // Disable the use of filter widening / anti-aliasing (for downscaling) + bool no_widening; + + // This shader object is used to store the LUT, and will be recreated + // if necessary. To avoid thrashing the resource, users should avoid trying + // to re-use the same LUT for different filter configurations or scaling + // ratios. Must be set to a valid pointer, and the target NULL-initialized. + pl_shader_obj *lut; + + // Deprecated / removed fields + int lut_entries PL_DEPRECATED; // hard-coded as 256 + float cutoff PL_DEPRECATED; // hard-coded as 1e-3 +}; + +#define pl_sample_filter_params(...) 
(&(struct pl_sample_filter_params) { __VA_ARGS__ }) + +// Performs polar sampling. This internally chooses between an optimized compute +// shader, and various fragment shaders, depending on the supported GLSL version +// and GPU features. Returns whether or not it was successful. +// +// Note: `params->filter.polar` must be true to use this function. +PL_API bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src, + const struct pl_sample_filter_params *params); + +// Performs orthogonal (1D) sampling. Using this twice in a row (once vertical +// and once horizontal) effectively performs a 2D upscale. This is lower +// quality than polar sampling, but significantly faster, and therefore the +// recommended default. Returns whether or not it was successful. +// +// `src` must represent a scaling operation that only scales in one direction, +// i.e. either only X or only Y. The other direction must be left unscaled. +// +// Note: Due to internal limitations, this may currently only be used on 2D +// textures - even though the basic principle would work for 1D and 3D textures +// as well. +PL_API bool pl_shader_sample_ortho2(pl_shader sh, const struct pl_sample_src *src, + const struct pl_sample_filter_params *params); + +struct pl_distort_params { + // An arbitrary 2x2 affine transformation to apply to the input image. + // For simplicity, the input image is explicitly centered and scaled such + // that the longer dimension is in [-1,1], before applying this. + pl_transform2x2 transform; + + // If true, the texture is placed inside the center of the canvas without + // scaling. If false, it is effectively stretched to the canvas size. + bool unscaled; + + // If true, the transformation is automatically scaled down and shifted to + // ensure that the resulting image fits inside the output canvas. + bool constrain; + + // If true, use bicubic interpolation rather than faster bilinear + // interpolation. Higher quality but slower. + bool bicubic; + + // Specifies the texture address mode to use when sampling out of bounds. + enum pl_tex_address_mode address_mode; + + // If set, all out-of-bounds accesses will instead be treated as + // transparent, according to the given alpha mode. (Which should match the + // alpha mode of the texture) + // + // Note: `address_mode` has no effect when this is specified. + enum pl_alpha_mode alpha_mode; +}; + +#define PL_DISTORT_DEFAULTS \ + .transform.mat.m = {{ 1, 0 }, {0, 1}}, + +#define pl_distort_params(...) (&(struct pl_distort_params) {PL_DISTORT_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_distort_params pl_distort_default_params; + +// Distorts the input image using a given set of transformation parameters. +// `out_w` and `out_h` determine the size of the effective canvas inside which +// the distorted result may be rendered. Areas outside of this canvas will +// be implicitly cut off. +PL_API void pl_shader_distort(pl_shader sh, pl_tex tex, int out_w, int out_h, + const struct pl_distort_params *params); + +enum PL_DEPRECATED { // for `int pass` + PL_SEP_VERT = 0, + PL_SEP_HORIZ, + PL_SEP_PASSES +}; + +PL_API_END + +#endif // LIBPLACEBO_SHADERS_SAMPLING_H_ diff --git a/src/include/libplacebo/swapchain.h b/src/include/libplacebo/swapchain.h new file mode 100644 index 0000000..b53aa5c --- /dev/null +++ b/src/include/libplacebo/swapchain.h @@ -0,0 +1,171 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_SWAPCHAIN_H_ +#define LIBPLACEBO_SWAPCHAIN_H_ + +#include <libplacebo/common.h> +#include <libplacebo/colorspace.h> +#include <libplacebo/gpu.h> + +PL_API_BEGIN + +// This abstraction represents a low-level interface to visible surfaces +// exposed by a graphics API (and accompanying GPU instance), allowing users to +// directly present frames to the screen (or window, typically). This is a +// sister API to gpu.h and follows the same convention w.r.t undefined behavior. +// +// Thread-safety: Safe +typedef const struct pl_swapchain_t { + pl_log log; + pl_gpu gpu; +} *pl_swapchain; + +// Destroys this swapchain. May be used at any time, and may block until the +// completion of all outstanding rendering commands. The swapchain and any +// resources retrieved from it must not be used afterwards. +PL_API void pl_swapchain_destroy(pl_swapchain *sw); + +// Returns the approximate current swapchain latency in vsyncs, or 0 if +// unknown. A latency of 1 means that `submit_frame` followed by `swap_buffers` +// will block until the just-submitted frame has finished rendering. Typical +// values are 2 or 3, which enable better pipelining by allowing the GPU to be +// processing one or two frames at the same time as the user is preparing the +// next for submission. +PL_API int pl_swapchain_latency(pl_swapchain sw); + +// Update/query the swapchain size. This function performs both roles: it tries +// setting the swapchain size to the values requested by the user, and returns +// in the same variables what width/height the swapchain was actually set to - +// which may be (substantially) different from the values requested by the +// user. A value of 0 means "unknown/none" (in which case, libplacebo won't try +// updating the size - it will simply return the current state of the +// swapchain). It's also possible for libplacebo to return values of 0, such as +// in the case that the swapchain doesn't exist yet. +// +// Returns false on significant errors (e.g. dead surface). This function can +// effectively be used to probe if creating a swapchain works. +PL_API bool pl_swapchain_resize(pl_swapchain sw, int *width, int *height); + +// Backwards compatibility +#define pl_swapchain_colors pl_color_space + +// Inform the swapchain about the input color space. This API deliberately +// provides no feedback, because the swapchain can internally decide what to do +// with this information, including ignoring it entirely, or applying it +// asynchronously. Users must still base their rendering on the value of +// `pl_swapchain_frame.color_space`. +// +// Note: Calling this function a second time completely overrides any +// previously specified hint. So calling this on {0} or NULL resets the +// swapchain back to its initial/preferred colorspace. 
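+//
+// As a rough sketch (not part of the upstream header), signalling an HDR10
+// (PQ / BT.2020) stream to the swapchain might look like:
+//
+//   pl_swapchain_colorspace_hint(sw, &(struct pl_color_space) {
+//       .primaries = PL_COLOR_PRIM_BT_2020,
+//       .transfer  = PL_COLOR_TRC_PQ,
+//   });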
+// +// Note: If `csp->transfer` is a HDR transfer curve but HDR metadata is left +// unspecified, the HDR metadata defaults to `pl_hdr_metadata_hdr10`. +// Conversely, if the HDR metadata is non-empty but `csp->transfer` is left as +// PL_COLOR_TRC_UNKNOWN, then it instead defaults to PL_COLOR_TRC_PQ. +PL_API void pl_swapchain_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp); + +// The struct used to hold the results of `pl_swapchain_start_frame` +struct pl_swapchain_frame { + // A texture representing the framebuffer users should use for rendering. + // It's guaranteed that `fbo->params.renderable` and `fbo->params.blit_dst` + // will be true, but no other guarantees are made - not even that + // `fbo->params.format` is a real format. + pl_tex fbo; + + // If true, the user should assume that this framebuffer will be flipped + // as a result of presenting it on-screen. If false, nothing special needs + // to be done - but if true, users should flip the coordinate system of + // the `pl_pass` that is rendering to this framebuffer. + // + // Note: Normally, libplacebo follows the convention that (0,0) represents + // the top left of the image/screen. So when flipped is true, this means + // (0,0) on this framebuffer gets displayed as the bottom left of the image. + bool flipped; + + // Indicates the color representation this framebuffer will be interpreted + // as by the host system / compositor / display, including the bit depth + // and alpha handling (where available). + struct pl_color_repr color_repr; + struct pl_color_space color_space; +}; + +// Retrieve a new frame from the swapchain. Returns whether successful. It's +// worth noting that this function can fail sporadically for benign reasons, +// for example the window being invisible or inaccessible. This function may +// block until an image is available, which may be the case if the GPU is +// rendering frames significantly faster than the display can output them. It +// may also be non-blocking, so users shouldn't rely on this call alone in +// order to meter rendering speed. (Specifics depend on the underlying graphics +// API) +PL_API bool pl_swapchain_start_frame(pl_swapchain sw, struct pl_swapchain_frame *out_frame); + +// Submits the previously started frame. Non-blocking. This must be issued in +// lockstep with pl_swapchain_start_frame - there is no way to start multiple +// frames and submit them out-of-order. The frames submitted this way will +// generally be made visible in a first-in first-out fashion, although +// specifics depend on the mechanism used to create the pl_swapchain. (See the +// platform-specific APIs for more info). +// +// Returns whether successful. This should normally never fail, unless the +// GPU/surface has been lost or some other critical error has occurred. The +// "started" frame is consumed even in the event of failure. +// +// Note that `start_frame` and `submit_frame` form a lock pair, i.e. trying to +// call e.g. `pl_swapchain_resize` from another thread will block until +// `pl_swapchain_submit_frame` is finished. +PL_API bool pl_swapchain_submit_frame(pl_swapchain sw); + +// Performs a "buffer swap", or some generalization of the concept. In layman's +// terms, this blocks until the execution of the Nth previously submitted frame +// has been "made complete" in some sense. (The N derives from the swapchain's +// built-in latency. See `pl_swapchain_latency` for more information). 
+// +// Users should include this call in their rendering loops in order to make +// sure they aren't submitting rendering commands faster than the GPU can +// process them, which would potentially lead to a queue overrun or exhaust +// memory. +// +// An example loop might look like this: +// +// while (rendering) { +// struct pl_swapchain_frame frame; +// bool ok = pl_swapchain_start_frame(swapchain, &frame); +// if (!ok) { +// /* wait some time, or decide to stop rendering */ +// continue; +// } +// +// /* do some rendering with frame.fbo */ +// +// ok = pl_swapchain_submit_frame(swapchain); +// if (!ok) +// break; +// +// pl_swapchain_swap_buffers(swapchain); +// } +// +// The duration this function blocks for, if at all, may be very inconsistent +// and should not be used as an authoritative source of vsync timing +// information without sufficient smoothing/filtering (and if so, the time that +// `start_frame` blocked for should also be included). +PL_API void pl_swapchain_swap_buffers(pl_swapchain sw); + +PL_API_END + +#endif // LIBPLACEBO_SWAPCHAIN_H_ diff --git a/src/include/libplacebo/tone_mapping.h b/src/include/libplacebo/tone_mapping.h new file mode 100644 index 0000000..48f1eb7 --- /dev/null +++ b/src/include/libplacebo/tone_mapping.h @@ -0,0 +1,268 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_TONE_MAPPING_H_ +#define LIBPLACEBO_TONE_MAPPING_H_ + +#include <stddef.h> +#include <stdbool.h> + +#include <libplacebo/common.h> +#include <libplacebo/colorspace.h> + +PL_API_BEGIN + +struct pl_tone_map_params; +struct pl_tone_map_function { + const char *name; // Identifier + const char *description; // Friendly / longer name + + // This controls the type of values input/output to/from `map` + enum pl_hdr_scaling scaling; + + // The tone-mapping function itself. Iterates over all values in `lut`, and + // adapts them as needed. + // + // Note that the `params` struct fed into this function is guaranteed to + // satisfy `params->input_scaling == params->output_scaling == scaling`, + // and also obeys `params->input_max >= params->output_max`. + void (*map)(float *lut, const struct pl_tone_map_params *params); + + // Inverse tone mapping function. Optional. If absent, this tone mapping + // curve only works in the forwards direction. + // + // For this function, `params->input_max <= params->output_max`. + void (*map_inverse)(float *lut, const struct pl_tone_map_params *params); + + // Private data. Unused by libplacebo, but may be accessed by `map`. + void *priv; + + // --- Deprecated fields + const char *param_desc PL_DEPRECATED; + float param_min PL_DEPRECATED; + float param_def PL_DEPRECATED; + float param_max PL_DEPRECATED; +}; + +struct pl_tone_map_constants { + // Configures the knee point, as a ratio between the source average and + // target average (in PQ space). 
An adaptation of 1.0 always adapts the + // source scene average brightness to the (scaled) target average, + // while a value of 0.0 never modifies scene brightness. [0,1] + // + // Affects all methods that use the ST2094 knee point determination + // (currently ST2094-40, ST2094-10 and spline) + float knee_adaptation; + + // Configures the knee point minimum and maximum, respectively, as + // a percentage of the PQ luminance range. Provides a hard limit on the + // knee point chosen by `knee_adaptation`. + float knee_minimum; // (0, 0.5) + float knee_maximum; // (0.5, 1.0) + + // Default knee point to use in the absence of source scene average + // metadata. Normally, this is ignored in favor of picking the knee + // point as the (relative) source scene average brightness level. + float knee_default; // [knee_minimum, knee_maximum] + + // Knee point offset (for BT.2390 only). Note that a value of 0.5 is + // the spec-defined default behavior, which differs from the libplacebo + // default of 1.0. [0.5, 2] + float knee_offset; + + // For the single-pivot polynomial (spline) function, this controls the + // coefficients used to tune the slope of the curve. This tuning is designed + // to make the slope closer to 1.0 when the difference in peaks is low, + // and closer to linear when the difference between peaks is high. + float slope_tuning; // [0,10] + float slope_offset; // [0,1] + + // Contrast setting for the spline function. Higher values make the curve + // steeper (closer to `clip`), preserving midtones at the cost of losing + // shadow/highlight details, while lower values make the curve shallowed + // (closer to `linear`), preserving highlights at the cost of losing midtone + // contrast. Values above 1.0 are possible, resulting in an output with more + // contrast than the input. + float spline_contrast; // [0,1.5] + + // For the reinhard function, this specifies the local contrast coefficient + // at the display peak. Essentially, a value of 0.5 implies that the + // reference white will be about half as bright as when clipping. (0,1) + float reinhard_contrast; + + // For legacy functions (mobius, gamma) which operate on linear light, this + // directly sets the corresponding knee point. (0,1) + float linear_knee; + + // For linear methods (linear, linearlight), this controls the linear + // exposure/gain applied to the image. (0,10] + float exposure; +}; + +#define PL_TONE_MAP_CONSTANTS \ + .knee_adaptation = 0.4f, \ + .knee_minimum = 0.1f, \ + .knee_maximum = 0.8f, \ + .knee_default = 0.4f, \ + .knee_offset = 1.0f, \ + .slope_tuning = 1.5f, \ + .slope_offset = 0.2f, \ + .spline_contrast = 0.5f, \ + .reinhard_contrast = 0.5f, \ + .linear_knee = 0.3f, \ + .exposure = 1.0f, + +struct pl_tone_map_params { + // If `function` is NULL, defaults to `pl_tone_map_clip`. + const struct pl_tone_map_function *function; + + // Common constants, should be initialized to PL_TONE_MAP_CONSTANTS if + // not intending to override them further. + struct pl_tone_map_constants constants; + + // The desired input/output scaling of the tone map. If this differs from + // `function->scaling`, any required conversion will be performed. + // + // Note that to maximize LUT efficiency, it's *highly* recommended to use + // either PL_HDR_PQ or PL_HDR_SQRT as the input scaling, except when + // using `pl_tone_map_sample`. + enum pl_hdr_scaling input_scaling; + enum pl_hdr_scaling output_scaling; + + // The size of the resulting LUT. 
(For `pl_tone_map_generate` only) + size_t lut_size; + + // The characteristics of the input, in `input_scaling` units. + float input_min; + float input_max; + float input_avg; // or 0 if unknown + + // The desired characteristics of the output, in `output_scaling` units. + float output_min; + float output_max; + + // The input HDR metadata. Only used by a select few tone-mapping + // functions, currently only SMPTE ST2094. (Optional) + struct pl_hdr_metadata hdr; + + // --- Deprecated fields + float param PL_DEPRECATED; // see `constants` +}; + +#define pl_tone_map_params(...) (&(struct pl_tone_map_params) { __VA_ARGS__ }); + +// Note: Only does pointer equality testing on `function` +PL_API bool pl_tone_map_params_equal(const struct pl_tone_map_params *a, + const struct pl_tone_map_params *b); + +// Clamps/defaults the parameters, including input/output maximum. +PL_API void pl_tone_map_params_infer(struct pl_tone_map_params *params); + +// Returns true if the given tone mapping configuration effectively represents +// a no-op configuration. Tone mapping can be skipped in this case (although +// strictly speaking, the LUT would still clip illegal input values) +PL_API bool pl_tone_map_params_noop(const struct pl_tone_map_params *params); + +// Generate a tone-mapping LUT for a given configuration. This will always +// span the entire input range, as given by `input_min` and `input_max`. +PL_API void pl_tone_map_generate(float *out, const struct pl_tone_map_params *params); + +// Samples a tone mapping function at a single position. Note that this is less +// efficient than `pl_tone_map_generate` for generating multiple values. +// +// Ignores `params->lut_size`. +PL_API float pl_tone_map_sample(float x, const struct pl_tone_map_params *params); + +// Performs no tone-mapping, just clips out-of-range colors. Retains perfect +// color accuracy for in-range colors but completely destroys out-of-range +// information. Does not perform any black point adaptation. +PL_API extern const struct pl_tone_map_function pl_tone_map_clip; + +// EETF from SMPTE ST 2094-40 Annex B, which uses the provided OOTF based on +// Bezier curves to perform tone-mapping. The OOTF used is adjusted based on +// the ratio between the targeted and actual display peak luminances. In the +// absence of HDR10+ metadata, falls back to a simple constant bezier curve. +PL_API extern const struct pl_tone_map_function pl_tone_map_st2094_40; + +// EETF from SMPTE ST 2094-10 Annex B.2, which takes into account the input +// signal average luminance in addition to the maximum/minimum. +// +// Note: This does *not* currently include the subjective gain/offset/gamma +// controls defined in Annex B.3. (Open an issue with a valid sample file if +// you want such parameters to be respected.) +PL_API extern const struct pl_tone_map_function pl_tone_map_st2094_10; + +// EETF from the ITU-R Report BT.2390, a hermite spline roll-off with linear +// segment. +PL_API extern const struct pl_tone_map_function pl_tone_map_bt2390; + +// EETF from ITU-R Report BT.2446, method A. Can be used for both forward +// and inverse tone mapping. +PL_API extern const struct pl_tone_map_function pl_tone_map_bt2446a; + +// Simple spline consisting of two polynomials, joined by a single pivot point, +// which is tuned based on the source scene average brightness (taking into +// account dynamic metadata if available). This function can be used +// for both forward and inverse tone mapping. 
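+//
+// As a rough sketch (not part of the upstream header), generating a 256-entry
+// forward LUT with this function - assuming the source/target ranges
+// (`src_min`/`src_max`, `dst_min`/`dst_max`) are already known, in PQ units -
+// might look like:
+//
+//   struct pl_tone_map_params params = {
+//       .function       = &pl_tone_map_spline,
+//       .constants      = { PL_TONE_MAP_CONSTANTS },
+//       .input_scaling  = PL_HDR_PQ,
+//       .output_scaling = PL_HDR_PQ,
+//       .lut_size       = 256,
+//       .input_min      = src_min,
+//       .input_max      = src_max,
+//       .output_min     = dst_min,
+//       .output_max     = dst_max,
+//   };
+//   pl_tone_map_params_infer(&params);
+//   pl_tone_map_generate(lut, &params);   // `lut` points to 256 floats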
+PL_API extern const struct pl_tone_map_function pl_tone_map_spline; + +// Very simple non-linear curve. Named after Erik Reinhard. +PL_API extern const struct pl_tone_map_function pl_tone_map_reinhard; + +// Generalization of the reinhard tone mapping algorithm to support an +// additional linear slope near black. The name is derived from its function +// shape (ax+b)/(cx+d), which is known as a Möbius transformation. +PL_API extern const struct pl_tone_map_function pl_tone_map_mobius; + +// Piece-wise, filmic tone-mapping algorithm developed by John Hable for use in +// Uncharted 2, inspired by a similar tone-mapping algorithm used by Kodak. +// Popularized by its use in video games with HDR rendering. Preserves both +// dark and bright details very well, but comes with the drawback of changing +// the average brightness quite significantly. This is sort of similar to +// pl_tone_map_reinhard with `reinhard_contrast=0.24`. +PL_API extern const struct pl_tone_map_function pl_tone_map_hable; + +// Fits a gamma (power) function to transfer between the source and target +// color spaces, effectively resulting in a perceptual hard-knee joining two +// roughly linear sections. This preserves details at all scales, but can result +// in an image with a muted or dull appearance. +PL_API extern const struct pl_tone_map_function pl_tone_map_gamma; + +// Linearly stretches the input range to the output range, in PQ space. This +// will preserve all details accurately, but results in a significantly +// different average brightness. Can be used for inverse tone-mapping in +// addition to regular tone-mapping. +PL_API extern const struct pl_tone_map_function pl_tone_map_linear; + +// Like `pl_tone_map_linear`, but in linear light (instead of PQ). Works well +// for small range adjustments but may cause severe darkening when +// downconverting from e.g. 10k nits to SDR. +PL_API extern const struct pl_tone_map_function pl_tone_map_linear_light; + +// A list of built-in tone mapping functions, terminated by NULL +PL_API extern const struct pl_tone_map_function * const pl_tone_map_functions[]; +PL_API extern const int pl_num_tone_map_functions; // excluding trailing NULL + +// Find the tone mapping function with the given name, or NULL on failure. +PL_API const struct pl_tone_map_function *pl_find_tone_map_function(const char *name); + +// Deprecated alias, do not use +#define pl_tone_map_auto pl_tone_map_spline + +PL_API_END + +#endif // LIBPLACEBO_TONE_MAPPING_H_ diff --git a/src/include/libplacebo/utils/dav1d.h b/src/include/libplacebo/utils/dav1d.h new file mode 100644 index 0000000..ece97c5 --- /dev/null +++ b/src/include/libplacebo/utils/dav1d.h @@ -0,0 +1,129 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef LIBPLACEBO_DAV1D_H_ +#define LIBPLACEBO_DAV1D_H_ + +#include <libplacebo/gpu.h> +#include <libplacebo/utils/upload.h> +#include <dav1d/dav1d.h> + +#if defined(__cplusplus) && !defined(PL_DAV1D_IMPLEMENTATION) +# define PL_DAV1D_API +# define PL_DAV1D_IMPLEMENTATION 0 +# warning Remember to include this file with a PL_DAV1D_IMPLEMENTATION set to 1 in \ + C translation unit to provide implementation. Suppress this warning by \ + defining PL_DAV1D_IMPLEMENTATION to 0 in C++ files. +#elif !defined(PL_DAV1D_IMPLEMENTATION) +# define PL_DAV1D_API static inline +# define PL_DAV1D_IMPLEMENTATION 1 +#else +# define PL_DAV1D_API +#endif + +PL_API_BEGIN + +// Fill in the details of a `pl_frame` from a Dav1dPicture. This function will +// explicitly clear `out_frame`, setting all extra fields to 0. After this +// function returns, the only missing data is information related to the plane +// texture itself (`planes[N].texture`). +// +// Note: This will include all possible metadata, including HDR metadata and +// AV1 film grain data. Users should explicitly clear this out if undesired. +PL_DAV1D_API void pl_frame_from_dav1dpicture(struct pl_frame *out_frame, + const Dav1dPicture *picture); + +// Helper function to generate a `pl_color_space` struct from a Dav1dPicture. +// Useful to update the swapchain colorspace mode dynamically (e.g. for HDR). +PL_DAV1D_API void pl_swapchain_colors_from_dav1dpicture(struct pl_color_space *out_colors, + const Dav1dPicture *picture); + +struct pl_dav1d_upload_params { + // The picture to upload. Not modified unless `asynchronous` is true. + Dav1dPicture *picture; + + // If true, film grain present in `picture` will be exported to the + // `pl_frame` as well. This should be set to false unless the user has + // disabled `Dav1dSettings.apply_grain`. + bool film_grain; + + // If true, libplacebo will probe for the allocation metadata set by + // `pl_allocate_dav1dpicture`, and directly import the attached buffers + // (saving a memcpy in some cases). Has no effect if the Dav1dPicture was + // not allocated using `pl_allocate_dav1dpicture`. + // + // Note: When this is the case, `asynchronous` has no further effect - + // uploads from attached buffers are already asynchronous. + bool gpu_allocated; + + // If true, `picture` will be asynchronously uploaded and unref'd + // internally by libplacebo, and the struct passed by the user cleared to + // {0}. This is needed to avoid `memcpy` in some cases, so setting it to + // true is highly recommended wherever possible. + // + // Note: If `pl_upload_dav1dpicture` returns false, `picture` does not get + // unref'd. + bool asynchronous; +}; + +#define pl_dav1d_upload_params(...) (&(struct pl_dav1d_upload_params) { __VA_ARGS__ }) + +// Very high level helper function to take a `Dav1dPicture` and upload it to +// the GPU. Similar in spirit to `pl_upload_plane`, and the same notes apply. +// `tex` must be an array of 3 pointers of type `pl_tex`, each +// either pointing to a valid texture, or NULL. Returns whether successful. +PL_DAV1D_API bool pl_upload_dav1dpicture(pl_gpu gpu, + struct pl_frame *out_frame, pl_tex tex[3], + const struct pl_dav1d_upload_params *params); + +// Allocate a Dav1dPicture from persistently mapped buffers. This can be more +// efficient than regular Dav1dPictures, especially when using the synchronous +// `pl_upload_dav1dpicture`, or on platforms that don't support importing +// PL_HANDLE_HOST_PTR as buffers. Returns 0 or a negative DAV1D_ERR value. 
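+//
+// As a rough sketch (not part of the upstream header), wiring these two
+// functions into dav1d's picture allocator (with a thread-safe `pl_gpu gpu`,
+// see the note below) might look like:
+//
+//   Dav1dSettings settings;
+//   dav1d_default_settings(&settings);
+//   settings.allocator = (Dav1dPicAllocator) {
+//       .cookie = (void *) gpu,
+//       .alloc_picture_callback = pl_allocate_dav1dpicture,
+//       .release_picture_callback = pl_release_dav1dpicture,
+//   };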
+// +// Note: These may only be used directly as a Dav1dPicAllocator if the `gpu` +// passed as the value of `cookie` is `pl_gpu.limits.thread_safe`. Otherwise, +// the user must manually synchronize this to ensure it runs on the correct +// thread. +PL_DAV1D_API int pl_allocate_dav1dpicture(Dav1dPicture *picture, void *gpu); +PL_DAV1D_API void pl_release_dav1dpicture(Dav1dPicture *picture, void *gpu); + +// Mapping functions for the various Dav1dColor* enums. Note that these are not +// quite 1:1, and even for values that exist in both, the semantics sometimes +// differ. Some special cases (e.g. ICtCp, or XYZ) are handled differently in +// libplacebo and libdav1d, respectively. +PL_DAV1D_API enum pl_color_system pl_system_from_dav1d(enum Dav1dMatrixCoefficients mc); +PL_DAV1D_API enum Dav1dMatrixCoefficients pl_system_to_dav1d(enum pl_color_system sys); +PL_DAV1D_API enum pl_color_levels pl_levels_from_dav1d(int color_range); +PL_DAV1D_API int pl_levels_to_dav1d(enum pl_color_levels levels); +PL_DAV1D_API enum pl_color_primaries pl_primaries_from_dav1d(enum Dav1dColorPrimaries prim); +PL_DAV1D_API enum Dav1dColorPrimaries pl_primaries_to_dav1d(enum pl_color_primaries prim); +PL_DAV1D_API enum pl_color_transfer pl_transfer_from_dav1d(enum Dav1dTransferCharacteristics trc); +PL_DAV1D_API enum Dav1dTransferCharacteristics pl_transfer_to_dav1d(enum pl_color_transfer trc); +PL_DAV1D_API enum pl_chroma_location pl_chroma_from_dav1d(enum Dav1dChromaSamplePosition loc); +PL_DAV1D_API enum Dav1dChromaSamplePosition pl_chroma_to_dav1d(enum pl_chroma_location loc); + + +// Actual implementation, included as part of this header to avoid having +// a compile-time dependency on libdav1d. +#if PL_DAV1D_IMPLEMENTATION +# include <libplacebo/utils/dav1d_internal.h> +#endif + +PL_API_END + +#endif // LIBPLACEBO_DAV1D_H_ diff --git a/src/include/libplacebo/utils/dav1d_internal.h b/src/include/libplacebo/utils/dav1d_internal.h new file mode 100644 index 0000000..2e0512a --- /dev/null +++ b/src/include/libplacebo/utils/dav1d_internal.h @@ -0,0 +1,613 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef LIBPLACEBO_DAV1D_H_ +#error This header should be included as part of <libplacebo/utils/dav1d.h> +#elif defined(__cplusplus) +#error This header cannot be included from C++ define PL_DAV1D_IMPLEMENTATION appropriately +#else + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +PL_DAV1D_API enum pl_color_system pl_system_from_dav1d(enum Dav1dMatrixCoefficients mc) +{ + switch (mc) { + case DAV1D_MC_IDENTITY: return PL_COLOR_SYSTEM_RGB; // or XYZ (unlikely) + case DAV1D_MC_BT709: return PL_COLOR_SYSTEM_BT_709; + case DAV1D_MC_UNKNOWN: return PL_COLOR_SYSTEM_UNKNOWN; + case DAV1D_MC_FCC: return PL_COLOR_SYSTEM_UNKNOWN; // missing + case DAV1D_MC_BT470BG: return PL_COLOR_SYSTEM_BT_601; + case DAV1D_MC_BT601: return PL_COLOR_SYSTEM_BT_601; + case DAV1D_MC_SMPTE240: return PL_COLOR_SYSTEM_SMPTE_240M; + case DAV1D_MC_SMPTE_YCGCO: return PL_COLOR_SYSTEM_YCGCO; + case DAV1D_MC_BT2020_NCL: return PL_COLOR_SYSTEM_BT_2020_NC; + case DAV1D_MC_BT2020_CL: return PL_COLOR_SYSTEM_BT_2020_C; + case DAV1D_MC_SMPTE2085: return PL_COLOR_SYSTEM_UNKNOWN; // missing + case DAV1D_MC_CHROMAT_NCL: return PL_COLOR_SYSTEM_UNKNOWN; // missing + case DAV1D_MC_CHROMAT_CL: return PL_COLOR_SYSTEM_UNKNOWN; // missing + // Note: this colorspace is confused between PQ and HLG, which dav1d + // requires inferring from other sources, but libplacebo makes + // explicit. Default to PQ as it's the more common scenario. + case DAV1D_MC_ICTCP: return PL_COLOR_SYSTEM_BT_2100_PQ; + case DAV1D_MC_RESERVED: abort(); + } + + return PL_COLOR_SYSTEM_UNKNOWN; +} + +PL_DAV1D_API enum Dav1dMatrixCoefficients pl_system_to_dav1d(enum pl_color_system sys) +{ + switch (sys) { + case PL_COLOR_SYSTEM_UNKNOWN: return DAV1D_MC_UNKNOWN; + case PL_COLOR_SYSTEM_BT_601: return DAV1D_MC_BT601; + case PL_COLOR_SYSTEM_BT_709: return DAV1D_MC_BT709; + case PL_COLOR_SYSTEM_SMPTE_240M: return DAV1D_MC_SMPTE240; + case PL_COLOR_SYSTEM_BT_2020_NC: return DAV1D_MC_BT2020_NCL; + case PL_COLOR_SYSTEM_BT_2020_C: return DAV1D_MC_BT2020_CL; + case PL_COLOR_SYSTEM_BT_2100_PQ: return DAV1D_MC_ICTCP; + case PL_COLOR_SYSTEM_BT_2100_HLG: return DAV1D_MC_ICTCP; + case PL_COLOR_SYSTEM_DOLBYVISION: return DAV1D_MC_UNKNOWN; // missing + case PL_COLOR_SYSTEM_YCGCO: return DAV1D_MC_SMPTE_YCGCO; + case PL_COLOR_SYSTEM_RGB: return DAV1D_MC_IDENTITY; + case PL_COLOR_SYSTEM_XYZ: return DAV1D_MC_IDENTITY; + case PL_COLOR_SYSTEM_COUNT: abort(); + } + + return DAV1D_MC_UNKNOWN; +} + +PL_DAV1D_API enum pl_color_levels pl_levels_from_dav1d(int color_range) +{ + return color_range ? 
PL_COLOR_LEVELS_FULL : PL_COLOR_LEVELS_LIMITED; +} + +PL_DAV1D_API int pl_levels_to_dav1d(enum pl_color_levels levels) +{ + return levels == PL_COLOR_LEVELS_FULL; +} + +PL_DAV1D_API enum pl_color_primaries pl_primaries_from_dav1d(enum Dav1dColorPrimaries prim) +{ + switch (prim) { + case DAV1D_COLOR_PRI_BT709: return PL_COLOR_PRIM_BT_709; + case DAV1D_COLOR_PRI_UNKNOWN: return PL_COLOR_PRIM_UNKNOWN; + case DAV1D_COLOR_PRI_RESERVED: return PL_COLOR_PRIM_UNKNOWN; + case DAV1D_COLOR_PRI_BT470M: return PL_COLOR_PRIM_BT_470M; + case DAV1D_COLOR_PRI_BT470BG: return PL_COLOR_PRIM_BT_601_625; + case DAV1D_COLOR_PRI_BT601: return PL_COLOR_PRIM_BT_601_525; + case DAV1D_COLOR_PRI_SMPTE240: return PL_COLOR_PRIM_BT_601_525; + case DAV1D_COLOR_PRI_FILM: return PL_COLOR_PRIM_FILM_C; + case DAV1D_COLOR_PRI_BT2020: return PL_COLOR_PRIM_BT_2020; + case DAV1D_COLOR_PRI_XYZ: return PL_COLOR_PRIM_UNKNOWN; + case DAV1D_COLOR_PRI_SMPTE431: return PL_COLOR_PRIM_DCI_P3; + case DAV1D_COLOR_PRI_SMPTE432: return PL_COLOR_PRIM_DISPLAY_P3; + case DAV1D_COLOR_PRI_EBU3213: return PL_COLOR_PRIM_EBU_3213; + } + + return PL_COLOR_PRIM_UNKNOWN; +} + +PL_DAV1D_API enum Dav1dColorPrimaries pl_primaries_to_dav1d(enum pl_color_primaries prim) +{ + switch (prim) { + case PL_COLOR_PRIM_UNKNOWN: return DAV1D_COLOR_PRI_UNKNOWN; + case PL_COLOR_PRIM_BT_601_525: return DAV1D_COLOR_PRI_BT601; + case PL_COLOR_PRIM_BT_601_625: return DAV1D_COLOR_PRI_BT470BG; + case PL_COLOR_PRIM_BT_709: return DAV1D_COLOR_PRI_BT709; + case PL_COLOR_PRIM_BT_470M: return DAV1D_COLOR_PRI_BT470M; + case PL_COLOR_PRIM_EBU_3213: return DAV1D_COLOR_PRI_EBU3213; + case PL_COLOR_PRIM_BT_2020: return DAV1D_COLOR_PRI_BT2020; + case PL_COLOR_PRIM_APPLE: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_ADOBE: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_PRO_PHOTO: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_CIE_1931: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_DCI_P3: return DAV1D_COLOR_PRI_SMPTE431; + case PL_COLOR_PRIM_DISPLAY_P3: return DAV1D_COLOR_PRI_SMPTE432; + case PL_COLOR_PRIM_V_GAMUT: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_S_GAMUT: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_FILM_C: return DAV1D_COLOR_PRI_FILM; + case PL_COLOR_PRIM_ACES_AP0: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_ACES_AP1: return DAV1D_COLOR_PRI_UNKNOWN; // missing + case PL_COLOR_PRIM_COUNT: abort(); + } + + return DAV1D_COLOR_PRI_UNKNOWN; +} + +PL_DAV1D_API enum pl_color_transfer pl_transfer_from_dav1d(enum Dav1dTransferCharacteristics trc) +{ + switch (trc) { + case DAV1D_TRC_BT709: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case DAV1D_TRC_UNKNOWN: return PL_COLOR_TRC_UNKNOWN; + case DAV1D_TRC_BT470M: return PL_COLOR_TRC_GAMMA22; + case DAV1D_TRC_BT470BG: return PL_COLOR_TRC_GAMMA28; + case DAV1D_TRC_BT601: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case DAV1D_TRC_SMPTE240: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case DAV1D_TRC_LINEAR: return PL_COLOR_TRC_LINEAR; + case DAV1D_TRC_LOG100: return PL_COLOR_TRC_UNKNOWN; // missing + case DAV1D_TRC_LOG100_SQRT10: return PL_COLOR_TRC_UNKNOWN; // missing + case DAV1D_TRC_IEC61966: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case DAV1D_TRC_BT1361: return PL_COLOR_TRC_BT_1886; // ETOF != OETF + case DAV1D_TRC_SRGB: return PL_COLOR_TRC_SRGB; + case DAV1D_TRC_BT2020_10BIT: return PL_COLOR_TRC_BT_1886; // EOTF != OETF + case DAV1D_TRC_BT2020_12BIT: return PL_COLOR_TRC_BT_1886; // EOTF != 
OETF + case DAV1D_TRC_SMPTE2084: return PL_COLOR_TRC_PQ; + case DAV1D_TRC_SMPTE428: return PL_COLOR_TRC_ST428; + case DAV1D_TRC_HLG: return PL_COLOR_TRC_HLG; + case DAV1D_TRC_RESERVED: abort(); + } + + return PL_COLOR_TRC_UNKNOWN; +} + +PL_DAV1D_API enum Dav1dTransferCharacteristics pl_transfer_to_dav1d(enum pl_color_transfer trc) +{ + switch (trc) { + case PL_COLOR_TRC_UNKNOWN: return DAV1D_TRC_UNKNOWN; + case PL_COLOR_TRC_BT_1886: return DAV1D_TRC_BT709; // EOTF != OETF + case PL_COLOR_TRC_SRGB: return DAV1D_TRC_SRGB; + case PL_COLOR_TRC_LINEAR: return DAV1D_TRC_LINEAR; + case PL_COLOR_TRC_GAMMA18: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_GAMMA20: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_GAMMA22: return DAV1D_TRC_BT470M; + case PL_COLOR_TRC_GAMMA24: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_GAMMA26: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_GAMMA28: return DAV1D_TRC_BT470BG; + case PL_COLOR_TRC_ST428: return DAV1D_TRC_SMPTE428; + case PL_COLOR_TRC_PRO_PHOTO: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_PQ: return DAV1D_TRC_SMPTE2084; + case PL_COLOR_TRC_HLG: return DAV1D_TRC_HLG; + case PL_COLOR_TRC_V_LOG: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_S_LOG1: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_S_LOG2: return DAV1D_TRC_UNKNOWN; // missing + case PL_COLOR_TRC_COUNT: abort(); + } + + return DAV1D_TRC_UNKNOWN; +} + +PL_DAV1D_API enum pl_chroma_location pl_chroma_from_dav1d(enum Dav1dChromaSamplePosition loc) +{ + switch (loc) { + case DAV1D_CHR_UNKNOWN: return PL_CHROMA_UNKNOWN; + case DAV1D_CHR_VERTICAL: return PL_CHROMA_LEFT; + case DAV1D_CHR_COLOCATED: return PL_CHROMA_TOP_LEFT; + } + + return PL_CHROMA_UNKNOWN; +} + +PL_DAV1D_API enum Dav1dChromaSamplePosition pl_chroma_to_dav1d(enum pl_chroma_location loc) +{ + switch (loc) { + case PL_CHROMA_UNKNOWN: return DAV1D_CHR_UNKNOWN; + case PL_CHROMA_LEFT: return DAV1D_CHR_VERTICAL; + case PL_CHROMA_CENTER: return DAV1D_CHR_UNKNOWN; // missing + case PL_CHROMA_TOP_LEFT: return DAV1D_CHR_COLOCATED; + case PL_CHROMA_TOP_CENTER: return DAV1D_CHR_UNKNOWN; // missing + case PL_CHROMA_BOTTOM_LEFT: return DAV1D_CHR_UNKNOWN; // missing + case PL_CHROMA_BOTTOM_CENTER: return DAV1D_CHR_UNKNOWN; // missing + case PL_CHROMA_COUNT: abort(); + } + + return DAV1D_CHR_UNKNOWN; +} + +static inline float pl_fixed24_8(uint32_t n) +{ + return (float) n / (1 << 8); +} + +static inline float pl_fixed18_14(uint32_t n) +{ + return (float) n / (1 << 14); +} + +static inline float pl_fixed0_16(uint16_t n) +{ + return (float) n / (1 << 16); +} + +// Align to a power of 2 +#define PL_ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1)) + +PL_DAV1D_API void pl_frame_from_dav1dpicture(struct pl_frame *out, + const Dav1dPicture *picture) +{ + const Dav1dSequenceHeader *seq_hdr = picture->seq_hdr; + int num_planes; + switch (picture->p.layout) { + case DAV1D_PIXEL_LAYOUT_I400: + num_planes = 1; + break; + case DAV1D_PIXEL_LAYOUT_I420: + case DAV1D_PIXEL_LAYOUT_I422: + case DAV1D_PIXEL_LAYOUT_I444: + num_planes = 3; + break; + default: abort(); + } + + *out = (struct pl_frame) { + .num_planes = num_planes, + .planes = { + // Components are always in order, which makes things easy + { + .components = 1, + .component_mapping = {0}, + }, { + .components = 1, + .component_mapping = {1}, + }, { + .components = 1, + .component_mapping = {2}, + }, + }, + .crop = { + 0, 0, picture->p.w, picture->p.h, + }, + .color = { + .primaries = pl_primaries_from_dav1d(seq_hdr->pri), + 
.transfer = pl_transfer_from_dav1d(seq_hdr->trc), + }, + .repr = { + .sys = pl_system_from_dav1d(seq_hdr->mtrx), + .levels = pl_levels_from_dav1d(seq_hdr->color_range), + .bits = { + .sample_depth = PL_ALIGN2(picture->p.bpc, 8), + .color_depth = picture->p.bpc, + }, + }, + }; + + if (seq_hdr->mtrx == DAV1D_MC_ICTCP && seq_hdr->trc == DAV1D_TRC_HLG) { + + // dav1d makes no distinction between PQ and HLG ICtCp, so we need + // to manually fix it in the case that we have HLG ICtCp data. + out->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG; + + } else if (seq_hdr->mtrx == DAV1D_MC_IDENTITY && + seq_hdr->pri == DAV1D_COLOR_PRI_XYZ) + { + + // dav1d handles this as a special case, but doesn't provide an + // explicit flag for it either, so we have to resort to this ugly hack, + // even though CIE 1931 RGB *is* a valid thing in principle! + out->repr.sys= PL_COLOR_SYSTEM_XYZ; + + } else if (!out->repr.sys) { + + // PL_COLOR_SYSTEM_UNKNOWN maps to RGB, so hard-code this one + out->repr.sys = pl_color_system_guess_ycbcr(picture->p.w, picture->p.h); + } + + const Dav1dContentLightLevel *cll = picture->content_light; + if (cll) { + out->color.hdr.max_cll = cll->max_content_light_level; + out->color.hdr.max_fall = cll->max_frame_average_light_level; + } + + // This overrides the CLL values above, if both are present + const Dav1dMasteringDisplay *md = picture->mastering_display; + if (md) { + out->color.hdr.max_luma = pl_fixed24_8(md->max_luminance); + out->color.hdr.min_luma = pl_fixed18_14(md->min_luminance); + out->color.hdr.prim = (struct pl_raw_primaries) { + .red.x = pl_fixed0_16(md->primaries[0][0]), + .red.y = pl_fixed0_16(md->primaries[0][1]), + .green.x = pl_fixed0_16(md->primaries[1][0]), + .green.y = pl_fixed0_16(md->primaries[1][1]), + .blue.x = pl_fixed0_16(md->primaries[2][0]), + .blue.y = pl_fixed0_16(md->primaries[2][1]), + .white.x = pl_fixed0_16(md->white_point[0]), + .white.y = pl_fixed0_16(md->white_point[1]), + }; + } + + if (picture->frame_hdr->film_grain.present) { + const Dav1dFilmGrainData *fg = &picture->frame_hdr->film_grain.data; + out->film_grain = (struct pl_film_grain_data) { + .type = PL_FILM_GRAIN_AV1, + .seed = fg->seed, + .params.av1 = { + .num_points_y = fg->num_y_points, + .chroma_scaling_from_luma = fg->chroma_scaling_from_luma, + .num_points_uv = { fg->num_uv_points[0], fg->num_uv_points[1] }, + .scaling_shift = fg->scaling_shift, + .ar_coeff_lag = fg->ar_coeff_lag, + .ar_coeff_shift = (int) fg->ar_coeff_shift, + .grain_scale_shift = fg->grain_scale_shift, + .uv_mult = { fg->uv_mult[0], fg->uv_mult[1] }, + .uv_mult_luma = { fg->uv_luma_mult[0], fg->uv_luma_mult[1] }, + .uv_offset = { fg->uv_offset[0], fg->uv_offset[1] }, + .overlap = fg->overlap_flag, + }, + }; + + struct pl_av1_grain_data *av1 = &out->film_grain.params.av1; + memcpy(av1->points_y, fg->y_points, sizeof(av1->points_y)); + memcpy(av1->points_uv, fg->uv_points, sizeof(av1->points_uv)); + memcpy(av1->ar_coeffs_y, fg->ar_coeffs_y, sizeof(av1->ar_coeffs_y)); + memcpy(av1->ar_coeffs_uv[0], fg->ar_coeffs_uv[0], sizeof(av1->ar_coeffs_uv[0])); + memcpy(av1->ar_coeffs_uv[1], fg->ar_coeffs_uv[1], sizeof(av1->ar_coeffs_uv[1])); + } + + switch (picture->p.layout) { + case DAV1D_PIXEL_LAYOUT_I400: + case DAV1D_PIXEL_LAYOUT_I444: + break; + case DAV1D_PIXEL_LAYOUT_I420: + case DAV1D_PIXEL_LAYOUT_I422: + // Only set the chroma location for definitely subsampled images + pl_frame_set_chroma_location(out, pl_chroma_from_dav1d(seq_hdr->chr)); + break; + } +} + +PL_DAV1D_API void 
pl_swapchain_colors_from_dav1dpicture(struct pl_swapchain_colors *out_colors, + const Dav1dPicture *picture) +{ + struct pl_frame frame; + pl_frame_from_dav1dpicture(&frame, picture); + + *out_colors = (struct pl_swapchain_colors) { + .primaries = frame.color.primaries, + .transfer = frame.color.transfer, + }; + + const Dav1dContentLightLevel *cll = picture->content_light; + if (cll) { + out_colors->hdr.max_cll = cll->max_content_light_level; + out_colors->hdr.max_fall = cll->max_frame_average_light_level; + } + + const Dav1dMasteringDisplay *md = picture->mastering_display; + if (md) { + out_colors->hdr.min_luma = pl_fixed18_14(md->min_luminance); + out_colors->hdr.max_luma = pl_fixed24_8(md->max_luminance); + out_colors->hdr.prim.red.x = pl_fixed0_16(md->primaries[0][0]); + out_colors->hdr.prim.red.y = pl_fixed0_16(md->primaries[0][1]); + out_colors->hdr.prim.green.x = pl_fixed0_16(md->primaries[1][0]); + out_colors->hdr.prim.green.y = pl_fixed0_16(md->primaries[1][1]); + out_colors->hdr.prim.blue.x = pl_fixed0_16(md->primaries[2][0]); + out_colors->hdr.prim.blue.y = pl_fixed0_16(md->primaries[2][1]); + out_colors->hdr.prim.white.x = pl_fixed0_16(md->white_point[0]); + out_colors->hdr.prim.white.y = pl_fixed0_16(md->white_point[1]); + } +} + +#define PL_MAGIC0 0x2c2a1269 +#define PL_MAGIC1 0xc6d02577 + +struct pl_dav1dalloc { + uint32_t magic[2]; + pl_gpu gpu; + pl_buf buf; +}; + +struct pl_dav1dref { + Dav1dPicture pic; + uint8_t count; +}; + +static void pl_dav1dpicture_unref(void *priv) +{ + struct pl_dav1dref *ref = priv; + if (--ref->count == 0) { + dav1d_picture_unref(&ref->pic); + free(ref); + } +} + +PL_DAV1D_API bool pl_upload_dav1dpicture(pl_gpu gpu, + struct pl_frame *out, + pl_tex tex[3], + const struct pl_dav1d_upload_params *params) +{ + Dav1dPicture *pic = params->picture; + pl_frame_from_dav1dpicture(out, pic); + if (!params->film_grain) + out->film_grain.type = PL_FILM_GRAIN_NONE; + + const int bytes = (pic->p.bpc + 7) / 8; // rounded up + int sub_x = 0, sub_y = 0; + switch (pic->p.layout) { + case DAV1D_PIXEL_LAYOUT_I400: + case DAV1D_PIXEL_LAYOUT_I444: + break; + case DAV1D_PIXEL_LAYOUT_I420: + sub_x = sub_y = 1; + break; + case DAV1D_PIXEL_LAYOUT_I422: + sub_x = 1; + break; + } + + struct pl_plane_data data[3] = { + { + // Y plane + .type = PL_FMT_UNORM, + .width = pic->p.w, + .height = pic->p.h, + .pixel_stride = bytes, + .component_size = {bytes * 8}, + .component_map = {0}, + }, { + // U plane + .type = PL_FMT_UNORM, + .width = pic->p.w >> sub_x, + .height = pic->p.h >> sub_y, + .pixel_stride = bytes, + .component_size = {bytes * 8}, + .component_map = {1}, + }, { + // V plane + .type = PL_FMT_UNORM, + .width = pic->p.w >> sub_x, + .height = pic->p.h >> sub_y, + .pixel_stride = bytes, + .component_size = {bytes * 8}, + .component_map = {2}, + }, + }; + + pl_buf buf = NULL; + struct pl_dav1dalloc *alloc = params->gpu_allocated ? pic->allocator_data : NULL; + struct pl_dav1dref *ref = NULL; + + if (alloc && alloc->magic[0] == PL_MAGIC0 && alloc->magic[1] == PL_MAGIC1) { + // Re-use pre-allocated buffers directly + assert(alloc->gpu == gpu); + buf = alloc->buf; + } else if (params->asynchronous && gpu->limits.callbacks) { + ref = malloc(sizeof(*ref)); + if (!ref) + return false; + memcpy(&ref->pic, pic, sizeof(Dav1dPicture)); + ref->count = out->num_planes; + } + + for (int p = 0; p < out->num_planes; p++) { + ptrdiff_t stride = p > 0 ? 
pic->stride[1] : pic->stride[0]; + if (stride < 0) { + data[p].pixels = (uint8_t *) pic->data[p] + stride * (data[p].height - 1); + data[p].row_stride = -stride; + out->planes[p].flipped = true; + } else { + data[p].pixels = pic->data[p]; + data[p].row_stride = stride; + } + + if (buf) { + data[p].buf = buf; + data[p].buf_offset = (uintptr_t) data[p].pixels - (uintptr_t) buf->data; + data[p].pixels = NULL; + } else if (ref) { + data[p].priv = ref; + data[p].callback = pl_dav1dpicture_unref; + } + + if (!pl_upload_plane(gpu, &out->planes[p], &tex[p], &data[p])) { + free(ref); + return false; + } + } + + if (params->asynchronous) { + if (ref) { + *pic = (Dav1dPicture) {0}; + } else { + dav1d_picture_unref(pic); + } + } + + return true; +} + +PL_DAV1D_API int pl_allocate_dav1dpicture(Dav1dPicture *p, void *cookie) +{ + pl_gpu gpu = cookie; + if (!gpu->limits.max_mapped_size || !gpu->limits.host_cached || + !gpu->limits.buf_transfer) + { + return DAV1D_ERR(ENOTSUP); + } + + // Copied from dav1d_default_picture_alloc + const int hbd = p->p.bpc > 8; + const int aligned_w = PL_ALIGN2(p->p.w, 128); + const int aligned_h = PL_ALIGN2(p->p.h, 128); + const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400; + const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444; + p->stride[0] = aligned_w << hbd; + p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0; + + // Align strides up to multiples of the GPU performance hints + p->stride[0] = PL_ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_pitch); + p->stride[1] = PL_ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_pitch); + + // Aligning offsets to 4 also implicitly aligns to the texel alignment (1 or 2) + size_t off_align = PL_ALIGN2(gpu->limits.align_tex_xfer_offset, 4); + const size_t y_sz = PL_ALIGN2(p->stride[0] * aligned_h, off_align); + const size_t uv_sz = PL_ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align); + + // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment, + // even in the case that the driver gives us insane alignments + const size_t pic_size = y_sz + 2 * uv_sz; + const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4; + + // Validate size limitations + if (total_size > gpu->limits.max_mapped_size) + return DAV1D_ERR(ENOMEM); + + pl_buf buf = pl_buf_create(gpu, pl_buf_params( + .size = total_size, + .host_mapped = true, + .memory_type = PL_BUF_MEM_HOST, + )); + + if (!buf) + return DAV1D_ERR(ENOMEM); + + struct pl_dav1dalloc *alloc = malloc(sizeof(struct pl_dav1dalloc)); + if (!alloc) { + pl_buf_destroy(gpu, &buf); + return DAV1D_ERR(ENOMEM); + } + + *alloc = (struct pl_dav1dalloc) { + .magic = { PL_MAGIC0, PL_MAGIC1 }, + .gpu = gpu, + .buf = buf, + }; + + assert(buf->data); + uintptr_t base = (uintptr_t) buf->data, data[3]; + data[0] = PL_ALIGN2(base, DAV1D_PICTURE_ALIGNMENT); + data[1] = PL_ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT); + data[2] = PL_ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT); + + p->allocator_data = alloc; + p->data[0] = (void *) data[0]; + p->data[1] = (void *) data[1]; + p->data[2] = (void *) data[2]; + return 0; +} + +PL_DAV1D_API void pl_release_dav1dpicture(Dav1dPicture *p, void *cookie) +{ + struct pl_dav1dalloc *alloc = p->allocator_data; + if (!alloc) + return; + + assert(alloc->magic[0] == PL_MAGIC0); + assert(alloc->magic[1] == PL_MAGIC1); + assert(alloc->gpu == cookie); + pl_buf_destroy(alloc->gpu, &alloc->buf); + free(alloc); + + p->data[0] = p->data[1] = p->data[2] = p->allocator_data 
= NULL; +} + +#undef PL_ALIGN2 +#undef PL_MAGIC0 +#undef PL_MAGIC1 + +#endif // LIBPLACEBO_DAV1D_H_ diff --git a/src/include/libplacebo/utils/dolbyvision.h b/src/include/libplacebo/utils/dolbyvision.h new file mode 100644 index 0000000..6d4d72e --- /dev/null +++ b/src/include/libplacebo/utils/dolbyvision.h @@ -0,0 +1,34 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_DOLBYVISION_H_ +#define LIBPLACEBO_DOLBYVISION_H_ + +#include <libplacebo/colorspace.h> + +PL_API_BEGIN + +// Parses the Dolby Vision RPU, and sets the `pl_hdr_metadata` dynamic +// brightness metadata fields accordingly. +// +// Note: requires `PL_HAVE_LIBDOVI` to be defined, no-op otherwise. +PL_API void pl_hdr_metadata_from_dovi_rpu(struct pl_hdr_metadata *out, + const uint8_t *buf, size_t size); + +PL_API_END + +#endif // LIBPLACEBO_DOLBYVISION_H_ diff --git a/src/include/libplacebo/utils/frame_queue.h b/src/include/libplacebo/utils/frame_queue.h new file mode 100644 index 0000000..2a9c90c --- /dev/null +++ b/src/include/libplacebo/utils/frame_queue.h @@ -0,0 +1,230 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_FRAME_QUEUE_H +#define LIBPLACEBO_FRAME_QUEUE_H + +#include <libplacebo/renderer.h> +#include <libplacebo/shaders/deinterlacing.h> + +PL_API_BEGIN + +// An abstraction layer for automatically turning a conceptual stream of +// (frame, pts) pairs, as emitted by a decoder or filter graph, into a +// `pl_frame_mix` suitable for `pl_render_image_mix`. +// +// This API ensures that minimal work is performed (e.g. only mapping frames +// that are actually required), while also satisfying the requirements +// of any configured frame mixer. +// +// Thread-safety: Safe +typedef struct pl_queue_t *pl_queue; + +enum pl_queue_status { + PL_QUEUE_OK, // success + PL_QUEUE_EOF, // no more frames are available + PL_QUEUE_MORE, // more frames needed, but not (yet) available + PL_QUEUE_ERR = -1, // some unknown error occurred while retrieving frames +}; + +struct pl_source_frame { + // The frame's presentation timestamp, in seconds relative to the first + // frame. These must be monotonically increasing for subsequent frames. 
+ // To implement a discontinuous jump, users must explicitly reset the + // frame queue with `pl_queue_reset` and restart from PTS 0.0. + double pts; + + // The frame's duration. This is not needed in normal scenarios, as the + // FPS can be inferred from the `pts` values themselves. Providing it + // only helps initialize the value for initial frames, which can smooth + // out the interpolation weights. Its use is also highly recommended + // when displaying interlaced frames. (Optional) + float duration; + + // If set to something other than PL_FIELD_NONE, this source frame is + // marked as interlaced. It will be split up into two separate frames + // internally, and exported to the resulting `pl_frame_mix` as a pair of + // fields, referencing the corresponding previous and next frames. The + // first field will have the same PTS as `pts`, and the second field will + // be inserted at the timestamp `pts + duration/2`. + // + // Note: As a result of FPS estimates being unreliable around streams with + // mixed FPS (or when mixing interlaced and progressive frames), it's + // highly recommended to always specify a valid `duration` for interlaced + // frames. + enum pl_field first_field; + + // Abstract frame data itself. To allow mapping frames only when they're + // actually needed, frames use a lazy representation. The provided + // callbacks will be invoked to interface with it. + void *frame_data; + + // This will be called to map the frame to the GPU, only if needed. + // + // `tex` is a pointer to an array of 4 texture objects (or NULL), which + // *may* serve as backing storage for the texture being mapped. These are + // intended to be recreated by `map`, e.g. using `pl_tex_recreate` or + // `pl_upload_plane` as appropriate. They will be managed internally by + // `pl_queue` and destroyed at some unspecified future point in time. + // + // Note: If `map` fails, it will not be retried, nor will `discard` be run. + // The user should clean up state in this case. + bool (*map)(pl_gpu gpu, pl_tex *tex, const struct pl_source_frame *src, + struct pl_frame *out_frame); + + // If present, this will be called on frames that are done being used by + // `pl_queue`. This may be useful to e.g. unmap textures backed by external + // APIs such as hardware decoders. (Optional) + void (*unmap)(pl_gpu gpu, struct pl_frame *frame, const struct pl_source_frame *src); + + // This function will be called for frames that are deemed unnecessary + // (e.g. never became visible) and should instead be cleanly freed. + // (Optional) + void (*discard)(const struct pl_source_frame *src); +}; + +// Create a new, empty frame queue. +// +// It's highly recommended to fully render a single frame with `pts == 0.0`, +// and flush the GPU pipeline with `pl_gpu_finish`, prior to starting the timed +// playback loop. +PL_API pl_queue pl_queue_create(pl_gpu gpu); +PL_API void pl_queue_destroy(pl_queue *queue); + +// Explicitly clear the queue. This is essentially equivalent to destroying +// and recreating the queue, but preserves any internal memory allocations. +// +// Note: Calling `pl_queue_reset` may block, if another thread is currently +// blocked on a different `pl_queue_*` call. +PL_API void pl_queue_reset(pl_queue queue); + +// Explicitly push a frame. This is an alternative way to feed the frame queue +// with incoming frames, the other method being the asynchronous callback +// specified as `pl_queue_params.get_frame`. 
Both methods may be used
+// simultaneously, although providing `get_frame` is recommended since it
+// avoids the risk of the queue underrunning.
+//
+// When no more frames are available, call this function with `frame == NULL`
+// to indicate EOF and begin draining the frame queue.
+PL_API void pl_queue_push(pl_queue queue, const struct pl_source_frame *frame);
+
+// Variant of `pl_queue_push` that blocks while the queue is judged
+// (internally) to be "too full". This is useful for asynchronous decoder loops
+// in order to prevent the queue from exhausting available RAM if frames are
+// decoded significantly faster than they're displayed.
+//
+// The given `timeout` parameter specifies how long to wait before giving up,
+// in nanoseconds. Returns false if this timeout was reached.
+PL_API bool pl_queue_push_block(pl_queue queue, uint64_t timeout,
+                                const struct pl_source_frame *frame);
+
+struct pl_queue_params {
+    // The PTS of the frame that will be rendered. This should be set to the
+    // timestamp (in seconds) of the next vsync, relative to the initial frame.
+    //
+    // These must be monotonically increasing. To implement a discontinuous
+    // jump, users must explicitly reset the frame queue with `pl_queue_reset`
+    // and restart from PTS 0.0.
+    double pts;
+
+    // The radius of the configured mixer. This should be set to the value
+    // as returned by `pl_frame_mix_radius`.
+    float radius;
+
+    // The estimated duration of a vsync, in seconds. This will only be used as
+    // a hint, the true value will be estimated by comparing `pts` timestamps
+    // between calls to `pl_queue_update`. (Optional)
+    float vsync_duration;
+
+    // If the difference between the (estimated) vsync duration and the
+    // (measured) frame duration is smaller than this threshold, silently
+    // disable interpolation and switch to ZOH semantics instead.
+    //
+    // For example, a value of 0.01 allows the FPS to differ by up to 1%
+    // without being interpolated. Note that this will result in a continuous
+    // phase drift unless also compensated for by the user, which will
+    // eventually result in a dropped or duplicated frame. (Though this can
+    // be preferable to seeing that same phase drift result in a temporally
+    // smeared image)
+    float interpolation_threshold;
+
+    // Specifies how long `pl_queue_update` will wait for frames to become
+    // available, in nanoseconds, before giving up and returning with
+    // PL_QUEUE_MORE.
+    //
+    // If `get_frame` is provided, this value is ignored by `pl_queue` and
+    // should instead be interpreted by the provided callback.
+    uint64_t timeout;
+
+    // This callback will be used to pull new frames from the decoder. It may
+    // block if needed. The user is responsible for setting appropriate time
+    // limits and/or returning and interpreting PL_QUEUE_MORE sensibly.
+    //
+    // Providing this callback is entirely optional. Users can instead choose
+    // to manually feed the frame queue with new frames using `pl_queue_push`.
+    enum pl_queue_status (*get_frame)(struct pl_source_frame *out_frame,
+                                      const struct pl_queue_params *params);
+    void *priv;
+};
+
+#define pl_queue_params(...) (&(struct pl_queue_params) { __VA_ARGS__ })
+
+// Advance the frame queue's internal state to the target timestamp. Any frames
+// which are no longer needed (i.e. too far in the past) are automatically
+// unmapped and evicted. Any future frames which are needed to fill the queue
+// must either have been pushed in advance, or will be requested using the
+// provided `get_frame` callback. 
If you call this on `out_mix == NULL`, the +// queue state will advance, but no frames will be mapped. +// +// This function may return with PL_QUEUE_MORE, in which case the user may wish +// to ensure more frames are available and then re-run this function with the +// same parameters. In this case, `out_mix` is still written to, but it may be +// incomplete (or even contain no frames at all). Additionally, when the source +// contains interlaced frames (see `pl_source_frame.first_field`), this +// function may return with PL_QUEUE_MORE if a frame is missing references to +// a future frame. +// +// The resulting mix of frames in `out_mix` will represent the neighbourhood of +// the target timestamp, and can be passed to `pl_render_image_mix` as-is. +// +// Note: `out_mix` will only remain valid until the next call to +// `pl_queue_update` or `pl_queue_reset`. +PL_API enum pl_queue_status pl_queue_update(pl_queue queue, struct pl_frame_mix *out_mix, + const struct pl_queue_params *params); + +// Returns a pl_queue's internal estimates for FPS and VPS (vsyncs per second). +// Returns 0.0 if no estimate is available. +PL_API float pl_queue_estimate_fps(pl_queue queue); +PL_API float pl_queue_estimate_vps(pl_queue queue); + +// Returns the number of frames currently contained in a pl_queue. +PL_API int pl_queue_num_frames(pl_queue queue); + +// Inspect the contents of the Nth queued frame. Returns false if `idx` is +// out of range. +// +// Warning: No guarantee is made to ensure validity of `out->frame_data` +// after this call. In particular, pl_queue_* calls made from another thread +// may call `discard()` on the frame in question. The user bears responsibility +// to avoid accessing `out->frame_data` in a multi-threaded scenario unless +// an external guarantee can be made that the frame won't be dequeued until +// it is done being used by the user. +PL_API bool pl_queue_peek(pl_queue queue, int idx, struct pl_source_frame *out); + +PL_API_END + +#endif // LIBPLACEBO_FRAME_QUEUE_H diff --git a/src/include/libplacebo/utils/libav.h b/src/include/libplacebo/utils/libav.h new file mode 100644 index 0000000..91f3dd8 --- /dev/null +++ b/src/include/libplacebo/utils/libav.h @@ -0,0 +1,284 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_LIBAV_H_ +#define LIBPLACEBO_LIBAV_H_ + +#include <libplacebo/config.h> +#include <libplacebo/gpu.h> +#include <libplacebo/shaders/deinterlacing.h> +#include <libplacebo/utils/upload.h> + +#if defined(__cplusplus) && !defined(PL_LIBAV_IMPLEMENTATION) +# define PL_LIBAV_API +# define PL_LIBAV_IMPLEMENTATION 0 +# warning Remember to include this file with a PL_LIBAV_IMPLEMENTATION set to 1 in \ + C translation unit to provide implementation. Suppress this warning by \ + defining PL_LIBAV_IMPLEMENTATION to 0 in C++ files. 
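+//
+// For illustration (a minimal sketch, not a complete build recipe): a C++
+// project would typically compile one plain C translation unit such as
+//
+//     #define PL_LIBAV_IMPLEMENTATION 1
+//     #include <libplacebo/utils/libav.h>
+//
+// to provide the definitions, while C++ translation units define
+// PL_LIBAV_IMPLEMENTATION to 0 before including this header so that only
+// declarations are emitted there.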
+#elif !defined(PL_LIBAV_IMPLEMENTATION) +# define PL_LIBAV_API static inline +# define PL_LIBAV_IMPLEMENTATION 1 +#else +# define PL_LIBAV_API +#endif + +PL_API_BEGIN + +#include <libavformat/avformat.h> +#include <libavutil/frame.h> +#include <libavutil/version.h> +#include <libavcodec/avcodec.h> + +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 16, 100) && defined(PL_HAVE_DOVI) +# define PL_HAVE_LAV_DOLBY_VISION +# include <libavutil/dovi_meta.h> +#endif + +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(56, 61, 100) +# define PL_HAVE_LAV_FILM_GRAIN +# include <libavutil/film_grain_params.h> +#endif + +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(56, 25, 100) +# define PL_HAVE_LAV_HDR +# include <libavutil/hdr_dynamic_metadata.h> +# include <libavutil/mastering_display_metadata.h> +#endif + +//------------------------------------------------------------------------ +// Important note: For support for AVVkFrame, which depends on <vulkan.h>, +// users *SHOULD* include <vulkan/vulkan.h> manually before this header. +//------------------------------------------------------------------------ + + +// Fill in the details of a `pl_frame` from an AVFrame. This function will +// explicitly clear `out_frame`, setting all extra fields to 0. After this +// function returns, the only missing data is information related to the plane +// texture itself (`planes[N].texture`), as well as any overlays (e.g. +// subtitles). +// +// Note: If the AVFrame contains an embedded ICC profile or H.274 film grain +// metadata, the resulting `out_image->profile` will reference this pointer, +// meaning that in general, the `pl_frame` is only guaranteed to be valid as +// long as the AVFrame is not freed. +// +// Note: This will ignore Dolby Vision metadata by default (to avoid leaking +// memory), either switch to pl_map_avframe_ex or do it manually using +// pl_map_dovi_metadata. +PL_LIBAV_API void pl_frame_from_avframe(struct pl_frame *out_frame, const AVFrame *frame); + +// Deprecated aliases for backwards compatibility +#define pl_image_from_avframe pl_frame_from_avframe +#define pl_target_from_avframe pl_frame_from_avframe + +// Copy extra metadata from an AVStream to a pl_frame. This should be called +// after `pl_frame_from_avframe` or `pl_map_avframe` (respectively), and sets +// metadata associated with stream-level side data. This is needed because +// FFmpeg rather annoyingly does not propagate stream-level metadata to frames. +PL_LIBAV_API void pl_frame_copy_stream_props(struct pl_frame *out_frame, + const AVStream *stream); + +#ifdef PL_HAVE_LAV_HDR +struct pl_av_hdr_metadata { + // All fields are optional and may be passed as `NULL`. + const AVMasteringDisplayMetadata *mdm; + const AVContentLightMetadata *clm; + const AVDynamicHDRPlus *dhp; +}; + +// Helper function to update a `pl_hdr_metadata` struct from HDR10/HDR10+ +// metadata in the FFmpeg format. Unspecified/invalid elements will be left +// uninitialized in `out`. +PL_LIBAV_API void pl_map_hdr_metadata(struct pl_hdr_metadata *out, + const struct pl_av_hdr_metadata *metadata); +#endif + +#ifdef PL_HAVE_LAV_DOLBY_VISION +// Helper function to map Dolby Vision metadata from the FFmpeg format. +PL_LIBAV_API void pl_map_dovi_metadata(struct pl_dovi_metadata *out, + const AVDOVIMetadata *metadata); + +// Helper function to map Dolby Vision metadata from the FFmpeg format +// to `pl_dovi_metadata`, and adds it to the `pl_frame`. +// The `pl_frame` colorspace fields and HDR struct are also updated with +// values from the `AVDOVIMetadata`. 
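+//
+// Illustrative sketch (not part of this header), assuming `frame` has already
+// been set up via pl_frame_from_avframe() and `sd` holds the frame's
+// AV_FRAME_DATA_DOVI_METADATA side data:
+//
+//     struct pl_dovi_metadata dovi; // caller-owned storage, must outlive `frame`
+//     pl_frame_map_avdovi_metadata(&frame, &dovi, (const AVDOVIMetadata *) sd->data);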
+// +// Note: The `pl_dovi_metadata` must be allocated externally. +// Also, currently the metadata is only used if the `AVDOVIRpuDataHeader` +// `disable_residual_flag` field is not zero and can be checked before allocating. +PL_LIBAV_API void pl_frame_map_avdovi_metadata(struct pl_frame *out_frame, + struct pl_dovi_metadata *dovi, + const AVDOVIMetadata *metadata); +#endif + +// Helper function to test if a pixfmt would be supported by the GPU. +// Essentially, this can be used to check if `pl_map_avframe` would work for a +// given AVPixelFormat, without actually uploading or allocating anything. +PL_LIBAV_API bool pl_test_pixfmt(pl_gpu gpu, enum AVPixelFormat pixfmt); + +// Variant of `pl_test_pixfmt` that also tests for the given capabilities +// being present. Note that in the presence of hardware accelerated frames, +// this cannot be tested without frame-specific information (i.e. swformat), +// but in practice this should be a non-issue as GPU-native hwformats will +// probably be fully supported. +PL_LIBAV_API bool pl_test_pixfmt_caps(pl_gpu gpu, enum AVPixelFormat pixfmt, + enum pl_fmt_caps caps); + +// Like `pl_frame_from_avframe`, but the texture pointers are also initialized +// to ensure they have the correct size and format to match the AVframe. +// Similar in spirit to `pl_recreate_plane`, and the same notes apply. `tex` +// must be an array of 4 pointers of type `pl_tex`, each either +// pointing to a valid texture, or NULL. Returns whether successful. +PL_LIBAV_API bool pl_frame_recreate_from_avframe(pl_gpu gpu, struct pl_frame *out_frame, + pl_tex tex[4], const AVFrame *frame); + +struct pl_avframe_params { + // The AVFrame to map. Required. + const AVFrame *frame; + + // Backing textures for frame data. Required for all non-hwdec formats. + // This must point to an array of four valid textures (or NULL entries). + // + // Note: Not cleaned up by `pl_unmap_avframe`. The intent is for users to + // re-use this texture array for subsequent frames, to avoid texture + // creation/destruction overhead. + pl_tex *tex; + + // Also map Dolby Vision metadata (if supported). Note that this also + // overrides the colorimetry metadata (forces BT.2020+PQ). + bool map_dovi; +}; + +#define PL_AVFRAME_DEFAULTS \ + .map_dovi = true, + +#define pl_avframe_params(...) (&(struct pl_avframe_params) { PL_AVFRAME_DEFAULTS __VA_ARGS__ }) + +// Very high level helper function to take an `AVFrame` and map it to the GPU. +// The resulting `pl_frame` remains valid until `pl_unmap_avframe` is called, +// which must be called at some point to clean up state. The `AVFrame` is +// automatically ref'd and unref'd if needed. Returns whether successful. +// +// Note: `out_frame->user_data` points to a privately managed opaque struct +// and must not be touched by the user. +PL_LIBAV_API bool pl_map_avframe_ex(pl_gpu gpu, struct pl_frame *out_frame, + const struct pl_avframe_params *params); +PL_LIBAV_API void pl_unmap_avframe(pl_gpu gpu, struct pl_frame *frame); + +// Backwards compatibility with previous versions of this API. +PL_LIBAV_API bool pl_map_avframe(pl_gpu gpu, struct pl_frame *out_frame, + pl_tex tex[4], const AVFrame *avframe); + +// Return the AVFrame* that a pl_frame was mapped from (via pl_map_avframe_ex) +// Note: This reference is attached to the `pl_frame` and will get freed by +// pl_unmap_avframe. +PL_LIBAV_API AVFrame *pl_get_mapped_avframe(const struct pl_frame *frame); + +// Download the texture contents of a `pl_frame` back to a corresponding +// AVFrame. 
Blocks until completion. +// +// Note: This function performs minimal verification, so incorrect usage will +// likely result in broken frames. Use `pl_frame_recreate_from_avframe` to +// ensure matching formats. +PL_LIBAV_API bool pl_download_avframe(pl_gpu gpu, + const struct pl_frame *frame, + AVFrame *out_frame); + +// Helper functions to update the colorimetry data in an AVFrame based on +// the values specified in the given color space / color repr / profile. +// +// Note: These functions can and will allocate AVFrame side data if needed, +// in particular to encode HDR metadata in `space.hdr`. +PL_LIBAV_API void pl_avframe_set_color(AVFrame *frame, struct pl_color_space space); +PL_LIBAV_API void pl_avframe_set_repr(AVFrame *frame, struct pl_color_repr repr); +PL_LIBAV_API void pl_avframe_set_profile(AVFrame *frame, struct pl_icc_profile profile); + +// Map an AVPixelFormat to an array of pl_plane_data structs. The array must +// have at least `av_pix_fmt_count_planes(fmt)` elements, but never more than +// 4. This function leaves `width`, `height` and `row_stride`, as well as the +// data pointers, uninitialized. +// +// If `bits` is non-NULL, this function will attempt aligning the resulting +// `pl_plane_data` struct for optimal compatibility, placing the resulting +// `pl_bit_depth` metadata into `bits`. +// +// Returns the number of plane structs written to, or 0 on error. +// +// Note: This function is usually clumsier to use than the higher-level +// functions above, but it might have some fringe use cases, for example if +// the user wants to replace the data buffers by `pl_buf` references in the +// `pl_plane_data` before uploading it to the GPU. +PL_LIBAV_API int pl_plane_data_from_pixfmt(struct pl_plane_data data[4], + struct pl_bit_encoding *bits, + enum AVPixelFormat pix_fmt); + +// Callback for AVCodecContext.get_buffer2 that allocates memory from +// persistently mapped buffers. This can be more efficient than regular +// system memory, especially on platforms that don't support importing +// PL_HANDLE_HOST_PTR as buffers. +// +// Note: `avctx->opaque` must be a pointer that *points* to the GPU instance. +// That is, it should have type `pl_gpu *`. +PL_LIBAV_API int pl_get_buffer2(AVCodecContext *avctx, AVFrame *pic, int flags); + +// Mapping functions for the various libavutil enums. Note that these are not +// quite 1:1, and even for values that exist in both, the semantics sometimes +// differ. Some special cases (e.g. ICtCp, or XYZ) are handled differently in +// libplacebo and libavutil, respectively. +// +// Because of this, it's generally recommended to avoid these and instead use +// helpers like `pl_frame_from_avframe`, which contain extra logic to patch +// through all of the special cases. 
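+//
+// As a rough sketch (not part of this header): instead of assembling the
+// representation by hand from these enums, e.g.
+//
+//     out.repr.sys    = pl_system_from_av(avframe->colorspace);
+//     out.repr.levels = pl_levels_from_av(avframe->color_range);
+//
+// it is usually preferable to call pl_frame_from_avframe(&out, avframe),
+// which additionally patches up the XYZ, ICtCp and RGB special cases.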
+PL_LIBAV_API enum pl_color_system pl_system_from_av(enum AVColorSpace spc); +PL_LIBAV_API enum AVColorSpace pl_system_to_av(enum pl_color_system sys); +PL_LIBAV_API enum pl_color_levels pl_levels_from_av(enum AVColorRange range); +PL_LIBAV_API enum AVColorRange pl_levels_to_av(enum pl_color_levels levels); +PL_LIBAV_API enum pl_color_primaries pl_primaries_from_av(enum AVColorPrimaries prim); +PL_LIBAV_API enum AVColorPrimaries pl_primaries_to_av(enum pl_color_primaries prim); +PL_LIBAV_API enum pl_color_transfer pl_transfer_from_av(enum AVColorTransferCharacteristic trc); +PL_LIBAV_API enum AVColorTransferCharacteristic pl_transfer_to_av(enum pl_color_transfer trc); +PL_LIBAV_API enum pl_chroma_location pl_chroma_from_av(enum AVChromaLocation loc); +PL_LIBAV_API enum AVChromaLocation pl_chroma_to_av(enum pl_chroma_location loc); + +// Helper function to generate a `pl_color_space` struct from an AVFrame. +PL_LIBAV_API void pl_color_space_from_avframe(struct pl_color_space *out_csp, + const AVFrame *frame); + +// Helper function to pick the right `pl_field` value for an AVFrame. +PL_LIBAV_API enum pl_field pl_field_from_avframe(const AVFrame *frame); + +#ifdef PL_HAVE_LAV_FILM_GRAIN +// Fill in film grain parameters from an AVFilmGrainParams. +// +// Note: The resulting struct will only remain valid as long as the +// `AVFilmGrainParams` remains valid. +PL_LIBAV_API void pl_film_grain_from_av(struct pl_film_grain_data *out_data, + const AVFilmGrainParams *fgp); +#endif + +// Deprecated alias for backwards compatibility +#define pl_swapchain_colors_from_avframe pl_color_space_from_avframe + +// Actual implementation, included as part of this header to avoid having +// a compile-time dependency on libavutil. +#if PL_LIBAV_IMPLEMENTATION +# include <libplacebo/utils/libav_internal.h> +#endif + +PL_API_END + +#endif // LIBPLACEBO_LIBAV_H_ diff --git a/src/include/libplacebo/utils/libav_internal.h b/src/include/libplacebo/utils/libav_internal.h new file mode 100644 index 0000000..4c269e5 --- /dev/null +++ b/src/include/libplacebo/utils/libav_internal.h @@ -0,0 +1,1482 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */
+
+#ifndef LIBPLACEBO_LIBAV_H_
+#error This header should be included as part of <libplacebo/utils/libav.h>
+#elif defined(__cplusplus)
+#error This header cannot be included from C++; define PL_LIBAV_IMPLEMENTATION appropriately
+#else
+
+#include <assert.h>
+
+#include <libplacebo/utils/dolbyvision.h>
+
+#include <libavutil/hwcontext.h>
+#include <libavutil/hwcontext_drm.h>
+#include <libavutil/imgutils.h>
+#include <libavutil/pixdesc.h>
+#include <libavutil/display.h>
+#include <libavcodec/version.h>
+
+// Try importing <vulkan.h> dynamically if it wasn't already
+#if !defined(VK_API_VERSION_1_2) && defined(__has_include)
+# if __has_include(<vulkan/vulkan.h>)
+#  include <vulkan/vulkan.h>
+# endif
+#endif
+
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 8, 100) && \
+    defined(PL_HAVE_VULKAN) && defined(VK_API_VERSION_1_2)
+# define PL_HAVE_LAV_VULKAN
+# include <libavutil/hwcontext_vulkan.h>
+# include <libplacebo/vulkan.h>
+# if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 11, 100)
+#  define PL_HAVE_LAV_VULKAN_V2
+# endif
+#endif
+
+PL_LIBAV_API enum pl_color_system pl_system_from_av(enum AVColorSpace spc)
+{
+    switch (spc) {
+    case AVCOL_SPC_RGB:                return PL_COLOR_SYSTEM_RGB;
+    case AVCOL_SPC_BT709:              return PL_COLOR_SYSTEM_BT_709;
+    case AVCOL_SPC_UNSPECIFIED:        return PL_COLOR_SYSTEM_UNKNOWN;
+    case AVCOL_SPC_RESERVED:           return PL_COLOR_SYSTEM_UNKNOWN;
+    case AVCOL_SPC_FCC:                return PL_COLOR_SYSTEM_UNKNOWN; // missing
+    case AVCOL_SPC_BT470BG:            return PL_COLOR_SYSTEM_BT_601;
+    case AVCOL_SPC_SMPTE170M:          return PL_COLOR_SYSTEM_BT_601;
+    case AVCOL_SPC_SMPTE240M:          return PL_COLOR_SYSTEM_SMPTE_240M;
+    case AVCOL_SPC_YCGCO:              return PL_COLOR_SYSTEM_YCGCO;
+    case AVCOL_SPC_BT2020_NCL:         return PL_COLOR_SYSTEM_BT_2020_NC;
+    case AVCOL_SPC_BT2020_CL:          return PL_COLOR_SYSTEM_BT_2020_C;
+    case AVCOL_SPC_SMPTE2085:          return PL_COLOR_SYSTEM_UNKNOWN; // missing
+    case AVCOL_SPC_CHROMA_DERIVED_NCL: return PL_COLOR_SYSTEM_UNKNOWN; // missing
+    case AVCOL_SPC_CHROMA_DERIVED_CL:  return PL_COLOR_SYSTEM_UNKNOWN; // missing
+    // Note: this colorspace is confused between PQ and HLG, which libav*
+    // requires inferring from other sources, but libplacebo makes explicit.
+    // Default to PQ as it's the more common scenario. 
+ case AVCOL_SPC_ICTCP: return PL_COLOR_SYSTEM_BT_2100_PQ; + case AVCOL_SPC_NB: return PL_COLOR_SYSTEM_COUNT; + } + + return PL_COLOR_SYSTEM_UNKNOWN; +} + +PL_LIBAV_API enum AVColorSpace pl_system_to_av(enum pl_color_system sys) +{ + switch (sys) { + case PL_COLOR_SYSTEM_UNKNOWN: return AVCOL_SPC_UNSPECIFIED; + case PL_COLOR_SYSTEM_BT_601: return AVCOL_SPC_SMPTE170M; + case PL_COLOR_SYSTEM_BT_709: return AVCOL_SPC_BT709; + case PL_COLOR_SYSTEM_SMPTE_240M: return AVCOL_SPC_SMPTE240M; + case PL_COLOR_SYSTEM_BT_2020_NC: return AVCOL_SPC_BT2020_NCL; + case PL_COLOR_SYSTEM_BT_2020_C: return AVCOL_SPC_BT2020_CL; + case PL_COLOR_SYSTEM_BT_2100_PQ: return AVCOL_SPC_ICTCP; + case PL_COLOR_SYSTEM_BT_2100_HLG: return AVCOL_SPC_ICTCP; + case PL_COLOR_SYSTEM_DOLBYVISION: return AVCOL_SPC_UNSPECIFIED; // missing + case PL_COLOR_SYSTEM_YCGCO: return AVCOL_SPC_YCGCO; + case PL_COLOR_SYSTEM_RGB: return AVCOL_SPC_RGB; + case PL_COLOR_SYSTEM_XYZ: return AVCOL_SPC_UNSPECIFIED; // handled differently + case PL_COLOR_SYSTEM_COUNT: return AVCOL_SPC_NB; + } + + return AVCOL_SPC_UNSPECIFIED; +} + +PL_LIBAV_API enum pl_color_levels pl_levels_from_av(enum AVColorRange range) +{ + switch (range) { + case AVCOL_RANGE_UNSPECIFIED: return PL_COLOR_LEVELS_UNKNOWN; + case AVCOL_RANGE_MPEG: return PL_COLOR_LEVELS_LIMITED; + case AVCOL_RANGE_JPEG: return PL_COLOR_LEVELS_FULL; + case AVCOL_RANGE_NB: return PL_COLOR_LEVELS_COUNT; + } + + return PL_COLOR_LEVELS_UNKNOWN; +} + +PL_LIBAV_API enum AVColorRange pl_levels_to_av(enum pl_color_levels levels) +{ + switch (levels) { + case PL_COLOR_LEVELS_UNKNOWN: return AVCOL_RANGE_UNSPECIFIED; + case PL_COLOR_LEVELS_LIMITED: return AVCOL_RANGE_MPEG; + case PL_COLOR_LEVELS_FULL: return AVCOL_RANGE_JPEG; + case PL_COLOR_LEVELS_COUNT: return AVCOL_RANGE_NB; + } + + return AVCOL_RANGE_UNSPECIFIED; +} + +PL_LIBAV_API enum pl_color_primaries pl_primaries_from_av(enum AVColorPrimaries prim) +{ + switch (prim) { + case AVCOL_PRI_RESERVED0: return PL_COLOR_PRIM_UNKNOWN; + case AVCOL_PRI_BT709: return PL_COLOR_PRIM_BT_709; + case AVCOL_PRI_UNSPECIFIED: return PL_COLOR_PRIM_UNKNOWN; + case AVCOL_PRI_RESERVED: return PL_COLOR_PRIM_UNKNOWN; + case AVCOL_PRI_BT470M: return PL_COLOR_PRIM_BT_470M; + case AVCOL_PRI_BT470BG: return PL_COLOR_PRIM_BT_601_625; + case AVCOL_PRI_SMPTE170M: return PL_COLOR_PRIM_BT_601_525; + case AVCOL_PRI_SMPTE240M: return PL_COLOR_PRIM_BT_601_525; + case AVCOL_PRI_FILM: return PL_COLOR_PRIM_FILM_C; + case AVCOL_PRI_BT2020: return PL_COLOR_PRIM_BT_2020; + case AVCOL_PRI_SMPTE428: return PL_COLOR_PRIM_CIE_1931; + case AVCOL_PRI_SMPTE431: return PL_COLOR_PRIM_DCI_P3; + case AVCOL_PRI_SMPTE432: return PL_COLOR_PRIM_DISPLAY_P3; + case AVCOL_PRI_JEDEC_P22: return PL_COLOR_PRIM_EBU_3213; + case AVCOL_PRI_NB: return PL_COLOR_PRIM_COUNT; + } + + return PL_COLOR_PRIM_UNKNOWN; +} + +PL_LIBAV_API enum AVColorPrimaries pl_primaries_to_av(enum pl_color_primaries prim) +{ + switch (prim) { + case PL_COLOR_PRIM_UNKNOWN: return AVCOL_PRI_UNSPECIFIED; + case PL_COLOR_PRIM_BT_601_525: return AVCOL_PRI_SMPTE170M; + case PL_COLOR_PRIM_BT_601_625: return AVCOL_PRI_BT470BG; + case PL_COLOR_PRIM_BT_709: return AVCOL_PRI_BT709; + case PL_COLOR_PRIM_BT_470M: return AVCOL_PRI_BT470M; + case PL_COLOR_PRIM_EBU_3213: return AVCOL_PRI_JEDEC_P22; + case PL_COLOR_PRIM_BT_2020: return AVCOL_PRI_BT2020; + case PL_COLOR_PRIM_APPLE: return AVCOL_PRI_UNSPECIFIED; // missing + case PL_COLOR_PRIM_ADOBE: return AVCOL_PRI_UNSPECIFIED; // missing + case PL_COLOR_PRIM_PRO_PHOTO: return AVCOL_PRI_UNSPECIFIED; // 
missing
+    case PL_COLOR_PRIM_CIE_1931:    return AVCOL_PRI_SMPTE428;
+    case PL_COLOR_PRIM_DCI_P3:      return AVCOL_PRI_SMPTE431;
+    case PL_COLOR_PRIM_DISPLAY_P3:  return AVCOL_PRI_SMPTE432;
+    case PL_COLOR_PRIM_V_GAMUT:     return AVCOL_PRI_UNSPECIFIED; // missing
+    case PL_COLOR_PRIM_S_GAMUT:     return AVCOL_PRI_UNSPECIFIED; // missing
+    case PL_COLOR_PRIM_FILM_C:      return AVCOL_PRI_FILM;
+    case PL_COLOR_PRIM_ACES_AP0:    return AVCOL_PRI_UNSPECIFIED; // missing
+    case PL_COLOR_PRIM_ACES_AP1:    return AVCOL_PRI_UNSPECIFIED; // missing
+    case PL_COLOR_PRIM_COUNT:       return AVCOL_PRI_NB;
+    }
+
+    return AVCOL_PRI_UNSPECIFIED;
+}
+
+PL_LIBAV_API enum pl_color_transfer pl_transfer_from_av(enum AVColorTransferCharacteristic trc)
+{
+    switch (trc) {
+    case AVCOL_TRC_RESERVED0:      return PL_COLOR_TRC_UNKNOWN;
+    case AVCOL_TRC_BT709:          return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+    case AVCOL_TRC_UNSPECIFIED:    return PL_COLOR_TRC_UNKNOWN;
+    case AVCOL_TRC_RESERVED:       return PL_COLOR_TRC_UNKNOWN;
+    case AVCOL_TRC_GAMMA22:        return PL_COLOR_TRC_GAMMA22;
+    case AVCOL_TRC_GAMMA28:        return PL_COLOR_TRC_GAMMA28;
+    case AVCOL_TRC_SMPTE170M:      return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+    case AVCOL_TRC_SMPTE240M:      return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+    case AVCOL_TRC_LINEAR:         return PL_COLOR_TRC_LINEAR;
+    case AVCOL_TRC_LOG:            return PL_COLOR_TRC_UNKNOWN; // missing
+    case AVCOL_TRC_LOG_SQRT:       return PL_COLOR_TRC_UNKNOWN; // missing
+    case AVCOL_TRC_IEC61966_2_4:   return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+    case AVCOL_TRC_BT1361_ECG:     return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+    case AVCOL_TRC_IEC61966_2_1:   return PL_COLOR_TRC_SRGB;
+    case AVCOL_TRC_BT2020_10:      return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+    case AVCOL_TRC_BT2020_12:      return PL_COLOR_TRC_BT_1886; // EOTF != OETF
+    case AVCOL_TRC_SMPTE2084:      return PL_COLOR_TRC_PQ;
+    case AVCOL_TRC_SMPTE428:       return PL_COLOR_TRC_ST428;
+    case AVCOL_TRC_ARIB_STD_B67:   return PL_COLOR_TRC_HLG;
+    case AVCOL_TRC_NB:             return PL_COLOR_TRC_COUNT;
+    }
+
+    return PL_COLOR_TRC_UNKNOWN;
+}
+
+PL_LIBAV_API enum AVColorTransferCharacteristic pl_transfer_to_av(enum pl_color_transfer trc)
+{
+    switch (trc) {
+    case PL_COLOR_TRC_UNKNOWN:    return AVCOL_TRC_UNSPECIFIED;
+    case PL_COLOR_TRC_BT_1886:    return AVCOL_TRC_BT709; // EOTF != OETF
+    case PL_COLOR_TRC_SRGB:       return AVCOL_TRC_IEC61966_2_1;
+    case PL_COLOR_TRC_LINEAR:     return AVCOL_TRC_LINEAR;
+    case PL_COLOR_TRC_GAMMA18:    return AVCOL_TRC_UNSPECIFIED; // missing
+    case PL_COLOR_TRC_GAMMA20:    return AVCOL_TRC_UNSPECIFIED; // missing
+    case PL_COLOR_TRC_GAMMA22:    return AVCOL_TRC_GAMMA22;
+    case PL_COLOR_TRC_GAMMA24:    return AVCOL_TRC_UNSPECIFIED; // missing
+    case PL_COLOR_TRC_GAMMA26:    return AVCOL_TRC_UNSPECIFIED; // missing
+    case PL_COLOR_TRC_GAMMA28:    return AVCOL_TRC_GAMMA28;
+    case PL_COLOR_TRC_ST428:      return AVCOL_TRC_SMPTE428;
+    case PL_COLOR_TRC_PRO_PHOTO:  return AVCOL_TRC_UNSPECIFIED; // missing
+    case PL_COLOR_TRC_PQ:         return AVCOL_TRC_SMPTE2084;
+    case PL_COLOR_TRC_HLG:        return AVCOL_TRC_ARIB_STD_B67;
+    case PL_COLOR_TRC_V_LOG:      return AVCOL_TRC_UNSPECIFIED; // missing
+    case PL_COLOR_TRC_S_LOG1:     return AVCOL_TRC_UNSPECIFIED; // missing
+    case PL_COLOR_TRC_S_LOG2:     return AVCOL_TRC_UNSPECIFIED; // missing
+    case PL_COLOR_TRC_COUNT:      return AVCOL_TRC_NB;
+    }
+
+    return AVCOL_TRC_UNSPECIFIED;
+}
+
+PL_LIBAV_API enum pl_chroma_location pl_chroma_from_av(enum AVChromaLocation loc)
+{
+    switch (loc) {
+    case AVCHROMA_LOC_UNSPECIFIED: return PL_CHROMA_UNKNOWN;
+    case AVCHROMA_LOC_LEFT:        return PL_CHROMA_LEFT;
+    case AVCHROMA_LOC_CENTER:      return PL_CHROMA_CENTER;
+    
case AVCHROMA_LOC_TOPLEFT: return PL_CHROMA_TOP_LEFT; + case AVCHROMA_LOC_TOP: return PL_CHROMA_TOP_CENTER; + case AVCHROMA_LOC_BOTTOMLEFT: return PL_CHROMA_BOTTOM_LEFT; + case AVCHROMA_LOC_BOTTOM: return PL_CHROMA_BOTTOM_CENTER; + case AVCHROMA_LOC_NB: return PL_CHROMA_COUNT; + } + + return PL_CHROMA_UNKNOWN; +} + +PL_LIBAV_API enum AVChromaLocation pl_chroma_to_av(enum pl_chroma_location loc) +{ + switch (loc) { + case PL_CHROMA_UNKNOWN: return AVCHROMA_LOC_UNSPECIFIED; + case PL_CHROMA_LEFT: return AVCHROMA_LOC_LEFT; + case PL_CHROMA_CENTER: return AVCHROMA_LOC_CENTER; + case PL_CHROMA_TOP_LEFT: return AVCHROMA_LOC_TOPLEFT; + case PL_CHROMA_TOP_CENTER: return AVCHROMA_LOC_TOP; + case PL_CHROMA_BOTTOM_LEFT: return AVCHROMA_LOC_BOTTOMLEFT; + case PL_CHROMA_BOTTOM_CENTER: return AVCHROMA_LOC_BOTTOM; + case PL_CHROMA_COUNT: return AVCHROMA_LOC_NB; + } + + return AVCHROMA_LOC_UNSPECIFIED; +} + +#ifdef PL_HAVE_LAV_HDR +PL_LIBAV_API void pl_map_hdr_metadata(struct pl_hdr_metadata *out, + const struct pl_av_hdr_metadata *data) +{ + if (data->mdm) { + if (data->mdm->has_luminance) { + out->max_luma = av_q2d(data->mdm->max_luminance); + out->min_luma = av_q2d(data->mdm->min_luminance); + if (out->max_luma < 10.0 || out->min_luma >= out->max_luma) + out->max_luma = out->min_luma = 0; /* sanity */ + } + if (data->mdm->has_primaries) { + out->prim = (struct pl_raw_primaries) { + .red.x = av_q2d(data->mdm->display_primaries[0][0]), + .red.y = av_q2d(data->mdm->display_primaries[0][1]), + .green.x = av_q2d(data->mdm->display_primaries[1][0]), + .green.y = av_q2d(data->mdm->display_primaries[1][1]), + .blue.x = av_q2d(data->mdm->display_primaries[2][0]), + .blue.y = av_q2d(data->mdm->display_primaries[2][1]), + .white.x = av_q2d(data->mdm->white_point[0]), + .white.y = av_q2d(data->mdm->white_point[1]), + }; + } + } + + if (data->clm) { + out->max_cll = data->clm->MaxCLL; + out->max_fall = data->clm->MaxFALL; + } + + if (data->dhp && data->dhp->application_version < 2) { + float hist_max = 0; + const AVHDRPlusColorTransformParams *pars = &data->dhp->params[0]; + assert(data->dhp->num_windows > 0); + out->scene_max[0] = 10000 * av_q2d(pars->maxscl[0]); + out->scene_max[1] = 10000 * av_q2d(pars->maxscl[1]); + out->scene_max[2] = 10000 * av_q2d(pars->maxscl[2]); + out->scene_avg = 10000 * av_q2d(pars->average_maxrgb); + + // Calculate largest value from histogram to use as fallback for clips + // with missing MaxSCL information. Note that this may end up picking + // the "reserved" value at the 5% percentile, which in practice appears + // to track the brightest pixel in the scene. 
+ for (int i = 0; i < pars->num_distribution_maxrgb_percentiles; i++) { + float hist_val = av_q2d(pars->distribution_maxrgb[i].percentile); + if (hist_val > hist_max) + hist_max = hist_val; + } + hist_max *= 10000; + if (!out->scene_max[0]) + out->scene_max[0] = hist_max; + if (!out->scene_max[1]) + out->scene_max[1] = hist_max; + if (!out->scene_max[2]) + out->scene_max[2] = hist_max; + + if (pars->tone_mapping_flag == 1) { + out->ootf.target_luma = av_q2d(data->dhp->targeted_system_display_maximum_luminance); + out->ootf.knee_x = av_q2d(pars->knee_point_x); + out->ootf.knee_y = av_q2d(pars->knee_point_y); + assert(pars->num_bezier_curve_anchors < 16); + for (int i = 0; i < pars->num_bezier_curve_anchors; i++) + out->ootf.anchors[i] = av_q2d(pars->bezier_curve_anchors[i]); + out->ootf.num_anchors = pars->num_bezier_curve_anchors; + } + } +} +#endif // PL_HAVE_LAV_HDR + +static inline void *pl_get_side_data_raw(const AVFrame *frame, + enum AVFrameSideDataType type) +{ + const AVFrameSideData *sd = av_frame_get_side_data(frame, type); + return sd ? (void *) sd->data : NULL; +} + +PL_LIBAV_API void pl_color_space_from_avframe(struct pl_color_space *out_csp, + const AVFrame *frame) +{ + *out_csp = (struct pl_color_space) { + .primaries = pl_primaries_from_av(frame->color_primaries), + .transfer = pl_transfer_from_av(frame->color_trc), + }; + +#ifdef PL_HAVE_LAV_HDR + pl_map_hdr_metadata(&out_csp->hdr, &(struct pl_av_hdr_metadata) { + .mdm = pl_get_side_data_raw(frame, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA), + .clm = pl_get_side_data_raw(frame, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL), + .dhp = pl_get_side_data_raw(frame, AV_FRAME_DATA_DYNAMIC_HDR_PLUS), + }); +#endif +} + +PL_LIBAV_API enum pl_field pl_field_from_avframe(const AVFrame *frame) +{ +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 7, 100) + if (!frame || !(frame->flags & AV_FRAME_FLAG_INTERLACED)) + return PL_FIELD_NONE; + return (frame->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) + ? PL_FIELD_TOP : PL_FIELD_BOTTOM; +#else + if (!frame || !frame->interlaced_frame) + return PL_FIELD_NONE; + return frame->top_field_first ? 
PL_FIELD_TOP : PL_FIELD_BOTTOM; +#endif +} + +#ifdef PL_HAVE_LAV_FILM_GRAIN +PL_LIBAV_API void pl_film_grain_from_av(struct pl_film_grain_data *out_data, + const AVFilmGrainParams *fgp) +{ + out_data->seed = fgp->seed; + + switch (fgp->type) { + case AV_FILM_GRAIN_PARAMS_NONE: break; + case AV_FILM_GRAIN_PARAMS_AV1: { + const AVFilmGrainAOMParams *src = &fgp->codec.aom; + struct pl_av1_grain_data *dst = &out_data->params.av1; + out_data->type = PL_FILM_GRAIN_AV1; + *dst = (struct pl_av1_grain_data) { + .num_points_y = src->num_y_points, + .chroma_scaling_from_luma = src->chroma_scaling_from_luma, + .num_points_uv = { src->num_uv_points[0], src->num_uv_points[1] }, + .scaling_shift = src->scaling_shift, + .ar_coeff_lag = src->ar_coeff_lag, + .ar_coeff_shift = src->ar_coeff_shift, + .grain_scale_shift = src->grain_scale_shift, + .uv_mult = { src->uv_mult[0], src->uv_mult[1] }, + .uv_mult_luma = { src->uv_mult_luma[0], src->uv_mult_luma[1] }, + .uv_offset = { src->uv_offset[0], src->uv_offset[1] }, + .overlap = src->overlap_flag, + }; + + assert(sizeof(dst->ar_coeffs_uv) == sizeof(src->ar_coeffs_uv)); + memcpy(dst->points_y, src->y_points, sizeof(dst->points_y)); + memcpy(dst->points_uv, src->uv_points, sizeof(dst->points_uv)); + memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(dst->ar_coeffs_y)); + memcpy(dst->ar_coeffs_uv, src->ar_coeffs_uv, sizeof(dst->ar_coeffs_uv)); + break; + } +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 2, 100) + case AV_FILM_GRAIN_PARAMS_H274: { + const AVFilmGrainH274Params *src = &fgp->codec.h274; + struct pl_h274_grain_data *dst = &out_data->params.h274; + out_data->type = PL_FILM_GRAIN_H274; + *dst = (struct pl_h274_grain_data) { + .model_id = src->model_id, + .blending_mode_id = src->blending_mode_id, + .log2_scale_factor = src->log2_scale_factor, + .component_model_present = { + src->component_model_present[0], + src->component_model_present[1], + src->component_model_present[2], + }, + .intensity_interval_lower_bound = { + src->intensity_interval_lower_bound[0], + src->intensity_interval_lower_bound[1], + src->intensity_interval_lower_bound[2], + }, + .intensity_interval_upper_bound = { + src->intensity_interval_upper_bound[0], + src->intensity_interval_upper_bound[1], + src->intensity_interval_upper_bound[2], + }, + .comp_model_value = { + src->comp_model_value[0], + src->comp_model_value[1], + src->comp_model_value[2], + }, + }; + memcpy(dst->num_intensity_intervals, src->num_intensity_intervals, + sizeof(dst->num_intensity_intervals)); + memcpy(dst->num_model_values, src->num_model_values, + sizeof(dst->num_model_values)); + break; + } +#endif + } +} +#endif // PL_HAVE_LAV_FILM_GRAIN + +static inline int pl_plane_data_num_comps(const struct pl_plane_data *data) +{ + for (int i = 0; i < 4; i++) { + if (data->component_size[i] == 0) + return i; + } + + return 4; +} + +PL_LIBAV_API int pl_plane_data_from_pixfmt(struct pl_plane_data out_data[4], + struct pl_bit_encoding *out_bits, + enum AVPixelFormat pix_fmt) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt); + int planes = av_pix_fmt_count_planes(pix_fmt); + struct pl_plane_data aligned_data[4]; + struct pl_bit_encoding bits; + bool first; + if (!desc || planes < 0) // e.g. 
AV_PIX_FMT_NONE + return 0; + + if (desc->flags & AV_PIX_FMT_FLAG_BITSTREAM) { + // Bitstream formats will most likely never be supported + return 0; + } + + if (desc->flags & AV_PIX_FMT_FLAG_PAL) { + // Palette formats are (currently) not supported + return 0; + } + + if (desc->flags & AV_PIX_FMT_FLAG_BAYER) { + // Bayer format don't have valid `desc->offset` values, so we can't + // use `pl_plane_data_from_mask` on them. + return 0; + } + + if (desc->nb_components == 0 || desc->nb_components > 4) { + // Bogus components, possibly fake/virtual/hwaccel format? + return 0; + } + + if (planes > 4) + return 0; // This shouldn't ever happen + + // Fill in the details for each plane + for (int p = 0; p < planes; p++) { + struct pl_plane_data *data = &out_data[p]; + int size[4] = {0}; + int shift[4] = {0}; + data->swapped = desc->flags & AV_PIX_FMT_FLAG_BE; + data->type = (desc->flags & AV_PIX_FMT_FLAG_FLOAT) + ? PL_FMT_FLOAT + : PL_FMT_UNORM; + + data->pixel_stride = 0; + + for (int c = 0; c < desc->nb_components; c++) { + const AVComponentDescriptor *comp = &desc->comp[c]; + if (comp->plane != p) + continue; + if (data->swapped && comp->shift) { + // We cannot naively handle packed big endian formats because + // swapping the words also swaps the component order, so just + // exit out as a stupid safety measure + return 0; + } + + size[c] = comp->depth; + shift[c] = comp->shift + comp->offset * 8; + + if (data->pixel_stride && (int) data->pixel_stride != comp->step) { + // Pixel format contains components with different pixel stride + // (e.g. packed YUYV), this is currently not supported + return 0; + } + data->pixel_stride = comp->step; + } + + pl_plane_data_from_comps(data, size, shift); + } + + if (!out_bits) + return planes; + + // Attempt aligning all of the planes for optimum compatibility + first = true; + for (int p = 0; p < planes; p++) { + aligned_data[p] = out_data[p]; + + // Planes with only an alpha component should be ignored + if (pl_plane_data_num_comps(&aligned_data[p]) == 1 && + aligned_data[p].component_map[0] == PL_CHANNEL_A) + { + continue; + } + + if (!pl_plane_data_align(&aligned_data[p], &bits)) + goto misaligned; + + if (first) { + *out_bits = bits; + first = false; + } else { + if (!pl_bit_encoding_equal(&bits, out_bits)) + goto misaligned; + } + } + + // Overwrite the planes by their aligned versions + for (int p = 0; p < planes; p++) + out_data[p] = aligned_data[p]; + + return planes; + +misaligned: + *out_bits = (struct pl_bit_encoding) {0}; + return planes; +} + +PL_LIBAV_API bool pl_test_pixfmt_caps(pl_gpu gpu, enum AVPixelFormat pixfmt, + enum pl_fmt_caps caps) +{ + struct pl_bit_encoding bits; + struct pl_plane_data data[4]; + pl_fmt fmt; + int planes; + + switch (pixfmt) { + case AV_PIX_FMT_DRM_PRIME: + case AV_PIX_FMT_VAAPI: + return gpu->import_caps.tex & PL_HANDLE_DMA_BUF; + +#ifdef PL_HAVE_LAV_VULKAN + case AV_PIX_FMT_VULKAN: + return pl_vulkan_get(gpu); +#endif + + default: break; + } + + planes = pl_plane_data_from_pixfmt(data, &bits, pixfmt); + if (!planes) + return false; + + for (int i = 0; i < planes; i++) { + data[i].row_stride = 0; + fmt = pl_plane_find_fmt(gpu, NULL, &data[i]); + if (!fmt || (fmt->caps & caps) != caps) + return false; + + } + + return true; +} + +PL_LIBAV_API bool pl_test_pixfmt(pl_gpu gpu, enum AVPixelFormat pixfmt) +{ + return pl_test_pixfmt_caps(gpu, pixfmt, 0); +} + +PL_LIBAV_API void pl_avframe_set_color(AVFrame *frame, struct pl_color_space csp) +{ + const AVFrameSideData *sd; + (void) sd; + + frame->color_primaries = 
pl_primaries_to_av(csp.primaries); + frame->color_trc = pl_transfer_to_av(csp.transfer); + +#ifdef PL_HAVE_LAV_HDR + if (csp.hdr.max_cll) { + sd = av_frame_get_side_data(frame, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL); + if (!sd) { + sd = av_frame_new_side_data(frame, AV_FRAME_DATA_CONTENT_LIGHT_LEVEL, + sizeof(AVContentLightMetadata)); + } + + if (sd) { + AVContentLightMetadata *clm = (AVContentLightMetadata *) sd->data; + *clm = (AVContentLightMetadata) { + .MaxCLL = csp.hdr.max_cll, + .MaxFALL = csp.hdr.max_fall, + }; + } + } + + if (csp.hdr.max_luma || csp.hdr.prim.red.x) { + sd = av_frame_get_side_data(frame, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA); + if (!sd) { + sd = av_frame_new_side_data(frame, AV_FRAME_DATA_MASTERING_DISPLAY_METADATA, + sizeof(AVMasteringDisplayMetadata)); + } + + if (sd) { + AVMasteringDisplayMetadata *mdm = (AVMasteringDisplayMetadata *) sd->data; + *mdm = (AVMasteringDisplayMetadata) { + .max_luminance = av_d2q(csp.hdr.max_luma, 1000000), + .min_luminance = av_d2q(csp.hdr.min_luma, 1000000), + .has_luminance = !!csp.hdr.max_luma, + .display_primaries = { + { + av_d2q(csp.hdr.prim.red.x, 1000000), + av_d2q(csp.hdr.prim.red.y, 1000000), + }, { + av_d2q(csp.hdr.prim.green.x, 1000000), + av_d2q(csp.hdr.prim.green.y, 1000000), + }, { + av_d2q(csp.hdr.prim.blue.x, 1000000), + av_d2q(csp.hdr.prim.blue.y, 1000000), + } + }, + .white_point = { + av_d2q(csp.hdr.prim.white.x, 1000000), + av_d2q(csp.hdr.prim.white.y, 1000000), + }, + .has_primaries = !!csp.hdr.prim.red.x, + }; + } + } +#endif // PL_HAVE_LAV_HDR +} + +PL_LIBAV_API void pl_avframe_set_repr(AVFrame *frame, struct pl_color_repr repr) +{ + frame->colorspace = pl_system_to_av(repr.sys); + frame->color_range = pl_levels_to_av(repr.levels); + + // No real way to map repr.bits, the image format already has to match +} + +PL_LIBAV_API void pl_avframe_set_profile(AVFrame *frame, struct pl_icc_profile profile) +{ + const AVFrameSideData *sd; + av_frame_remove_side_data(frame, AV_FRAME_DATA_ICC_PROFILE); + + if (!profile.len) + return; + + sd = av_frame_new_side_data(frame, AV_FRAME_DATA_ICC_PROFILE, profile.len); + memcpy(sd->data, profile.data, profile.len); +} + +PL_LIBAV_API void pl_frame_from_avframe(struct pl_frame *out, + const AVFrame *frame) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); + int planes = av_pix_fmt_count_planes(frame->format); + const AVFrameSideData *sd; + assert(desc); + + if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) { + const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data; + desc = av_pix_fmt_desc_get(hwfc->sw_format); + planes = av_pix_fmt_count_planes(hwfc->sw_format); + } + + // This should never fail, and there's nothing really useful we can do in + // this failure case anyway, since this is a `void` function. + assert(planes <= 4); + + *out = (struct pl_frame) { + .num_planes = planes, + .crop = { + .x0 = frame->crop_left, + .y0 = frame->crop_top, + .x1 = frame->width - frame->crop_right, + .y1 = frame->height - frame->crop_bottom, + }, + .repr = { + .sys = pl_system_from_av(frame->colorspace), + .levels = pl_levels_from_av(frame->color_range), + .alpha = (desc->flags & AV_PIX_FMT_FLAG_ALPHA) + ? PL_ALPHA_INDEPENDENT + : PL_ALPHA_UNKNOWN, + + // For sake of simplicity, just use the first component's depth as + // the authoritative color depth for the whole image. Usually, this + // will be overwritten by more specific information when using e.g. + // `pl_map_avframe`, but for the sake of e.g. 
users wishing to map + // hwaccel frames manually, this is a good default. + .bits.color_depth = desc->comp[0].depth, + }, + }; + + pl_color_space_from_avframe(&out->color, frame); + + if (frame->colorspace == AVCOL_SPC_ICTCP && + frame->color_trc == AVCOL_TRC_ARIB_STD_B67) + { + // libav* makes no distinction between PQ and HLG ICtCp, so we need + // to manually fix it in the case that we have HLG ICtCp data. + out->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG; + + } else if (strncmp(desc->name, "xyz", 3) == 0) { + + // libav* handles this as a special case, but doesn't provide an + // explicit flag for it either, so we have to resort to this ugly + // hack... + out->repr.sys = PL_COLOR_SYSTEM_XYZ; + + } else if (desc->flags & AV_PIX_FMT_FLAG_RGB) { + + out->repr.sys = PL_COLOR_SYSTEM_RGB; + out->repr.levels = PL_COLOR_LEVELS_FULL; // libav* ignores levels for RGB + + } else if (!pl_color_system_is_ycbcr_like(out->repr.sys)) { + // libav* likes leaving this as UNKNOWN (or even RGB) for YCbCr frames, + // which confuses libplacebo since we infer UNKNOWN as RGB. To get + // around this, explicitly infer a suitable colorspace. + out->repr.sys = pl_color_system_guess_ycbcr(frame->width, frame->height); + } + + if ((sd = av_frame_get_side_data(frame, AV_FRAME_DATA_ICC_PROFILE))) { + out->profile = (struct pl_icc_profile) { + .data = sd->data, + .len = sd->size, + }; + + // Needed to ensure profile uniqueness + pl_icc_profile_compute_signature(&out->profile); + } + + if ((sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DISPLAYMATRIX))) { + double rot = av_display_rotation_get((const int32_t *) sd->data); + out->rotation = pl_rotation_normalize(4.5 - rot / 90.0); + } + +#ifdef PL_HAVE_LAV_FILM_GRAIN + if ((sd = av_frame_get_side_data(frame, AV_FRAME_DATA_FILM_GRAIN_PARAMS))) + pl_film_grain_from_av(&out->film_grain, (AVFilmGrainParams *) sd->data); +#endif // HAVE_LAV_FILM_GRAIN + + for (int p = 0; p < out->num_planes; p++) { + struct pl_plane *plane = &out->planes[p]; + + // Fill in the component mapping array + for (int c = 0; c < desc->nb_components; c++) { + if (desc->comp[c].plane == p) + plane->component_mapping[plane->components++] = c; + } + + // Clear the superfluous components + for (int c = plane->components; c < 4; c++) + plane->component_mapping[c] = PL_CHANNEL_NONE; + } + + // Only set the chroma location for definitely subsampled images, makes no + // sense otherwise + if (desc->log2_chroma_w || desc->log2_chroma_h) { + enum pl_chroma_location loc = pl_chroma_from_av(frame->chroma_location); + pl_frame_set_chroma_location(out, loc); + } +} + +#if LIBAVFORMAT_VERSION_INT >= AV_VERSION_INT(60, 15, 100) +PL_LIBAV_API const uint8_t *pl_av_stream_get_side_data(const AVStream *st, + enum AVPacketSideDataType type) +{ + const AVPacketSideData *sd; + sd = av_packet_side_data_get(st->codecpar->coded_side_data, + st->codecpar->nb_coded_side_data, + type); + return sd ? 
sd->data : NULL; +} +#else +# define pl_av_stream_get_side_data(st, type) av_stream_get_side_data(st, type, NULL) +#endif + +PL_LIBAV_API void pl_frame_copy_stream_props(struct pl_frame *out, + const AVStream *stream) +{ + const uint8_t *sd; + if ((sd = pl_av_stream_get_side_data(stream, AV_PKT_DATA_DISPLAYMATRIX))) { + double rot = av_display_rotation_get((const int32_t *) sd); + out->rotation = pl_rotation_normalize(4.5 - rot / 90.0); + } + +#ifdef PL_HAVE_LAV_HDR + pl_map_hdr_metadata(&out->color.hdr, &(struct pl_av_hdr_metadata) { + .mdm = (void *) pl_av_stream_get_side_data(stream, + AV_PKT_DATA_MASTERING_DISPLAY_METADATA), + .clm = (void *) pl_av_stream_get_side_data(stream, + AV_PKT_DATA_CONTENT_LIGHT_LEVEL), +# if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(59, 2, 100) + .dhp = (void *) pl_av_stream_get_side_data(stream, + AV_PKT_DATA_DYNAMIC_HDR10_PLUS), +# endif + }); +#endif +} + +#undef pl_av_stream_get_side_data + +#ifdef PL_HAVE_LAV_DOLBY_VISION +PL_LIBAV_API void pl_map_dovi_metadata(struct pl_dovi_metadata *out, + const AVDOVIMetadata *data) +{ + const AVDOVIRpuDataHeader *header; + const AVDOVIDataMapping *mapping; + const AVDOVIColorMetadata *color; + if (!data) + return; + + header = av_dovi_get_header(data); + mapping = av_dovi_get_mapping(data); + color = av_dovi_get_color(data); + + for (int i = 0; i < 3; i++) + out->nonlinear_offset[i] = av_q2d(color->ycc_to_rgb_offset[i]); + for (int i = 0; i < 9; i++) { + float *nonlinear = &out->nonlinear.m[0][0]; + float *linear = &out->linear.m[0][0]; + nonlinear[i] = av_q2d(color->ycc_to_rgb_matrix[i]); + linear[i] = av_q2d(color->rgb_to_lms_matrix[i]); + } + for (int c = 0; c < 3; c++) { + const AVDOVIReshapingCurve *csrc = &mapping->curves[c]; + struct pl_reshape_data *cdst = &out->comp[c]; + cdst->num_pivots = csrc->num_pivots; + for (int i = 0; i < csrc->num_pivots; i++) { + const float scale = 1.0f / ((1 << header->bl_bit_depth) - 1); + cdst->pivots[i] = scale * csrc->pivots[i]; + } + for (int i = 0; i < csrc->num_pivots - 1; i++) { + const float scale = 1.0f / (1 << header->coef_log2_denom); + cdst->method[i] = csrc->mapping_idc[i]; + switch (csrc->mapping_idc[i]) { + case AV_DOVI_MAPPING_POLYNOMIAL: + for (int k = 0; k < 3; k++) { + cdst->poly_coeffs[i][k] = (k <= csrc->poly_order[i]) + ? 
scale * csrc->poly_coef[i][k] + : 0.0f; + } + break; + case AV_DOVI_MAPPING_MMR: + cdst->mmr_order[i] = csrc->mmr_order[i]; + cdst->mmr_constant[i] = scale * csrc->mmr_constant[i]; + for (int j = 0; j < csrc->mmr_order[i]; j++) { + for (int k = 0; k < 7; k++) + cdst->mmr_coeffs[i][j][k] = scale * csrc->mmr_coef[i][j][k]; + } + break; + } + } + } +} + +PL_LIBAV_API void pl_frame_map_avdovi_metadata(struct pl_frame *out_frame, + struct pl_dovi_metadata *dovi, + const AVDOVIMetadata *metadata) +{ + const AVDOVIRpuDataHeader *header; + const AVDOVIColorMetadata *color; + if (!dovi || !metadata) + return; + + header = av_dovi_get_header(metadata); + color = av_dovi_get_color(metadata); + if (header->disable_residual_flag) { + pl_map_dovi_metadata(dovi, metadata); + + out_frame->repr.dovi = dovi; + out_frame->repr.sys = PL_COLOR_SYSTEM_DOLBYVISION; + out_frame->color.primaries = PL_COLOR_PRIM_BT_2020; + out_frame->color.transfer = PL_COLOR_TRC_PQ; + out_frame->color.hdr.min_luma = + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, color->source_min_pq / 4095.0f); + out_frame->color.hdr.max_luma = + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, color->source_max_pq / 4095.0f); + } +} +#endif // PL_HAVE_LAV_DOLBY_VISION + +PL_LIBAV_API bool pl_frame_recreate_from_avframe(pl_gpu gpu, + struct pl_frame *out, + pl_tex tex[4], + const AVFrame *frame) +{ + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); + struct pl_plane_data data[4] = {0}; + int planes; + + pl_frame_from_avframe(out, frame); + planes = pl_plane_data_from_pixfmt(data, &out->repr.bits, frame->format); + if (!planes) + return false; + + for (int p = 0; p < planes; p++) { + bool is_chroma = p == 1 || p == 2; // matches lavu logic + data[p].width = AV_CEIL_RSHIFT(frame->width, is_chroma ? desc->log2_chroma_w : 0); + data[p].height = AV_CEIL_RSHIFT(frame->height, is_chroma ? 
desc->log2_chroma_h : 0); + + if (!pl_recreate_plane(gpu, &out->planes[p], &tex[p], &data[p])) + return false; + } + + return true; +} + +static void pl_avframe_free_cb(void *priv) +{ + AVFrame *frame = priv; + av_frame_free(&frame); +} + +#define PL_MAGIC0 0xfb5b3b8b +#define PL_MAGIC1 0xee659f6d + +struct pl_avalloc { + uint32_t magic[2]; + pl_gpu gpu; + pl_buf buf; +}; + +// Attached to `pl_frame.user_data` for mapped AVFrames +struct pl_avframe_priv { + AVFrame *avframe; + struct pl_dovi_metadata dovi; // backing storage for per-frame dovi metadata + pl_tex planar; // for planar vulkan textures +}; + +static void pl_fix_hwframe_sample_depth(struct pl_frame *out, const AVFrame *frame) +{ + const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data; + pl_fmt fmt = out->planes[0].texture->params.format; + struct pl_bit_encoding *bits = &out->repr.bits; + + bits->sample_depth = fmt->component_depth[0]; + + switch (hwfc->sw_format) { + case AV_PIX_FMT_P010: bits->bit_shift = 6; break; + default: break; + } +} + +static bool pl_map_avframe_drm(pl_gpu gpu, struct pl_frame *out, + const AVFrame *frame) +{ + const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hwfc->sw_format); + const AVDRMFrameDescriptor *drm = (AVDRMFrameDescriptor *) frame->data[0]; + assert(frame->format == AV_PIX_FMT_DRM_PRIME); + if (!(gpu->import_caps.tex & PL_HANDLE_DMA_BUF)) + return false; + + assert(drm->nb_layers >= out->num_planes); + for (int n = 0; n < out->num_planes; n++) { + const AVDRMLayerDescriptor *layer = &drm->layers[n]; + const AVDRMPlaneDescriptor *plane = &layer->planes[0]; + const AVDRMObjectDescriptor *object = &drm->objects[plane->object_index]; + pl_fmt fmt = pl_find_fourcc(gpu, layer->format); + bool is_chroma = n == 1 || n == 2; + if (!fmt || !pl_fmt_has_modifier(fmt, object->format_modifier)) + return false; + + assert(layer->nb_planes == 1); // we only support planar formats + assert(plane->pitch >= 0); // definitely requires special handling + out->planes[n].texture = pl_tex_create(gpu, pl_tex_params( + .w = AV_CEIL_RSHIFT(frame->width, is_chroma ? desc->log2_chroma_w : 0), + .h = AV_CEIL_RSHIFT(frame->height, is_chroma ? 
desc->log2_chroma_h : 0), + .format = fmt, + .sampleable = true, + .blit_src = fmt->caps & PL_FMT_CAP_BLITTABLE, + .import_handle = PL_HANDLE_DMA_BUF, + .shared_mem = { + .handle.fd = object->fd, + .size = object->size, + .offset = plane->offset, + .drm_format_mod = object->format_modifier, + .stride_w = plane->pitch, + }, + )); + if (!out->planes[n].texture) + return false; + } + + pl_fix_hwframe_sample_depth(out, frame); + return true; +} + +// Derive a DMABUF from any other hwaccel format, and map that instead +static bool pl_map_avframe_derived(pl_gpu gpu, struct pl_frame *out, + const AVFrame *frame) +{ + const int flags = AV_HWFRAME_MAP_READ | AV_HWFRAME_MAP_DIRECT; + struct pl_avframe_priv *priv = out->user_data; + AVFrame *derived = av_frame_alloc(); + derived->width = frame->width; + derived->height = frame->height; + derived->format = AV_PIX_FMT_DRM_PRIME; + derived->hw_frames_ctx = av_buffer_ref(frame->hw_frames_ctx); + if (av_hwframe_map(derived, frame, flags) < 0) + goto error; + if (av_frame_copy_props(derived, frame) < 0) + goto error; + if (!pl_map_avframe_drm(gpu, out, derived)) + goto error; + + av_frame_free(&priv->avframe); + priv->avframe = derived; + return true; + +error: + av_frame_free(&derived); + return false; +} + +#ifdef PL_HAVE_LAV_VULKAN +static bool pl_acquire_avframe(pl_gpu gpu, struct pl_frame *frame) +{ + const struct pl_avframe_priv *priv = frame->user_data; + AVHWFramesContext *hwfc = (void *) priv->avframe->hw_frames_ctx->data; + AVVulkanFramesContext *vkfc = hwfc->hwctx; + AVVkFrame *vkf = (AVVkFrame *) priv->avframe->data[0]; + +#ifdef PL_HAVE_LAV_VULKAN_V2 + vkfc->lock_frame(hwfc, vkf); +#else + (void) vkfc; +#endif + + for (int n = 0; n < frame->num_planes; n++) { + pl_vulkan_release_ex(gpu, pl_vulkan_release_params( + .tex = priv->planar ? priv->planar : frame->planes[n].texture, + .layout = vkf->layout[n], + .qf = VK_QUEUE_FAMILY_IGNORED, + .semaphore = { + .sem = vkf->sem[n], + .value = vkf->sem_value[n], + }, + )); + if (priv->planar) + break; + } + + return true; +} + +static void pl_release_avframe(pl_gpu gpu, struct pl_frame *frame) +{ + const struct pl_avframe_priv *priv = frame->user_data; + AVHWFramesContext *hwfc = (void *) priv->avframe->hw_frames_ctx->data; + AVVulkanFramesContext *vkfc = hwfc->hwctx; + AVVkFrame *vkf = (AVVkFrame *) priv->avframe->data[0]; + + for (int n = 0; n < frame->num_planes; n++) { + int ok = pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = priv->planar ? 
priv->planar : frame->planes[n].texture, + .out_layout = &vkf->layout[n], + .qf = VK_QUEUE_FAMILY_IGNORED, + .semaphore = { + .sem = vkf->sem[n], + .value = vkf->sem_value[n] + 1, + }, + )); + + vkf->access[n] = 0; + vkf->sem_value[n] += !!ok; + if (priv->planar) + break; + } + +#ifdef PL_HAVE_LAV_VULKAN_V2 + vkfc->unlock_frame(hwfc, vkf); +#else + (void) vkfc; +#endif +} + +static bool pl_map_avframe_vulkan(pl_gpu gpu, struct pl_frame *out, + const AVFrame *frame) +{ + const AVHWFramesContext *hwfc = (AVHWFramesContext *) frame->hw_frames_ctx->data; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hwfc->sw_format); + const AVVulkanFramesContext *vkfc = hwfc->hwctx; + AVVkFrame *vkf = (AVVkFrame *) frame->data[0]; + struct pl_avframe_priv *priv = out->user_data; + pl_vulkan vk = pl_vulkan_get(gpu); + +#ifdef PL_HAVE_LAV_VULKAN_V2 + const VkFormat *vk_fmt = vkfc->format; +#else + const VkFormat *vk_fmt = av_vkfmt_from_pixfmt(hwfc->sw_format); +#endif + + assert(frame->format == AV_PIX_FMT_VULKAN); + priv->planar = NULL; + if (!vk) + return false; + + for (int n = 0; n < out->num_planes; n++) { + struct pl_plane *plane = &out->planes[n]; + bool chroma = n == 1 || n == 2; + int num_subplanes; + assert(vk_fmt[n]); + + plane->texture = pl_vulkan_wrap(gpu, pl_vulkan_wrap_params( + .image = vkf->img[n], + .width = AV_CEIL_RSHIFT(hwfc->width, chroma ? desc->log2_chroma_w : 0), + .height = AV_CEIL_RSHIFT(hwfc->height, chroma ? desc->log2_chroma_h : 0), + .format = vk_fmt[n], + .usage = vkfc->usage, + )); + if (!plane->texture) + return false; + + num_subplanes = plane->texture->params.format->num_planes; + if (num_subplanes) { + assert(num_subplanes == out->num_planes); + priv->planar = plane->texture; + for (int i = 0; i < num_subplanes; i++) + out->planes[i].texture = priv->planar->planes[i]; + break; + } + } + + out->acquire = pl_acquire_avframe; + out->release = pl_release_avframe; + pl_fix_hwframe_sample_depth(out, frame); + return true; +} + +static void pl_unmap_avframe_vulkan(pl_gpu gpu, struct pl_frame *frame) +{ + struct pl_avframe_priv *priv = frame->user_data; + if (priv->planar) { + pl_tex_destroy(gpu, &priv->planar); + for (int n = 0; n < frame->num_planes; n++) + frame->planes[n].texture = NULL; + } +} +#endif + +PL_LIBAV_API bool pl_map_avframe_ex(pl_gpu gpu, struct pl_frame *out, + const struct pl_avframe_params *params) +{ + const AVFrame *frame = params->frame; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); + struct pl_plane_data data[4] = {0}; + pl_tex *tex = params->tex; + int planes; + + struct pl_avframe_priv *priv = malloc(sizeof(*priv)); + if (!priv) + goto error; + + pl_frame_from_avframe(out, frame); + priv->avframe = av_frame_clone(frame); + out->user_data = priv; + +#ifdef PL_HAVE_LAV_DOLBY_VISION + if (params->map_dovi) { + AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DOVI_METADATA); + if (sd) { + const AVDOVIMetadata *metadata = (AVDOVIMetadata *) sd->data; + const AVDOVIRpuDataHeader *header = av_dovi_get_header(metadata); + // Only automatically map DoVi RPUs that don't require an EL + if (header->disable_residual_flag) + pl_frame_map_avdovi_metadata(out, &priv->dovi, metadata); + } + +#ifdef PL_HAVE_LIBDOVI + sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DOVI_RPU_BUFFER); + if (sd) + pl_hdr_metadata_from_dovi_rpu(&out->color.hdr, sd->buf->data, sd->buf->size); +#endif // PL_HAVE_LIBDOVI + } + +#endif // PL_HAVE_LAV_DOLBY_VISION + + switch (frame->format) { + case AV_PIX_FMT_DRM_PRIME: + if 
(!pl_map_avframe_drm(gpu, out, frame)) + goto error; + return true; + + case AV_PIX_FMT_VAAPI: + if (!pl_map_avframe_derived(gpu, out, frame)) + goto error; + return true; + +#ifdef PL_HAVE_LAV_VULKAN + case AV_PIX_FMT_VULKAN: + if (!pl_map_avframe_vulkan(gpu, out, frame)) + goto error; + return true; +#endif + + default: break; + } + + // Backing textures are required from this point onwards + if (!tex) + goto error; + + planes = pl_plane_data_from_pixfmt(data, &out->repr.bits, frame->format); + if (!planes) + goto error; + + for (int p = 0; p < planes; p++) { + AVBufferRef *buf = av_frame_get_plane_buffer((AVFrame *) frame, p); + struct pl_avalloc *alloc = buf ? av_buffer_get_opaque(buf) : NULL; + bool is_chroma = p == 1 || p == 2; // matches lavu logic + + data[p].width = AV_CEIL_RSHIFT(frame->width, is_chroma ? desc->log2_chroma_w : 0); + data[p].height = AV_CEIL_RSHIFT(frame->height, is_chroma ? desc->log2_chroma_h : 0); + if (frame->linesize[p] < 0) { + data[p].pixels = frame->data[p] + frame->linesize[p] * (data[p].height - 1); + data[p].row_stride = -frame->linesize[p]; + out->planes[p].flipped = true; + } else { + data[p].pixels = frame->data[p]; + data[p].row_stride = frame->linesize[p]; + } + + // Probe for frames allocated by pl_get_buffer2 + if (alloc && alloc->magic[0] == PL_MAGIC0 && alloc->magic[1] == PL_MAGIC1) { + data[p].buf = alloc->buf; + data[p].buf_offset = (uintptr_t) data[p].pixels - (uintptr_t) alloc->buf->data; + data[p].pixels = NULL; + } else if (gpu->limits.callbacks) { + // Use asynchronous upload if possible + data[p].callback = pl_avframe_free_cb; + data[p].priv = av_frame_clone(frame); + } + + if (!pl_upload_plane(gpu, &out->planes[p], &tex[p], &data[p])) { + av_frame_free((AVFrame **) &data[p].priv); + goto error; + } + + out->planes[p].texture = tex[p]; + } + + return true; + +error: + pl_unmap_avframe(gpu, out); + return false; +} + +// Backwards compatibility with previous versions of this API. 
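+//
+// A rough usage sketch for this compatibility wrapper (illustrative only and
+// not part of the original header; `gpu` and `avframe` are assumed to be a
+// valid pl_gpu and a decoded AVFrame, and error handling is omitted):
+//
+//   struct pl_frame frame;
+//   pl_tex tex[4] = {0};
+//   if (pl_map_avframe(gpu, &frame, tex, avframe)) {
+//       // ... render or inspect `frame` ...
+//       pl_unmap_avframe(gpu, &frame);
+//   }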
+PL_LIBAV_API bool pl_map_avframe(pl_gpu gpu, struct pl_frame *out_frame, + pl_tex tex[4], const AVFrame *avframe) +{ + return pl_map_avframe_ex(gpu, out_frame, &(struct pl_avframe_params) { + .frame = avframe, + .tex = tex, + }); +} + +PL_LIBAV_API void pl_unmap_avframe(pl_gpu gpu, struct pl_frame *frame) +{ + struct pl_avframe_priv *priv = frame->user_data; + const AVPixFmtDescriptor *desc; + if (!priv) + goto done; + +#ifdef PL_HAVE_LAV_VULKAN + if (priv->avframe->format == AV_PIX_FMT_VULKAN) + pl_unmap_avframe_vulkan(gpu, frame); +#endif + + desc = av_pix_fmt_desc_get(priv->avframe->format); + if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) { + for (int i = 0; i < 4; i++) + pl_tex_destroy(gpu, &frame->planes[i].texture); + } + + av_frame_free(&priv->avframe); + free(priv); + +done: + memset(frame, 0, sizeof(*frame)); // sanity +} + +PL_LIBAV_API AVFrame *pl_get_mapped_avframe(const struct pl_frame *frame) +{ + struct pl_avframe_priv *priv = frame->user_data; + return priv->avframe; +} + +static void pl_done_cb(void *priv) +{ + bool *status = priv; + *status = true; +} + +PL_LIBAV_API bool pl_download_avframe(pl_gpu gpu, + const struct pl_frame *frame, + AVFrame *out_frame) +{ + bool done[4] = {0}; + if (frame->num_planes != av_pix_fmt_count_planes(out_frame->format)) + return false; + + for (int p = 0; p < frame->num_planes; p++) { + bool ok = pl_tex_download(gpu, pl_tex_transfer_params( + .tex = frame->planes[p].texture, + .row_pitch = out_frame->linesize[p], + .ptr = out_frame->data[p], + // Use synchronous transfer for the last plane + .callback = (p+1) < frame->num_planes ? pl_done_cb : NULL, + .priv = &done[p], + )); + + if (!ok) + return false; + } + + for (int p = 0; p < frame->num_planes - 1; p++) { + while (!done[p]) + pl_tex_poll(gpu, frame->planes[p].texture, UINT64_MAX); + } + + return true; +} + +#define PL_DIV_UP(x, y) (((x) + (y) - 1) / (y)) +#define PL_ALIGN(x, align) ((align) ? PL_DIV_UP(x, align) * (align) : (x)) +#define PL_MAX(x, y) ((x) > (y) ? (x) : (y)) +#define PL_LCM(x, y) ((x) * ((y) / av_gcd(x, y))) + +static inline void pl_avalloc_free(void *opaque, uint8_t *data) +{ + struct pl_avalloc *alloc = opaque; + assert(alloc->magic[0] == PL_MAGIC0); + assert(alloc->magic[1] == PL_MAGIC1); + assert(alloc->buf->data == data); + pl_buf_destroy(alloc->gpu, &alloc->buf); + free(alloc); +} + +PL_LIBAV_API int pl_get_buffer2(AVCodecContext *avctx, AVFrame *pic, int flags) +{ + int alignment[AV_NUM_DATA_POINTERS]; + int width = pic->width; + int height = pic->height; + size_t planesize[4]; + int ret = 0; + + pl_gpu *pgpu = avctx->opaque; + pl_gpu gpu = pgpu ? 
*pgpu : NULL; + struct pl_plane_data data[4]; + struct pl_avalloc *alloc; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pic->format); + int planes = pl_plane_data_from_pixfmt(data, NULL, pic->format); + + // Sanitize frame structs + memset(pic->data, 0, sizeof(pic->data)); + memset(pic->linesize, 0, sizeof(pic->linesize)); + memset(pic->buf, 0, sizeof(pic->buf)); + pic->extended_data = pic->data; + pic->extended_buf = NULL; + + if (!(avctx->codec->capabilities & AV_CODEC_CAP_DR1) || !planes) + goto fallback; + if (!gpu || !gpu->limits.thread_safe || !gpu->limits.max_mapped_size || + !gpu->limits.host_cached) + { + goto fallback; + } + + avcodec_align_dimensions2(avctx, &width, &height, alignment); + if ((ret = av_image_fill_linesizes(pic->linesize, pic->format, width))) + return ret; + + for (int p = 0; p < planes; p++) { + alignment[p] = PL_LCM(alignment[p], gpu->limits.align_tex_xfer_pitch); + alignment[p] = PL_LCM(alignment[p], gpu->limits.align_tex_xfer_offset); + alignment[p] = PL_LCM(alignment[p], data[p].pixel_stride); + pic->linesize[p] = PL_ALIGN(pic->linesize[p], alignment[p]); + } + +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(56, 56, 100) + ret = av_image_fill_plane_sizes(planesize, pic->format, height, (ptrdiff_t[4]) { + pic->linesize[0], pic->linesize[1], pic->linesize[2], pic->linesize[3], + }); + if (ret < 0) + return ret; +#else + uint8_t *ptrs[4], * const base = (uint8_t *) 0x10000; + ret = av_image_fill_pointers(ptrs, pic->format, height, base, pic->linesize); + if (ret < 0) + return ret; + for (int p = 0; p < 4; p++) + planesize[p] = (uintptr_t) ptrs[p] - (uintptr_t) base; +#endif + + for (int p = 0; p < planes; p++) { + const size_t buf_size = planesize[p] + alignment[p]; + if (buf_size > gpu->limits.max_mapped_size) { + av_frame_unref(pic); + goto fallback; + } + + alloc = malloc(sizeof(*alloc)); + if (!alloc) { + av_frame_unref(pic); + return AVERROR(ENOMEM); + } + + *alloc = (struct pl_avalloc) { + .magic = { PL_MAGIC0, PL_MAGIC1 }, + .gpu = gpu, + .buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .memory_type = PL_BUF_MEM_HOST, + .host_mapped = true, + .storable = desc->flags & AV_PIX_FMT_FLAG_BE, + )), + }; + + if (!alloc->buf) { + free(alloc); + av_frame_unref(pic); + return AVERROR(ENOMEM); + } + + pic->data[p] = (uint8_t *) PL_ALIGN((uintptr_t) alloc->buf->data, alignment[p]); + pic->buf[p] = av_buffer_create(alloc->buf->data, buf_size, pl_avalloc_free, alloc, 0); + if (!pic->buf[p]) { + pl_buf_destroy(gpu, &alloc->buf); + free(alloc); + av_frame_unref(pic); + return AVERROR(ENOMEM); + } + } + + return 0; + +fallback: + return avcodec_default_get_buffer2(avctx, pic, flags); +} + +#undef PL_MAGIC0 +#undef PL_MAGIC1 +#undef PL_ALIGN +#undef PL_MAX + +#endif // LIBPLACEBO_LIBAV_H_ diff --git a/src/include/libplacebo/utils/upload.h b/src/include/libplacebo/utils/upload.h new file mode 100644 index 0000000..9e8d436 --- /dev/null +++ b/src/include/libplacebo/utils/upload.h @@ -0,0 +1,153 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_UPLOAD_H_ +#define LIBPLACEBO_UPLOAD_H_ + +#include <stdint.h> + +#include <libplacebo/gpu.h> +#include <libplacebo/renderer.h> + +PL_API_BEGIN + +// This file contains a utility function to assist in uploading data from host +// memory to a texture. In particular, the texture will be suitable for use as +// a `pl_plane`. + +// Description of the host representation of an image plane +struct pl_plane_data { + enum pl_fmt_type type; // meaning of the data (must not be UINT or SINT) + int width, height; // dimensions of the plane + int component_size[4]; // size in bits of each coordinate + int component_pad[4]; // ignored bits preceding each component + int component_map[4]; // semantic meaning of each component (pixel order) + size_t pixel_stride; // offset in bytes between pixels (required) + size_t row_stride; // offset in bytes between rows (optional) + bool swapped; // pixel data is endian-swapped (non-native) + + // Similar to `pl_tex_transfer_params`, you can either upload from a raw + // pointer address, or a buffer + offset. Again, the use of these two + // mechanisms is mutually exclusive. + // + // 1. Uploading from host memory + const void *pixels; // the actual data underlying this plane + + // 2. Uploading from a buffer (requires `pl_gpu_limits.buf_transfer`) + pl_buf buf; // the buffer to use + size_t buf_offset; // offset of data within buffer, must be a + // multiple of `pixel_stride` as well as of 4 + + // Similar to `pl_tex_transfer_params.callback`, this allows turning the + // upload of a plane into an asynchronous upload. The same notes apply. + void (*callback)(void *priv); + void *priv; + + // Note: When using this together with `pl_frame`, there is some amount of + // overlap between `component_pad` and `pl_color_repr.bits`. Some key + // differences between the two: + // + // - the bits from `component_pad` are ignored; whereas the superfluous bits + // in a `pl_color_repr` must be 0. + // - the `component_pad` exists to align the component size and placement + // with the capabilities of GPUs; the `pl_color_repr` exists to control + // the semantics of the color samples on a finer granularity. + // - the `pl_color_repr` applies to the color sample as a whole, and + // therefore applies to all planes; the `component_pad` can be different + // for each plane. + // - `component_pad` interacts with float textures by moving the actual + // float in memory. `pl_color_repr` interacts with float data as if + // the float was converted from an integer under full range semantics. + // + // To help establish the motivating difference, a typical example of a use + // case would be yuv420p10. Since 10-bit GPU texture support is limited, + // and working with non-byte-aligned pixels is awkward in general, the + // convention is to represent yuv420p10 as 16-bit samples with either the + // high or low bits set to 0. In this scenario, the `component_size` of the + // `pl_plane_data` and `pl_bit_encoding.sample_depth` would be 16, while + // the `pl_bit_encoding.color_depth` would be 10 (and additionally, the + // `pl_bit_encoding.bit_shift` would be either 0 or 6, depending on + // whether the low or the high bits are used). 
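+ //
+ // As a concrete sketch of that yuv420p10 example (not taken from the
+ // original comment; it assumes the 10 significant bits are stored in the
+ // low bits, i.e. `bit_shift = 0`), the luma plane could be described as:
+ //
+ //   .component_size = {16},  .component_pad = {0},
+ //   .component_map  = {0},   .pixel_stride  = 2,
+ //
+ // with the accompanying `pl_color_repr.bits` set to roughly
+ // { .sample_depth = 16, .color_depth = 10, .bit_shift = 0 }.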
+ // + // On the contrary, something like a packed, 8-bit XBGR format (where the + // X bits are ignored and may contain garbage) would set `component_pad[0]` + // to 8, and the component_size[0:2] (respectively) to 8 as well. + // + // As a general rule of thumb, for maximum compatibility, you should try + // and align component_size/component_pad to multiples of 8 and explicitly + // clear any remaining superfluous bits (+ use `pl_color_repr.bits` to + // ensure they're decoded correctly). You should also try to align the + // `pixel_stride` to a power of two. +}; + +// Fills in the `component_size`, `component_pad` and `component_map` fields +// based on the supplied mask for each component (in semantic order, i.e. +// RGBA). Each element of `mask` must have a contiguous range of set bits. +PL_API void pl_plane_data_from_mask(struct pl_plane_data *data, uint64_t mask[4]); + +// Fills in the `component_size`, `component_pad` and `component_map` fields +// based on the supplied sizes (in bits) and shift of each component (in +// semantic order). +// +// Similar to `pl_plane_data_from_mask` but not limited to 64-bit pixels. +PL_API void pl_plane_data_from_comps(struct pl_plane_data *data, int size[4], + int shift[4]); + +// Helper function to take a `pl_plane_data` struct and try and improve its +// alignment to make it more likely to correspond to a real `pl_fmt`. It does +// this by attempting to round each component up to the nearest byte boundary. +// This relies on the assumption (true in practice) that superfluous bits of +// byte-misaligned formats are explicitly set to 0. +// +// The resulting shift must be consistent across all components, in which case +// it's returned in `out_bits`. If no alignment was possible, `out_bits` is set +// to {0}, and this function returns false. +PL_API bool pl_plane_data_align(struct pl_plane_data *data, struct pl_bit_encoding *out_bits); + +// Helper function to find a suitable `pl_fmt` based on a pl_plane_data's +// requirements. This is called internally by `pl_upload_plane`, but it's +// exposed to users both as a convenience and so they may pre-emptively check +// if a format would be supported without actually having to attempt the upload. +PL_API pl_fmt pl_plane_find_fmt(pl_gpu gpu, int out_map[4], const struct pl_plane_data *data); + +// Upload an image plane to a texture, and output the resulting `pl_plane` +// struct to `out_plane` (optional). `tex` must be a valid pointer to a texture +// (or NULL), which will be destroyed and reinitialized if it does not already +// exist or is incompatible. Returns whether successful. +// +// The resulting texture is guaranteed to be `sampleable`, and it will also try +// and maximize compatibility with the other `pl_renderer` requirements +// (blittable, linear filterable, etc.). +// +// Note: `out_plane->shift_x/y` and `out_plane->flipped` are left +// uninitialized, and should be set explicitly by the user. +PL_API bool pl_upload_plane(pl_gpu gpu, struct pl_plane *out_plane, + pl_tex *tex, const struct pl_plane_data *data); + +// Like `pl_upload_plane`, but only creates an uninitialized texture object +// rather than actually performing an upload. This can be useful to, for +// example, prepare textures to be used as the target of rendering. +// +// The resulting texture is guaranteed to be `renderable`, and it will also try +// to maximize compatibility with the other `pl_renderer` requirements +// (blittable, storable, etc.). 
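+//
+// For illustration, a hypothetical caller preparing a 1920x1080 RGBA8 render
+// target might fill in a `pl_plane_data` along these lines (a sketch only;
+// the format actually chosen still depends on what the GPU supports):
+//
+//   struct pl_plane_data data = {
+//       .type           = PL_FMT_UNORM,
+//       .width          = 1920,
+//       .height          = 1080,
+//       .component_size = {8, 8, 8, 8},
+//       .component_map  = {0, 1, 2, 3},
+//       .pixel_stride   = 4,
+//   };
+//   struct pl_plane plane;
+//   pl_tex tex = NULL;
+//   bool ok = pl_recreate_plane(gpu, &plane, &tex, &data);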
+PL_API bool pl_recreate_plane(pl_gpu gpu, struct pl_plane *out_plane, + pl_tex *tex, const struct pl_plane_data *data); + +PL_API_END + +#endif // LIBPLACEBO_UPLOAD_H_ diff --git a/src/include/libplacebo/vulkan.h b/src/include/libplacebo/vulkan.h new file mode 100644 index 0000000..4e5db95 --- /dev/null +++ b/src/include/libplacebo/vulkan.h @@ -0,0 +1,638 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef LIBPLACEBO_VULKAN_H_ +#define LIBPLACEBO_VULKAN_H_ + +#include <vulkan/vulkan.h> +#include <libplacebo/gpu.h> +#include <libplacebo/swapchain.h> + +PL_API_BEGIN + +#define PL_VK_MIN_VERSION VK_API_VERSION_1_2 + +// Structure representing a VkInstance. Using this is not required. +typedef const struct pl_vk_inst_t { + VkInstance instance; + + // The Vulkan API version supported by this VkInstance. + uint32_t api_version; + + // The associated vkGetInstanceProcAddr pointer. + PFN_vkGetInstanceProcAddr get_proc_addr; + + // The instance extensions that were successfully enabled, including + // extensions enabled by libplacebo internally. May contain duplicates. + const char * const *extensions; + int num_extensions; + + // The instance layers that were successfully enabled, including + // layers enabled by libplacebo internally. May contain duplicates. + const char * const *layers; + int num_layers; +} *pl_vk_inst; + +struct pl_vk_inst_params { + // If set, enable the debugging and validation layers. These should + // generally be lightweight and relatively harmless to enable. + bool debug; + + // If set, also enable GPU-assisted verification and best practices + // layers. (Note: May cause substantial slowdown and/or result in lots of + // false positive spam) + bool debug_extra; + + // If nonzero, restricts the Vulkan API version to be at most this. This + // is only really useful for explicitly testing backwards compatibility. + uint32_t max_api_version; + + // Pointer to a user-provided `vkGetInstanceProcAddr`. If this is NULL, + // libplacebo will use the directly linked version (if available). + PFN_vkGetInstanceProcAddr get_proc_addr; + + // Enables extra instance extensions. Instance creation will fail if these + // extensions are not all supported. The user may use this to enable e.g. + // windowing system integration. + const char * const *extensions; + int num_extensions; + + // Enables extra optional instance extensions. These are opportunistically + // enabled if supported by the device, but otherwise skipped. + const char * const *opt_extensions; + int num_opt_extensions; + + // Enables extra layers. Instance creation will fail if these layers are + // not all supported. + // + // NOTE: Layers needed for required/optional extensions are automatically + // enabled. The user does not specifically need to enable layers related + // to extension support. 
+ const char * const *layers; + int num_layers; + + // Enables extra optional layers. These are opportunistically enabled if + // supported by the platform, but otherwise skipped. + const char * const *opt_layers; + int num_opt_layers; +}; + +#define pl_vk_inst_params(...) (&(struct pl_vk_inst_params) { __VA_ARGS__ }) +PL_API extern const struct pl_vk_inst_params pl_vk_inst_default_params; + +// Helper function to simplify instance creation. The user could also bypass +// these helpers and do it manually, but this function is provided as a +// convenience. It also sets up a debug callback which forwards all vulkan +// messages to the `pl_log` callback. +PL_API pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params); +PL_API void pl_vk_inst_destroy(pl_vk_inst *inst); + +struct pl_vulkan_queue { + uint32_t index; // Queue family index + uint32_t count; // Queue family count +}; + +// Structure representing the actual vulkan device and associated GPU instance +typedef const struct pl_vulkan_t *pl_vulkan; +struct pl_vulkan_t { + pl_gpu gpu; + + // The vulkan objects in use. The user may use this for their own purposes, + // but please note that the lifetime is tied to the lifetime of the + // pl_vulkan object, and must not be destroyed by the user. Note that the + // created vulkan device may have any number of queues and queue family + // assignments; so using it for queue submission commands is ill-advised. + VkInstance instance; + VkPhysicalDevice phys_device; + VkDevice device; + + // The associated vkGetInstanceProcAddr pointer. + PFN_vkGetInstanceProcAddr get_proc_addr; + + // The Vulkan API version supported by this VkPhysicalDevice. + uint32_t api_version; + + // The device extensions that were successfully enabled, including + // extensions enabled by libplacebo internally. May contain duplicates. + const char * const *extensions; + int num_extensions; + + // The device features that were enabled at device creation time. + // + // Note: Whenever a feature flag is ambiguious between several alternative + // locations, for completeness' sake, we include both. + const VkPhysicalDeviceFeatures2 *features; + + // The explicit queue families we are using to provide a given capability. + struct pl_vulkan_queue queue_graphics; // provides VK_QUEUE_GRAPHICS_BIT + struct pl_vulkan_queue queue_compute; // provides VK_QUEUE_COMPUTE_BIT + struct pl_vulkan_queue queue_transfer; // provides VK_QUEUE_TRANSFER_BIT + + // Functions for locking a queue. These must be used to lock VkQueues for + // submission or other related operations when sharing the VkDevice between + // multiple threads, Using this on queue families or indices not contained + // in `queues` is undefined behavior. + void (*lock_queue)(pl_vulkan vk, uint32_t qf, uint32_t qidx); + void (*unlock_queue)(pl_vulkan vk, uint32_t qf, uint32_t qidx); + + // --- Deprecated fields + + // These are the same active queue families and their queue counts in list + // form. This list does not contain duplicates, nor any extra queues + // enabled at device creation time. Deprecated in favor of querying + // `vkGetPhysicalDeviceQueueFamilyProperties` directly. + const struct pl_vulkan_queue *queues PL_DEPRECATED; + int num_queues PL_DEPRECATED; +}; + +struct pl_vulkan_params { + // The vulkan instance. Optional, if NULL then libplacebo will internally + // create a VkInstance with the settings from `instance_params`. 
+ // + // Note: The VkInstance provided by the user *MUST* be created with a + // VkApplicationInfo.apiVersion of PL_VK_MIN_VERSION or higher. + VkInstance instance; + + // Pointer to `vkGetInstanceProcAddr`. If this is NULL, libplacebo will + // use the directly linked version (if available). + // + // Note: This overwrites the same value from `instance_params`. + PFN_vkGetInstanceProcAddr get_proc_addr; + + // Configures the settings used for creating an internal vulkan instance. + // May be NULL. Ignored if `instance` is set. + const struct pl_vk_inst_params *instance_params; + + // When choosing the device, rule out all devices that don't support + // presenting to this surface. When creating a device, enable all extensions + // needed to ensure we can present to this surface. Optional. Only legal + // when specifying an existing VkInstance to use. + VkSurfaceKHR surface; + + // --- Physical device selection options + + // The vulkan physical device. May be set by the caller to indicate the + // physical device to use. Otherwise, libplacebo will pick the "best" + // available GPU, based on the advertised device type. (i.e., it will + // prefer discrete GPUs over integrated GPUs). Only legal when specifying + // an existing VkInstance to use. + VkPhysicalDevice device; + + // When choosing the device, only choose a device with this exact name. + // This overrides `allow_software`. No effect if `device` is set. Note: A + // list of devices and their names are logged at level PL_LOG_INFO. + const char *device_name; + + // When choosing the device, only choose a device with this exact UUID. + // This overrides `allow_software` and `device_name`. No effect if `device` + // is set. + uint8_t device_uuid[16]; + + // When choosing the device, controls whether or not to also allow software + // GPUs. No effect if `device` or `device_name` are set. + bool allow_software; + + // --- Logical device creation options + + // Controls whether or not to allow asynchronous transfers, using transfer + // queue families, if supported by the device. This can be significantly + // faster and more power efficient, and also allows streaming uploads in + // parallel with rendering commands. Enabled by default. + bool async_transfer; + + // Controls whether or not to allow asynchronous compute, using dedicated + // compute queue families, if supported by the device. On some devices, + // these can allow the GPU to schedule compute shaders in parallel with + // fragment shaders. Enabled by default. + bool async_compute; + + // Limits the number of queues to use. If left as 0, libplacebo will use as + // many queues as the device supports. Multiple queues can result in + // improved efficiency when submitting multiple commands that can entirely + // or partially execute in parallel. Defaults to 1, since using more queues + // can actually decrease performance. + // + // Note: libplacebo will always *create* logical devices with all available + // queues for a given QF enabled, regardless of this setting. + int queue_count; + + // Bitmask of extra queue families to enable. If set, then *all* queue + // families matching *any* of these flags will be enabled at device + // creation time. Setting this to VK_QUEUE_FLAG_BITS_MAX_ENUM effectively + // enables all queue families supported by the device. + VkQueueFlags extra_queues; + + // Enables extra device extensions. Device creation will fail if these + // extensions are not all supported. The user may use this to enable e.g. + // interop extensions. 
+ const char * const *extensions; + int num_extensions; + + // Enables extra optional device extensions. These are opportunistically + // enabled if supported by the device, but otherwise skipped. + const char * const *opt_extensions; + int num_opt_extensions; + + // Optional extra features to enable at device creation time. These are + // opportunistically enabled if supported by the physical device, but + // otherwise kept disabled. + const VkPhysicalDeviceFeatures2 *features; + + // --- Misc/debugging options + + // Restrict specific features to e.g. work around driver bugs, or simply + // for testing purposes + int max_glsl_version; // limit the maximum GLSL version + uint32_t max_api_version; // limit the maximum vulkan API version +}; + +// Default/recommended parameters. Should generally be safe and efficient. +#define PL_VULKAN_DEFAULTS \ + .async_transfer = true, \ + .async_compute = true, \ + /* enabling multiple queues often decreases perf */ \ + .queue_count = 1, + +#define pl_vulkan_params(...) (&(struct pl_vulkan_params) { PL_VULKAN_DEFAULTS __VA_ARGS__ }) +PL_API extern const struct pl_vulkan_params pl_vulkan_default_params; + +// Creates a new vulkan device based on the given parameters and initializes +// a new GPU. If `params` is left as NULL, it defaults to +// &pl_vulkan_default_params. +// +// Thread-safety: Safe +PL_API pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params); + +// Destroys the vulkan device and all associated objects, except for the +// VkInstance provided by the user. +// +// Note that all resources allocated from this vulkan object (e.g. via the +// `vk->ra` or using `pl_vulkan_create_swapchain`) *must* be explicitly +// destroyed by the user before calling this. +// +// Also note that this function will block until all in-flight GPU commands are +// finished processing. You can avoid this by manually calling `pl_gpu_finish` +// before `pl_vulkan_destroy`. +PL_API void pl_vulkan_destroy(pl_vulkan *vk); + +// For a `pl_gpu` backed by `pl_vulkan`, this function can be used to retrieve +// the underlying `pl_vulkan`. Returns NULL for any other type of `gpu`. +PL_API pl_vulkan pl_vulkan_get(pl_gpu gpu); + +struct pl_vulkan_device_params { + // The instance to use. Required! + // + // Note: The VkInstance provided by the user *must* be created with a + // VkApplicationInfo.apiVersion of PL_VK_MIN_VERSION or higher. + VkInstance instance; + + // Mirrored from `pl_vulkan_params`. All of these fields are optional. + PFN_vkGetInstanceProcAddr get_proc_addr; + VkSurfaceKHR surface; + const char *device_name; + uint8_t device_uuid[16]; + bool allow_software; +}; + +#define pl_vulkan_device_params(...) (&(struct pl_vulkan_device_params) { __VA_ARGS__ }) + +// Helper function to choose the best VkPhysicalDevice, given a VkInstance. +// This uses the same logic as `pl_vulkan_create` uses internally. If no +// matching device was found, this returns VK_NULL_HANDLE. +PL_API VkPhysicalDevice pl_vulkan_choose_device(pl_log log, + const struct pl_vulkan_device_params *params); + +struct pl_vulkan_swapchain_params { + // The surface to use for rendering. Required, the user is in charge of + // creating this. Must belong to the same VkInstance as `vk->instance`. + VkSurfaceKHR surface; + + // The preferred presentation mode. See the vulkan documentation for more + // information about these. If the device/surface combination does not + // support this mode, libplacebo will fall back to VK_PRESENT_MODE_FIFO_KHR. 
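+ //
+ // For example (illustrative only), a caller wanting traditional vsync-style
+ // behavior would typically set this explicitly:
+ //
+ //   .present_mode = VK_PRESENT_MODE_FIFO_KHR,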
+ // + // Warning: Leaving this zero-initialized is the same as having specified + // VK_PRESENT_MODE_IMMEDIATE_KHR, which is probably not what the user + // wants! + VkPresentModeKHR present_mode; + + // Allow up to N in-flight frames. This essentially controls how many + // rendering commands may be queued up at the same time. See the + // documentation for `pl_swapchain_get_latency` for more information. For + // vulkan specifically, we are only able to wait until the GPU has finished + // rendering a frame - we are unable to wait until the display has actually + // finished displaying it. So this only provides a rough guideline. + // Optional, defaults to 3. + int swapchain_depth; + + // This suppresses automatic recreation of the swapchain when any call + // returns VK_SUBOPTIMAL_KHR. Normally, libplacebo will recreate the + // swapchain internally on the next `pl_swapchain_start_frame`. If enabled, + // clients are assumed to take care of swapchain recreations themselves, by + // calling `pl_swapchain_resize` as appropriate. libplacebo will tolerate + // the "suboptimal" status indefinitely. + bool allow_suboptimal; + + // Disable high-bit (10 or more) SDR formats. May help work around buggy + // drivers which don't dither properly when outputting high bit depth + // SDR backbuffers to 8-bit screens. + bool disable_10bit_sdr; +}; + +#define pl_vulkan_swapchain_params(...) (&(struct pl_vulkan_swapchain_params) { __VA_ARGS__ }) + +// Creates a new vulkan swapchain based on an existing VkSurfaceKHR. Using this +// function requires that the vulkan device was created with the +// VK_KHR_swapchain extension. The easiest way of accomplishing this is to set +// the `pl_vulkan_params.surface` explicitly at creation time. +PL_API pl_swapchain pl_vulkan_create_swapchain(pl_vulkan vk, + const struct pl_vulkan_swapchain_params *params); + +// This will return true if the vulkan swapchain is internally detected +// as being suboptimal (VK_SUBOPTIMAL_KHR). This might be of use to clients +// who have `params->allow_suboptimal` enabled. +PL_API bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw); + +// Vulkan interop API, for sharing a single VkDevice (and associated vulkan +// resources) directly with the API user. The use of this API is a bit sketchy +// and requires careful communication of Vulkan API state. + +struct pl_vulkan_import_params { + // The vulkan instance. Required. + // + // Note: The VkInstance provided by the user *must* be created with a + // VkApplicationInfo.apiVersion of PL_VK_MIN_VERSION or higher. + VkInstance instance; + + // Pointer to `vkGetInstanceProcAddr`. If this is NULL, libplacebo will + // use the directly linked version (if available). + PFN_vkGetInstanceProcAddr get_proc_addr; + + // The physical device selected by the user. Required. + VkPhysicalDevice phys_device; + + // The logical device created by the user. Required. + VkDevice device; + + // --- Logical device parameters + + // List of all device-level extensions that were enabled. (Instance-level + // extensions need not be re-specified here, since it's guaranteed that any + // instance-level extensions that device-level extensions depend on were + // enabled at the instance level) + const char * const *extensions; + int num_extensions; + + // Enabled queue families. At least `queue_graphics` is required. + // + // It's okay for multiple queue families to be specified with the same + // index, e.g. in the event that a dedicated compute queue also happens to + // be the dedicated transfer queue. 
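+ //
+ // For instance (a purely illustrative sketch; the family indices are made
+ // up), an importer whose compute and transfer work share one queue family
+ // might pass:
+ //
+ //   .queue_graphics = { .index = 0, .count = 1 },
+ //   .queue_compute  = { .index = 1, .count = 1 },
+ //   .queue_transfer = { .index = 1, .count = 1 },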
+ // + // It's also okay to leave the queue struct as {0} in the event that no + // dedicated queue exists for a given operation type. libplacebo will + // automatically fall back to using e.g. the graphics queue instead. + struct pl_vulkan_queue queue_graphics; // must support VK_QUEUE_GRAPHICS_BIT + struct pl_vulkan_queue queue_compute; // must support VK_QUEUE_COMPUTE_BIT + struct pl_vulkan_queue queue_transfer; // must support VK_QUEUE_TRANSFER_BIT + + // Enabled VkPhysicalDeviceFeatures. The device *must* be created with + // all of the features in `pl_vulkan_required_features` enabled. + const VkPhysicalDeviceFeatures2 *features; + + // Functions for locking a queue. If set, these will be used instead of + // libplacebo's internal functions for `pl_vulkan.(un)lock_queue`. + void (*lock_queue)(void *ctx, uint32_t qf, uint32_t qidx); + void (*unlock_queue)(void *ctx, uint32_t qf, uint32_t qidx); + void *queue_ctx; + + // --- Misc/debugging options + + // Restrict specific features to e.g. work around driver bugs, or simply + // for testing purposes. See `pl_vulkan_params` for a description of these. + int max_glsl_version; + uint32_t max_api_version; +}; + +#define pl_vulkan_import_params(...) (&(struct pl_vulkan_import_params) { __VA_ARGS__ }) + +// For purely informative reasons, this contains a list of extensions and +// device features that libplacebo *can* make use of. These are all strictly +// optional, but provide a hint to the API user as to what might be worth +// enabling at device creation time. +// +// Note: This also includes physical device features provided by extensions. +// They are all provided using extension-specific features structs, rather +// than the more general purpose VkPhysicalDeviceVulkan11Features etc. +PL_API extern const char * const pl_vulkan_recommended_extensions[]; +PL_API extern const int pl_vulkan_num_recommended_extensions; +PL_API extern const VkPhysicalDeviceFeatures2 pl_vulkan_recommended_features; + +// A list of device features that are required by libplacebo. These +// *must* be provided by imported Vulkan devices. +// +// Note: `pl_vulkan_recommended_features` does not include this list. +PL_API extern const VkPhysicalDeviceFeatures2 pl_vulkan_required_features; + +// Import an existing VkDevice instead of creating a new one, and wrap it into +// a `pl_vulkan` abstraction. It's safe to `pl_vulkan_destroy` this, which will +// destroy application state related to libplacebo but leave the underlying +// VkDevice intact. +PL_API pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params); + +struct pl_vulkan_wrap_params { + // The image itself. It *must* be usable concurrently by all of the queue + // family indices listed in `pl_vulkan->queues`. Note that this requires + // the use of VK_SHARING_MODE_CONCURRENT if `pl_vulkan->num_queues` is + // greater than 1. If this is difficult to achieve for the user, then + // `async_transfer` / `async_compute` should be turned off, which + // guarantees the use of only one queue family. + VkImage image; + + // Which aspect of `image` to wrap. Only useful for wrapping individual + // sub-planes of planar images. If left as 0, it defaults to the entire + // image (i.e. the union of VK_IMAGE_ASPECT_PLANE_N_BIT for planar formats, + // and VK_IMAGE_ASPECT_COLOR_BIT otherwise). + VkImageAspectFlags aspect; + + // The image's dimensions (unused dimensions must be 0) + int width; + int height; + int depth; + + // The image's format. 
libplacebo will try to map this to an equivalent + // pl_fmt. If no compatible pl_fmt is found, wrapping will fail. + VkFormat format; + + // The usage flags the image was created with. libplacebo will set the + // pl_tex capabilities to include whatever it can, as determined by the set + // of enabled usage flags. + VkImageUsageFlags usage; + + // See `pl_tex_params` + void *user_data; + pl_debug_tag debug_tag; +}; + +#define pl_vulkan_wrap_params(...) (&(struct pl_vulkan_wrap_params) { \ + .debug_tag = PL_DEBUG_TAG, \ + __VA_ARGS__ \ + }) + +// Wraps an external VkImage into a pl_tex abstraction. By default, the image +// is considered "held" by the user and must be released before calling any +// pl_tex_* API calls on it (see `pl_vulkan_release`). +// +// This wrapper can be destroyed by simply calling `pl_tex_destroy` on it, +// which will not destroy the underlying VkImage. If a pl_tex wrapper is +// destroyed while an image is not currently being held by the user, that +// image is left in an undefined state. +// +// Wrapping the same VkImage multiple times is undefined behavior, as is trying +// to wrap an image belonging to a different VkDevice than the one in use by +// `gpu`. +// +// This function may fail, in which case it returns NULL. +PL_API pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params); + +// Analogous to `pl_vulkan_wrap`, this function takes any `pl_tex` (including +// ones created by `pl_tex_create`) and unwraps it to expose the underlying +// VkImage to the user. Unlike `pl_vulkan_wrap`, this `pl_tex` is *not* +// considered held after calling this function - the user must explicitly +// `pl_vulkan_hold` before accessing the VkImage. +// +// `out_format` and `out_flags` will be updated to hold the VkImage's +// format and usage flags. (Optional) +PL_API VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex, + VkFormat *out_format, VkImageUsageFlags *out_flags); + +// Represents a vulkan semaphore/value pair (for compatibility with timeline +// semaphores). When using normal, binary semaphores, `value` may be ignored. +typedef struct pl_vulkan_sem { + VkSemaphore sem; + uint64_t value; +} pl_vulkan_sem; + +struct pl_vulkan_hold_params { + // The Vulkan image to hold. It will be marked as held. Attempting to + // perform any pl_tex_* operation (except pl_tex_destroy) on a held image + // is undefined behavior. + pl_tex tex; + + // The layout to transition the image to when holding. Alternatively, a + // pointer to receive the current image layout. If `out_layout` is + // provided, `layout` is ignored. + VkImageLayout layout; + VkImageLayout *out_layout; + + // The queue family index to transition the image to. This can be used with + // VK_QUEUE_FAMILY_EXTERNAL to transition the image to an external API. As + // a special case, if set to VK_QUEUE_FAMILY_IGNORED, libplacebo will not + // transition the image, even if this image was not set up for concurrent + // usage. Ignored for concurrent images. + uint32_t qf; + + // The semaphore to fire when the image is available for use. (Required) + pl_vulkan_sem semaphore; +}; + +#define pl_vulkan_hold_params(...) (&(struct pl_vulkan_hold_params) { __VA_ARGS__ }) + +// "Hold" a shared image, transferring control over the image to the user. +// Returns whether successful. +PL_API bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params); + +struct pl_vulkan_release_params { + // The image to be released. It must be marked as "held". 
Performing any
+ // operation on the VkImage underlying this `pl_tex` while it is not being
+ // held by the user is undefined behavior.
+ pl_tex tex;
+
+ // The current layout of the image at the point in time when `semaphore`
+ // fires, or if no semaphore is specified, at the time of call.
+ VkImageLayout layout;
+
+ // The queue family index to transition the image to. This can be used with
+ // VK_QUEUE_FAMILY_EXTERNAL to transition the image from an external API. As
+ // a special case, if set to VK_QUEUE_FAMILY_IGNORED, libplacebo will not
+ // transition the image, even if this image was not set up for concurrent
+ // usage. Ignored for concurrent images.
+ uint32_t qf;
+
+ // The semaphore to wait on before libplacebo will actually use or modify
+ // the image. (Optional)
+ //
+ // Note: the lifetime of `semaphore` is indeterminate, and destroying it
+ // while the texture is still depending on that semaphore is undefined
+ // behavior.
+ //
+ // Technically, the only way to be sure that it's safe to free is to use
+ // `pl_gpu_finish()` or similar (e.g. `pl_vulkan_destroy` or
+ // `vkDeviceWaitIdle`) after another operation involving `tex` has been
+ // emitted (or the texture has been destroyed).
+ //
+ // Warning: If `tex` is a planar image (`pl_fmt.num_planes > 0`), and
+ // `semaphore` is specified, it *must* be a timeline semaphore! Failure to
+ // respect this will result in undefined behavior. This warning does not
+ // apply to individual planes (as exposed by `pl_tex.planes`).
+ pl_vulkan_sem semaphore;
+};
+
+#define pl_vulkan_release_params(...) (&(struct pl_vulkan_release_params) { __VA_ARGS__ })
+
+// "Release" a shared image, transferring control to libplacebo.
+PL_API void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params);
+
+struct pl_vulkan_sem_params {
+ // The type of semaphore to create.
+ VkSemaphoreType type;
+
+ // For VK_SEMAPHORE_TYPE_TIMELINE, sets the initial timeline value.
+ uint64_t initial_value;
+
+ // If set, exports this VkSemaphore to the handle given in `out_handle`.
+ // The user takes over ownership, and should manually close it before
+ // destroying this VkSemaphore (via `pl_vulkan_sem_destroy`).
+ enum pl_handle_type export_handle;
+ union pl_handle *out_handle;
+
+ // Optional debug tag to identify this semaphore.
+ pl_debug_tag debug_tag;
+};
+
+#define pl_vulkan_sem_params(...) (&(struct pl_vulkan_sem_params) { \
+ .debug_tag = PL_DEBUG_TAG, \
+ __VA_ARGS__ \
+ })
+
+// Helper functions to create and destroy vulkan semaphores. Returns
+// VK_NULL_HANDLE on failure.
+PL_API VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params);
+PL_API void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore);
+
+// Backwards-compatibility wrappers for older versions of the API.
+PL_DEPRECATED PL_API bool pl_vulkan_hold(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+ pl_vulkan_sem sem_out);
+PL_DEPRECATED PL_API bool pl_vulkan_hold_raw(pl_gpu gpu, pl_tex tex, VkImageLayout *out_layout,
+ pl_vulkan_sem sem_out);
+PL_DEPRECATED PL_API void pl_vulkan_release(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+ pl_vulkan_sem sem_in);
+
+PL_API_END
+
+#endif // LIBPLACEBO_VULKAN_H_
diff --git a/src/log.c b/src/log.c new file mode 100644 index 0000000..0829dd3 --- /dev/null +++ b/src/log.c @@ -0,0 +1,471 @@
+/*
+ * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <locale.h> + +#include "common.h" +#include "log.h" +#include "pl_thread.h" + +struct priv { + pl_mutex lock; + enum pl_log_level log_level_cap; + pl_str logbuffer; +}; + +pl_log pl_log_create(int api_ver, const struct pl_log_params *params) +{ + (void) api_ver; + struct pl_log_t *log = pl_zalloc_obj(NULL, log, struct priv); + struct priv *p = PL_PRIV(log); + log->params = *PL_DEF(params, &pl_log_default_params); + pl_mutex_init(&p->lock); + pl_info(log, "Initialized libplacebo %s (API v%d)", PL_VERSION, PL_API_VER); + return log; +} + +const struct pl_log_params pl_log_default_params = {0}; + +void pl_log_destroy(pl_log *plog) +{ + pl_log log = *plog; + if (!log) + return; + + struct priv *p = PL_PRIV(log); + pl_mutex_destroy(&p->lock); + pl_free((void *) log); + *plog = NULL; +} + +struct pl_log_params pl_log_update(pl_log ptr, const struct pl_log_params *params) +{ + struct pl_log_t *log = (struct pl_log_t *) ptr; + if (!log) + return pl_log_default_params; + + struct priv *p = PL_PRIV(log); + pl_mutex_lock(&p->lock); + struct pl_log_params prev_params = log->params; + log->params = *PL_DEF(params, &pl_log_default_params); + pl_mutex_unlock(&p->lock); + + return prev_params; +} + +enum pl_log_level pl_log_level_update(pl_log ptr, enum pl_log_level level) +{ + struct pl_log_t *log = (struct pl_log_t *) ptr; + if (!log) + return PL_LOG_NONE; + + struct priv *p = PL_PRIV(log); + pl_mutex_lock(&p->lock); + enum pl_log_level prev_level = log->params.log_level; + log->params.log_level = level; + pl_mutex_unlock(&p->lock); + + return prev_level; +} + +void pl_log_level_cap(pl_log log, enum pl_log_level cap) +{ + if (!log) + return; + + struct priv *p = PL_PRIV(log); + pl_mutex_lock(&p->lock); + p->log_level_cap = cap; + pl_mutex_unlock(&p->lock); +} + +static FILE *default_stream(void *stream, enum pl_log_level level) +{ + return PL_DEF(stream, level <= PL_LOG_WARN ? 
stderr : stdout); +} + +void pl_log_simple(void *stream, enum pl_log_level level, const char *msg) +{ + static const char *prefix[] = { + [PL_LOG_FATAL] = "fatal", + [PL_LOG_ERR] = "error", + [PL_LOG_WARN] = "warn", + [PL_LOG_INFO] = "info", + [PL_LOG_DEBUG] = "debug", + [PL_LOG_TRACE] = "trace", + }; + + FILE *h = default_stream(stream, level); + fprintf(h, "%5s: %s\n", prefix[level], msg); + if (level <= PL_LOG_WARN) + fflush(h); +} + +void pl_log_color(void *stream, enum pl_log_level level, const char *msg) +{ + static const char *color[] = { + [PL_LOG_FATAL] = "31;1", // bright red + [PL_LOG_ERR] = "31", // red + [PL_LOG_WARN] = "33", // yellow/orange + [PL_LOG_INFO] = "32", // green + [PL_LOG_DEBUG] = "34", // blue + [PL_LOG_TRACE] = "30;1", // bright black + }; + + FILE *h = default_stream(stream, level); + fprintf(h, "\033[%sm%s\033[0m\n", color[level], msg); + if (level <= PL_LOG_WARN) + fflush(h); +} + +static void pl_msg_va(pl_log log, enum pl_log_level lev, + const char *fmt, va_list va) +{ + // Test log message without taking the lock, to avoid thrashing the + // lock for thousands of trace messages unless those are actually + // enabled. This may be a false negative, in which case log messages may + // be lost as a result. But this shouldn't be a big deal, since any + // situation leading to lost log messages would itself be a race condition. + if (!pl_msg_test(log, lev)) + return; + + // Re-test the log message level with held lock to avoid false positives, + // which would be a considerably bigger deal than false negatives + struct priv *p = PL_PRIV(log); + pl_mutex_lock(&p->lock); + + // Apply this cap before re-testing the log level, to avoid giving users + // messages that should have been dropped by the log level. + lev = PL_MAX(lev, p->log_level_cap); + if (!pl_msg_test(log, lev)) + goto done; + + p->logbuffer.len = 0; + pl_str_append_vasprintf((void *) log, &p->logbuffer, fmt, va); + log->params.log_cb(log->params.log_priv, lev, (char *) p->logbuffer.buf); + +done: + pl_mutex_unlock(&p->lock); +} + +void pl_msg(pl_log log, enum pl_log_level lev, const char *fmt, ...) +{ + va_list va; + va_start(va, fmt); + pl_msg_va(log, lev, fmt, va); + va_end(va); +} + +void pl_msg_source(pl_log log, enum pl_log_level lev, const char *src) +{ + if (!pl_msg_test(log, lev) || !src) + return; + + int line = 1; + while (*src) { + const char *end = strchr(src, '\n'); + if (!end) { + pl_msg(log, lev, "[%3d] %s", line, src); + break; + } + + pl_msg(log, lev, "[%3d] %.*s", line, (int)(end - src), src); + src = end + 1; + line++; + } +} + +#ifdef PL_HAVE_DBGHELP + +#include <windows.h> +#include <dbghelp.h> +#include <shlwapi.h> + +// https://github.com/llvm/llvm-project/blob/f03cd763384bbb67ddfa12957859ed58841d4b34/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h#L85-L106 +static inline uintptr_t get_prev_inst_pc(uintptr_t pc) { +#if defined(__arm__) + // T32 (Thumb) branch instructions might be 16 or 32 bit long, + // so we return (pc-2) in that case in order to be safe. + // For A32 mode we return (pc-4) because all instructions are 32 bit long. 
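For orientation, the public entry points implemented at the top of this file are normally used along these lines; `pl_log_params(...)` is assumed to be the usual designated-initializer helper from the public <libplacebo/log.h> header, and the callback is the `pl_log_color` helper defined above.

    #include <libplacebo/log.h>

    // Hedged sketch of typical pl_log usage with the callbacks defined above
    static pl_log make_logger(void)
    {
        pl_log log = pl_log_create(PL_API_VER, pl_log_params(
            .log_cb    = pl_log_color,  // or pl_log_simple for plain output
            .log_level = PL_LOG_INFO,
        ));

        // The verbosity can be changed later without recreating the logger:
        pl_log_level_update(log, PL_LOG_DEBUG);
        return log;                     // eventually: pl_log_destroy(&log);
    }
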
+ return (pc - 3) & (~1); +#elif defined(__x86_64__) || defined(__i386__) + return pc - 1; +#else + return pc - 4; +#endif +} + +static DWORD64 get_preferred_base(const char *module) +{ + DWORD64 image_base = 0; + HANDLE file_mapping = NULL; + HANDLE file_view = NULL; + + HANDLE file = CreateFile(module, GENERIC_READ, FILE_SHARE_READ, NULL, + OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); + if (file == INVALID_HANDLE_VALUE) + goto done; + + file_mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL); + if (file_mapping == NULL) + goto done; + + file_view = MapViewOfFile(file_mapping, FILE_MAP_READ, 0, 0, 0); + if (file_view == NULL) + goto done; + + PIMAGE_DOS_HEADER dos_header = (PIMAGE_DOS_HEADER) file_view; + if (dos_header->e_magic != IMAGE_DOS_SIGNATURE) + goto done; + + PIMAGE_NT_HEADERS pe_header = (PIMAGE_NT_HEADERS) ((char *) file_view + + dos_header->e_lfanew); + if (pe_header->Signature != IMAGE_NT_SIGNATURE) + goto done; + + if (pe_header->FileHeader.SizeOfOptionalHeader != sizeof(pe_header->OptionalHeader)) + goto done; + + image_base = pe_header->OptionalHeader.ImageBase; + +done: + if (file_view) + UnmapViewOfFile(file_view); + if (file_mapping) + CloseHandle(file_mapping); + if (file != INVALID_HANDLE_VALUE) + CloseHandle(file); + + return image_base; +} + +void pl_log_stack_trace(pl_log log, enum pl_log_level lev) +{ + if (!pl_msg_test(log, lev)) + return; + + void *tmp = pl_tmp(NULL); + PL_ARRAY(void *) frames = {0}; + + size_t capacity = 16; + do { + capacity *= 2; + PL_ARRAY_RESIZE(tmp, frames, capacity); + // Skip first frame, we don't care about this function + frames.num = CaptureStackBackTrace(1, capacity, frames.elem, NULL); + } while (capacity == frames.num); + + if (!frames.num) { + pl_free(tmp); + return; + } + + // Load dbghelp on demand. While it is available on all Windows versions, + // no need to keep it loaded all the time as stack trace printing function, + // in theory should be used repetitively rarely. + HANDLE process = GetCurrentProcess(); + HMODULE dbghelp = LoadLibrary("dbghelp.dll"); + DWORD options; + SYMBOL_INFO *symbol = NULL; + BOOL use_dbghelp = !!dbghelp; + +#define DBGHELP_SYM(sym) \ + __typeof__(&sym) p##sym = (__typeof__(&sym))(void *) GetProcAddress(dbghelp, #sym); \ + use_dbghelp &= !!p##sym + + DBGHELP_SYM(SymCleanup); + DBGHELP_SYM(SymFromAddr); + DBGHELP_SYM(SymGetLineFromAddr64); + DBGHELP_SYM(SymGetModuleInfo64); + DBGHELP_SYM(SymGetOptions); + DBGHELP_SYM(SymGetSearchPathW); + DBGHELP_SYM(SymInitialize); + DBGHELP_SYM(SymSetOptions); + DBGHELP_SYM(SymSetSearchPathW); + +#undef DBGHELP_SYM + + struct priv *p = PL_PRIV(log); + PL_ARRAY(wchar_t) base_search = { .num = 1024 }; + + if (use_dbghelp) { + // DbgHelp is not thread-safe. Note that on Windows mutex is recursive, + // so no need to unlock before calling pl_msg. + pl_mutex_lock(&p->lock); + + options = pSymGetOptions(); + pSymSetOptions(SYMOPT_UNDNAME | SYMOPT_DEFERRED_LOADS | + SYMOPT_LOAD_LINES | SYMOPT_FAVOR_COMPRESSED); + use_dbghelp &= pSymInitialize(process, NULL, TRUE); + + if (use_dbghelp) { + symbol = pl_alloc(tmp, sizeof(SYMBOL_INFO) + 512); + symbol->SizeOfStruct = sizeof(SYMBOL_INFO); + symbol->MaxNameLen = 512; + + PL_ARRAY_RESIZE(tmp, base_search, base_search.num); + BOOL ret = pSymGetSearchPathW(process, base_search.elem, + base_search.num); + base_search.num = ret ? 
wcslen(base_search.elem) : 0; + PL_ARRAY_APPEND(tmp, base_search, L'\0'); + } else { + pSymSetOptions(options); + pl_mutex_unlock(&p->lock); + } + } + + pl_msg(log, lev, " Backtrace:"); + for (int n = 0; n < frames.num; n++) { + uintptr_t pc = get_prev_inst_pc((uintptr_t) frames.elem[n]); + pl_str out = {0}; + pl_str_append_asprintf(tmp, &out, " #%-2d 0x%"PRIxPTR, n, pc); + + MEMORY_BASIC_INFORMATION meminfo = {0}; + char module_path[MAX_PATH] = {0}; + if (VirtualQuery((LPCVOID) pc, &meminfo, sizeof(meminfo))) { + DWORD sz = GetModuleFileNameA(meminfo.AllocationBase, module_path, + sizeof(module_path)); + if (sz == sizeof(module_path)) + pl_msg(log, PL_LOG_ERR, "module path truncated"); + + if (use_dbghelp) { + // According to documentation it should search in "The directory + // that contains the corresponding module.", but it doesn't appear + // to work, so manually set the path to module path. + // https://learn.microsoft.com/windows/win32/debug/symbol-paths + PL_ARRAY(wchar_t) mod_search = { .num = MAX_PATH }; + PL_ARRAY_RESIZE(tmp, mod_search, mod_search.num); + + sz = GetModuleFileNameW(meminfo.AllocationBase, + mod_search.elem, mod_search.num); + + if (sz > 0 && sz != MAX_PATH && + // TODO: Replace with PathCchRemoveFileSpec once mingw-w64 + // >= 8.0.1 is commonly available, at the time of writing + // there are a few high profile Linux distributions that ship + // 8.0.0. + PathRemoveFileSpecW(mod_search.elem)) + { + mod_search.num = wcslen(mod_search.elem); + PL_ARRAY_APPEND(tmp, mod_search, L';'); + PL_ARRAY_CONCAT(tmp, mod_search, base_search); + pSymSetSearchPathW(process, mod_search.elem); + } + } + } + + DWORD64 sym_displacement; + if (use_dbghelp && pSymFromAddr(process, pc, &sym_displacement, symbol)) + pl_str_append_asprintf(tmp, &out, " in %s+0x%llx", + symbol->Name, sym_displacement); + + DWORD line_displacement; + IMAGEHLP_LINE64 line = {sizeof(line)}; + if (use_dbghelp && + pSymGetLineFromAddr64(process, pc, &line_displacement, &line)) + { + pl_str_append_asprintf(tmp, &out, " %s:%lu+0x%lx", line.FileName, + line.LineNumber, line_displacement); + goto done; + } + + // LLVM tools by convention use absolute addresses with "prefered" base + // image offset. We need to read this offset from binary, because due to + // ASLR we are not loaded at this base. While Windows tools like WinDbg + // expect relative offset to image base. So to be able to easily use it + // with both worlds, print both values. + DWORD64 module_base = get_preferred_base(module_path); + pl_str_append_asprintf(tmp, &out, " (%s+0x%"PRIxPTR") (0x%llx)", module_path, + pc - (uintptr_t) meminfo.AllocationBase, + module_base + (pc - (uintptr_t) meminfo.AllocationBase)); + +done: + pl_msg(log, lev, "%s", out.buf); + } + + if (use_dbghelp) { + pSymSetOptions(options); + pSymCleanup(process); + pl_mutex_unlock(&p->lock); + } + // Unload dbghelp. Maybe it is better to keep it loaded? 
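To make the dual-address output above concrete, here is a small worked example with made-up numbers:

    // Worked example (made-up values): suppose module.exe has a preferred base
    // of 0x140000000, was actually loaded at AllocationBase 0x7ff650000000 due
    // to ASLR, and the captured pc is 0x7ff650003456. Then:
    //   pc - AllocationBase          = 0x3456       (relative offset, WinDbg-style)
    //   preferred base + that offset = 0x140003456  (absolute address, LLVM-style)
    // and the frame is printed as "... (module.exe+0x3456) (0x140003456)".
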
+ if (dbghelp) + FreeLibrary(dbghelp); + pl_free(tmp); +} + +#elif defined(PL_HAVE_UNWIND) +#define UNW_LOCAL_ONLY +#include <libunwind.h> +#include <dlfcn.h> + +void pl_log_stack_trace(pl_log log, enum pl_log_level lev) +{ + if (!pl_msg_test(log, lev)) + return; + + unw_cursor_t cursor; + unw_context_t uc; + unw_word_t ip, off; + unw_getcontext(&uc); + unw_init_local(&cursor, &uc); + + int depth = 0; + pl_msg(log, lev, " Backtrace:"); + while (unw_step(&cursor) > 0) { + char symbol[256] = "<unknown>"; + Dl_info info = { + .dli_fname = "<unknown>", + }; + + unw_get_reg(&cursor, UNW_REG_IP, &ip); + unw_get_proc_name(&cursor, symbol, sizeof(symbol), &off); + dladdr((void *) (uintptr_t) ip, &info); + pl_msg(log, lev, " #%-2d 0x%016" PRIxPTR " in %s+0x%" PRIxPTR" at %s+0x%" PRIxPTR, + depth++, ip, symbol, off, info.dli_fname, ip - (uintptr_t) info.dli_fbase); + } +} + +#elif defined(PL_HAVE_EXECINFO) +#include <execinfo.h> + +void pl_log_stack_trace(pl_log log, enum pl_log_level lev) +{ + if (!pl_msg_test(log, lev)) + return; + + PL_ARRAY(void *) buf = {0}; + size_t buf_avail = 16; + do { + buf_avail *= 2; + PL_ARRAY_RESIZE(NULL, buf, buf_avail); + buf.num = backtrace(buf.elem, buf_avail); + } while (buf.num == buf_avail); + + pl_msg(log, lev, " Backtrace:"); + char **strings = backtrace_symbols(buf.elem, buf.num); + for (int i = 1; i < buf.num; i++) + pl_msg(log, lev, " #%-2d %s", i - 1, strings[i]); + + free(strings); + pl_free(buf.elem); +} + +#else +void pl_log_stack_trace(pl_log log, enum pl_log_level lev) { } +#endif diff --git a/src/log.h b/src/log.h new file mode 100644 index 0000000..dcf8d28 --- /dev/null +++ b/src/log.h @@ -0,0 +1,84 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdarg.h> + +#include "common.h" + +#include <libplacebo/log.h> + +// Internal logging-related functions + +// Warning: Not entirely thread-safe. Exercise caution when using. May result +// in either false positives or false negatives. Make sure to re-run this +// function while `lock` is held, to ensure no race conditions on the check. +static inline bool pl_msg_test(pl_log log, enum pl_log_level lev) +{ + return log && log->params.log_cb && log->params.log_level >= lev; +} + +void pl_msg(pl_log log, enum pl_log_level lev, const char *fmt, ...) + PL_PRINTF(3, 4); + +// Convenience macros +#define pl_fatal(log, ...) pl_msg(log, PL_LOG_FATAL, __VA_ARGS__) +#define pl_err(log, ...) pl_msg(log, PL_LOG_ERR, __VA_ARGS__) +#define pl_warn(log, ...) pl_msg(log, PL_LOG_WARN, __VA_ARGS__) +#define pl_info(log, ...) pl_msg(log, PL_LOG_INFO, __VA_ARGS__) +#define pl_debug(log, ...) pl_msg(log, PL_LOG_DEBUG, __VA_ARGS__) +#define pl_trace(log, ...) pl_msg(log, PL_LOG_TRACE, __VA_ARGS__) + +#define PL_MSG(obj, lev, ...) pl_msg((obj)->log, lev, __VA_ARGS__) + +#define PL_FATAL(obj, ...) 
PL_MSG(obj, PL_LOG_FATAL, __VA_ARGS__) +#define PL_ERR(obj, ...) PL_MSG(obj, PL_LOG_ERR, __VA_ARGS__) +#define PL_WARN(obj, ...) PL_MSG(obj, PL_LOG_WARN, __VA_ARGS__) +#define PL_INFO(obj, ...) PL_MSG(obj, PL_LOG_INFO, __VA_ARGS__) +#define PL_DEBUG(obj, ...) PL_MSG(obj, PL_LOG_DEBUG, __VA_ARGS__) +#define PL_TRACE(obj, ...) PL_MSG(obj, PL_LOG_TRACE, __VA_ARGS__) + +// Log something with line numbers included +void pl_msg_source(pl_log log, enum pl_log_level lev, const char *src); + +// Temporarily cap the log level to a certain verbosity. This is intended for +// things like probing formats, attempting to create buffers that may fail, and +// other types of operations in which we want to suppress errors. Call with +// PL_LOG_NONE to disable this cap. +// +// Warning: This is generally not thread-safe, and only provided as a temporary +// hack until a better solution can be thought of. +void pl_log_level_cap(pl_log log, enum pl_log_level cap); + +// CPU execution time reporting helper +static inline void pl_log_cpu_time(pl_log log, pl_clock_t start, pl_clock_t stop, + const char *operation) +{ + double ms = pl_clock_diff(stop, start) * 1e3; + enum pl_log_level lev = PL_LOG_DEBUG; + if (ms > 10) + lev = PL_LOG_INFO; + if (ms > 1000) + lev = PL_LOG_WARN; + + pl_msg(log, lev, "Spent %.3f ms %s%s", ms, operation, + ms > 100 ? " (slow!)" : ""); +} + +// Log stack trace +PL_NOINLINE void pl_log_stack_trace(pl_log log, enum pl_log_level lev); diff --git a/src/meson.build b/src/meson.build new file mode 100644 index 0000000..63f9d53 --- /dev/null +++ b/src/meson.build @@ -0,0 +1,347 @@ +### Common dependencies +unwind = dependency('libunwind', required: get_option('unwind')) +libexecinfo = cc.find_library('execinfo', required: false) +has_execinfo = cc.has_function('backtrace_symbols', dependencies: libexecinfo, prefix: '#include <execinfo.h>') +dbghelp = cc.check_header('dbghelp.h', prefix: '#include <windows.h>') +conf_internal.set('PL_HAVE_DBGHELP', dbghelp) +conf_internal.set('PL_HAVE_UNWIND', unwind.found()) +conf_internal.set('PL_HAVE_EXECINFO', has_execinfo) +if dbghelp + build_deps += cc.find_library('shlwapi', required: true) +elif unwind.found() + build_deps += [unwind, cc.find_library('dl', required : false)] +elif has_execinfo + build_deps += libexecinfo +endif + +link_args = [] +link_depends = [] + +# Looks like meson in certain configuration returns ' ' instead of empty string +mingw32 = cc.get_define('__MINGW32__').strip() +if host_machine.system() == 'windows' and mingw32 != '' and host_machine.cpu() in ['aarch64', 'arm', 'x86_64'] + # MinGW-w64 math functions are significantly slower than the UCRT ones. + # In particular powf is over 7 times slower than UCRT counterpart. + # MinGW-w64 explicitly excludes some math functions from their ucrtbase def + # file and replaces with own versions. To workaround the issue, generate the + # import library and link it with UCRT versions of math functions. + dlltool = find_program('llvm-dlltool', 'dlltool') + ucrt_math = custom_target('ucrt_math.lib', + output : ['ucrt_math.lib'], + input : 'ucrt_math.def', + command : [dlltool, '-d', '@INPUT@', '-l', '@OUTPUT@']) + link_args += ucrt_math.full_path() + link_depends += ucrt_math + # MinGW-w64 inlines functions like powf, rewriting them to pow. We want to use + # the powf specialization from UCRT, so disable inlining. 
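Before the build-system details continue, a quick usage note for the `pl_log_cpu_time` helper defined at the end of src/log.h above; this sketch assumes a `pl_clock_now()` helper from the matching internal clock header, and `upload_all_textures()` is a placeholder for whatever work is being measured.

    // Hedged sketch: timing an operation with pl_log_cpu_time()
    static void timed_upload(pl_log log)
    {
        pl_clock_t start = pl_clock_now();  // assumed helper, see pl_clock.h
        upload_all_textures();              // placeholder for the measured work
        pl_log_cpu_time(log, start, pl_clock_now(), "uploading textures");
        // -> "Spent 123.456 ms uploading textures (slow!)" at PL_LOG_INFO
    }
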
+ add_project_arguments(['-D__CRT__NO_INLINE'], language: ['c', 'cpp']) +endif + +# Work around missing atomics on some (obscure) platforms +atomic_test = ''' +#include <stdatomic.h> +#include <stdint.h> +int main(void) { + _Atomic uint32_t x32; + atomic_init(&x32, 0); +}''' + +if not cc.links(atomic_test) + build_deps += cc.find_library('atomic') +endif + + +### Common source files +headers = [ + 'cache.h', + 'colorspace.h', + 'common.h', + 'd3d11.h', + 'dispatch.h', + 'dither.h', + 'dummy.h', + 'filters.h', + 'gamut_mapping.h', + 'gpu.h', + 'log.h', + 'opengl.h', + 'options.h', + 'renderer.h', + 'shaders/colorspace.h', + 'shaders/custom.h', + 'shaders/deinterlacing.h', + 'shaders/dithering.h', + 'shaders/film_grain.h', + 'shaders/icc.h', + 'shaders/lut.h', + 'shaders/sampling.h', + 'shaders.h', + 'swapchain.h', + 'tone_mapping.h', + 'utils/dav1d.h', + 'utils/dav1d_internal.h', + 'utils/dolbyvision.h', + 'utils/frame_queue.h', + 'utils/libav.h', + 'utils/libav_internal.h', + 'utils/upload.h', + 'vulkan.h', +] + +sources = [ + 'cache.c', + 'colorspace.c', + 'common.c', + 'convert.cc', + 'dither.c', + 'dispatch.c', + 'dummy.c', + 'filters.c', + 'format.c', + 'gamut_mapping.c', + 'glsl/spirv.c', + 'gpu.c', + 'gpu/utils.c', + 'log.c', + 'options.c', + 'pl_alloc.c', + 'pl_string.c', + 'swapchain.c', + 'tone_mapping.c', + 'utils/dolbyvision.c', + 'utils/frame_queue.c', + 'utils/upload.c', +] + +# Source files that may use GLSL pragmas, we need to use custom_target +# to the proper environment and dependency information for these +foreach f : ['renderer.c', 'shaders.c'] + sources += custom_target(f, + command: glsl_preproc, + depend_files: glsl_deps, + env: python_env, + input: f, + output: f, + ) +endforeach + +# More .c files defined here, we can't put them in this file because of meson +# preventing the use of / in custom_target output filenames +subdir('shaders') + +tests = [ + 'cache.c', + 'colorspace.c', + 'common.c', + 'dither.c', + 'dummy.c', + 'lut.c', + 'filters.c', + 'options.c', + 'string.c', + 'tone_mapping.c', + 'utils.c', +] + +fuzzers = [ + 'lut.c', + 'options.c', + 'shaders.c', + 'user_shaders.c', +] + +components = configuration_data() + + +### Optional dependencies / components +subdir('glsl') +subdir('d3d11') +subdir('opengl') +subdir('vulkan') + +lcms = dependency('lcms2', version: '>=2.9', required: get_option('lcms')) +components.set('lcms', lcms.found()) +if lcms.found() + build_deps += lcms + tests += 'icc.c' +endif + +# Check to see if libplacebo built this way is sane +if not (components.get('vulkan') or components.get('opengl') or components.get('d3d11')) + warning('Building without any graphics API. libplacebo built this way still ' + + 'has some limited use (e.g. generating GLSL shaders), but most of ' + + 'its functionality will be missing or impaired!') +endif + +has_spirv = components.get('shaderc') or components.get('glslang') +needs_spirv = components.get('vulkan') or components.get('d3d11') +if needs_spirv and not has_spirv + warning('Building without any GLSL compiler (shaderc, glslang), but with ' + + 'APIs required that require one (vulkan, d3d11). 
This build is very ' + + 'likely to be very limited in functionality!') +endif + +dovi = get_option('dovi') +components.set('dovi', dovi.allowed()) + +libdovi = dependency('dovi', version: '>=1.6.7', required: get_option('libdovi').require(dovi.allowed())) +components.set('libdovi', libdovi.found()) +if libdovi.found() + build_deps += libdovi +endif + +xxhash_inc = include_directories() +xxhash = dependency('libxxhash', required: get_option('xxhash')) +components.set('xxhash', xxhash.found()) +if xxhash.found() + xxhash_inc = xxhash.get_variable('includedir') +endif + +# Generate configuration files +defs = '' +pc_vars = [] + +foreach comp : components.keys() + found = components.get(comp) + varname = comp.underscorify().to_upper() + summary(comp, found, section: 'Optional features', bool_yn: true) + defs += (found ? '#define PL_HAVE_@0@ 1\n' : '#undef PL_HAVE_@0@\n').format(varname) + pc_vars += 'pl_has_@0@=@1@'.format(varname.to_lower(), found ? 1 : 0) +endforeach + +conf_public.set('extra_defs', defs) +subdir('./include/libplacebo') # generate config.h in the right location +sources += configure_file( + output: 'config_internal.h', + configuration: conf_internal +) + +version_h = vcs_tag( + command: ['git', 'describe', '--dirty'], + fallback: version_pretty, + replace_string: '@buildver@', + input: 'version.h.in', + output: 'version.h', +) + +sources += version_h + +if host_machine.system() == 'windows' + windows = import('windows') + sources += windows.compile_resources(libplacebo_rc, depends: version_h, + include_directories: meson.project_source_root()/'win32') +endif + +fast_float_inc = include_directories() +if fs.is_dir('../3rdparty/fast_float/include') + fast_float_inc = include_directories('../3rdparty/fast_float/include') +endif + +### Main library build process +inc = include_directories('./include') +lib = library('placebo', sources, + c_args: ['-DPL_EXPORT'], + install: true, + dependencies: build_deps + glad_dep, + soversion: apiver, + include_directories: [ inc, vulkan_headers_inc, fast_float_inc, xxhash_inc ], + link_args: link_args, + link_depends: link_depends, + gnu_symbol_visibility: 'hidden', + name_prefix: 'lib' +) + +libplacebo = declare_dependency( + include_directories: inc, + compile_args: get_option('default_library') == 'static' ? ['-DPL_STATIC'] : [], + link_with: lib, + variables: pc_vars, +) + + +### Install process +proj_name = meson.project_name() +foreach h : headers + parts = h.split('/') + path = proj_name + foreach p : parts + if p != parts[-1] + path = path / p + endif + endforeach + + install_headers('include' / proj_name / h, subdir: path) +endforeach + +extra_cflags = [] +if get_option('default_library') == 'static' + extra_cflags = ['-DPL_STATIC'] +elif get_option('default_library') == 'both' + # meson doesn't support Cflags.private, insert it forcefully... 
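The `PL_HAVE_*` macros emitted by the loop above land in the generated <libplacebo/config.h>, and the matching `pl_has_*` variables end up in the pkg-config file (queryable with `pkg-config --variable=pl_has_vulkan libplacebo`). A hedged sketch of downstream code branching on one of them:

    #include <libplacebo/config.h>  // carries the generated PL_HAVE_* definitions

    #if PL_HAVE_LCMS
    // lcms2 >= 2.9 was found at build time, so ICC profile support is compiled in
    #endif

    #ifndef PL_HAVE_D3D11
    // this build has no d3d11 backend
    #endif
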
+ extra_cflags = ['\nCflags.private:', '-DPL_STATIC'] +endif + +pkg = import('pkgconfig') +pkg.generate( + name: proj_name, + description: 'Reusable library for GPU-accelerated video/image rendering', + libraries: lib, + version: version, + variables: pc_vars, + extra_cflags: extra_cflags, +) + + +### Testing +tdep_static = declare_dependency( + dependencies: build_deps, + include_directories: [ inc, include_directories('.') ], + compile_args: '-DPL_STATIC' + # TODO: Define objects here once Meson 1.1.0 is ok to use + # objects: lib.extract_all_objects(recursive: false) + ) + +tdep_shared = declare_dependency( + include_directories: [ inc, include_directories('.') ], + compile_args: get_option('default_library') == 'static' ? ['-DPL_STATIC'] : [], + link_with: lib, + ) + +if get_option('tests') + subdir('tests') +endif + +if get_option('bench') + if not components.get('vk-proc-addr') + error('Compiling the benchmark suite requires vulkan support!') + endif + + bench = executable('bench', + 'tests/bench.c', + dependencies: [tdep_shared, vulkan_headers], + link_args: link_args, + link_depends: link_depends, + include_directories: vulkan_headers_inc, + ) + test('benchmark', bench, is_parallel: false, timeout: 600) +endif + +if get_option('fuzz') + foreach f : fuzzers + executable('fuzz.' + f, 'tests/fuzz/' + f, + objects: lib.extract_all_objects(recursive: false), + dependencies: tdep_static, + link_args: link_args, + link_depends: link_depends, + ) + endforeach +endif + +pl_thread = declare_dependency( + include_directories: include_directories('.'), + dependencies: threads, +) + +pl_clock = declare_dependency( + include_directories: include_directories('.'), +) diff --git a/src/opengl/common.h b/src/opengl/common.h new file mode 100644 index 0000000..c84c69f --- /dev/null +++ b/src/opengl/common.h @@ -0,0 +1,66 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "../common.h" +#include "../log.h" +#include "../gpu.h" +#include "pl_thread.h" + +#include <libplacebo/opengl.h> + +// Collision with llvm-mingw <winnt.h> +#undef MemoryBarrier + +#define GLAD_GL +#define GLAD_GLES2 +#include <glad/gl.h> +#include <glad/egl.h> + +typedef GladGLContext gl_funcs; + +// PL_PRIV(pl_opengl) +struct gl_ctx { + pl_log log; + struct pl_opengl_params params; + bool is_debug; + bool is_debug_egl; + bool is_gles; + + // For context locking + pl_mutex lock; + int count; + + // Dispatch table + gl_funcs func; +}; + +struct gl_cb { + void (*callback)(void *priv); + void *priv; + GLsync sync; +}; + +struct fbo_format { + pl_fmt fmt; + const struct gl_format *glfmt; +}; + +// For locking/unlocking +bool gl_make_current(pl_opengl gl); +void gl_release_current(pl_opengl gl); diff --git a/src/opengl/context.c b/src/opengl/context.c new file mode 100644 index 0000000..6ca14b8 --- /dev/null +++ b/src/opengl/context.c @@ -0,0 +1,332 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <ctype.h> + +#include "common.h" +#include "utils.h" +#include "gpu.h" + +const struct pl_opengl_params pl_opengl_default_params = {0}; + +static void GLAPIENTRY debug_cb(GLenum source, GLenum type, GLuint id, + GLenum severity, GLsizei length, + const GLchar *message, const void *userParam) +{ + pl_log log = (void *) userParam; + enum pl_log_level level = PL_LOG_ERR; + + switch (severity) { + case GL_DEBUG_SEVERITY_NOTIFICATION:level = PL_LOG_DEBUG; break; + case GL_DEBUG_SEVERITY_LOW: level = PL_LOG_INFO; break; + case GL_DEBUG_SEVERITY_MEDIUM: level = PL_LOG_WARN; break; + case GL_DEBUG_SEVERITY_HIGH: level = PL_LOG_ERR; break; + } + + pl_msg(log, level, "GL: %s", message); + + if (level <= PL_LOG_ERR) + pl_log_stack_trace(log, level); +} + +static void GLAPIENTRY debug_cb_egl(EGLenum error, const char *command, + EGLint messageType, EGLLabelKHR threadLabel, + EGLLabelKHR objectLabel, const char *message) +{ + pl_log log = threadLabel; + enum pl_log_level level = PL_LOG_ERR; + + switch (messageType) { + case EGL_DEBUG_MSG_CRITICAL_KHR: level = PL_LOG_FATAL; break; + case EGL_DEBUG_MSG_ERROR_KHR: level = PL_LOG_ERR; break; + case EGL_DEBUG_MSG_WARN_KHR: level = PL_LOG_WARN; break; + case EGL_DEBUG_MSG_INFO_KHR: level = PL_LOG_DEBUG; break; + } + + pl_msg(log, level, "EGL: %s: %s %s", command, egl_err_str(error), + message); + + if (level <= PL_LOG_ERR) + pl_log_stack_trace(log, level); +} + +// Guards access to the (thread-unsafe) glad global EGL state +static pl_static_mutex glad_egl_mutex = PL_STATIC_MUTEX_INITIALIZER; + +void pl_opengl_destroy(pl_opengl *ptr) +{ + pl_opengl pl_gl = *ptr; + if (!pl_gl) + return; + + struct gl_ctx *p = PL_PRIV(pl_gl); + gl_funcs *gl = &p->func; + if (!gl_make_current(pl_gl)) { + PL_WARN(p, "Failed uninitializing OpenGL context, leaking resources!"); + 
return; + } + + if (p->is_debug) + gl->DebugMessageCallback(NULL, NULL); + + if (p->is_debug_egl) + eglDebugMessageControlKHR(NULL, NULL); + + pl_gpu_destroy(pl_gl->gpu); + +#ifdef PL_HAVE_GL_PROC_ADDR + if (p->is_gles) { + gladLoaderUnloadGLES2Context(gl); + } else { + gladLoaderUnloadGLContext(gl); + } + + bool used_loader = !p->params.get_proc_addr && !p->params.get_proc_addr_ex; + if (p->params.egl_display && used_loader) { + pl_static_mutex_lock(&glad_egl_mutex); + gladLoaderUnloadEGL(); + pl_static_mutex_unlock(&glad_egl_mutex); + } +#endif + + gl_release_current(pl_gl); + pl_mutex_destroy(&p->lock); + pl_free_ptr((void **) ptr); + +} + +typedef PL_ARRAY(const char *) ext_arr_t; +static void add_exts_str(void *alloc, ext_arr_t *arr, const char *extstr) +{ + pl_str rest = pl_str_strip(pl_str0(pl_strdup0(alloc, pl_str0(extstr)))); + while (rest.len) { + pl_str ext = pl_str_split_char(rest, ' ', &rest); + ext.buf[ext.len] = '\0'; // re-use separator for terminator + PL_ARRAY_APPEND(alloc, *arr, (char *) ext.buf); + } +} + +pl_opengl pl_opengl_create(pl_log log, const struct pl_opengl_params *params) +{ + params = PL_DEF(params, &pl_opengl_default_params); + struct pl_opengl_t *pl_gl = pl_zalloc_obj(NULL, pl_gl, struct gl_ctx); + struct gl_ctx *p = PL_PRIV(pl_gl); + gl_funcs *gl = &p->func; + p->params = *params; + p->log = log; + + pl_mutex_init_type(&p->lock, PL_MUTEX_RECURSIVE); + if (!gl_make_current(pl_gl)) { + pl_free(pl_gl); + return NULL; + } + + bool ok; + if (params->get_proc_addr_ex) { + ok = gladLoadGLContextUserPtr(gl, params->get_proc_addr_ex, params->proc_ctx); + } else if (params->get_proc_addr) { + ok = gladLoadGLContext(gl, params->get_proc_addr); + } else { +#ifdef PL_HAVE_GL_PROC_ADDR + ok = gladLoaderLoadGLContext(gl); +#else + PL_FATAL(p, "No `glGetProcAddress` function provided, and libplacebo " + "built without its built-in OpenGL loader!"); + goto error; +#endif + } + + if (!ok) { + PL_INFO(p, "Failed loading core GL, retrying as GLES..."); + } else if (gl_is_gles(pl_gl)) { + PL_INFO(p, "GL context seems to be OpenGL ES, reloading as GLES..."); + ok = false; + } + + if (!ok) { + memset(gl, 0, sizeof(*gl)); + if (params->get_proc_addr_ex) { + ok = gladLoadGLES2ContextUserPtr(gl, params->get_proc_addr_ex, params->proc_ctx); + } else if (params->get_proc_addr) { + ok = gladLoadGLES2Context(gl, params->get_proc_addr); + } else { +#ifdef PL_HAVE_GL_PROC_ADDR + ok = gladLoaderLoadGLES2Context(gl); +#else + pl_unreachable(); +#endif + } + p->is_gles = ok; + } + + if (!ok) { + PL_FATAL(p, "Failed to initialize OpenGL context - make sure a valid " + "OpenGL context is bound to the current thread!"); + goto error; + } + + const char *version = (const char *) gl->GetString(GL_VERSION); + if (version) { + const char *ver = version; + while (!isdigit(*ver) && *ver != '\0') + ver++; + if (sscanf(ver, "%d.%d", &pl_gl->major, &pl_gl->minor) != 2) { + PL_FATAL(p, "Invalid GL_VERSION string: %s\n", version); + goto error; + } + } + + if (!pl_gl->major) { + PL_FATAL(p, "No OpenGL version detected - make sure an OpenGL context " + "is bound to the current thread!"); + goto error; + } + + static const int gl_ver_req = 3; + if (pl_gl->major < gl_ver_req) { + PL_FATAL(p, "OpenGL version too old (%d < %d), please use a newer " + "OpenGL implementation or downgrade libplacebo!", + pl_gl->major, gl_ver_req); + goto error; + } + + PL_INFO(p, "Detected OpenGL version strings:"); + PL_INFO(p, " GL_VERSION: %s", version); + PL_INFO(p, " GL_VENDOR: %s", (char *) gl->GetString(GL_VENDOR)); + 
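For orientation, the constructor implemented here is typically invoked with a GL context already made current on the calling thread, relying on the built-in loader (i.e. no get_proc_addr callback); `pl_opengl_params(...)` is assumed to be the usual designated-initializer helper from the public header.

    #include <libplacebo/opengl.h>

    // Hedged sketch: wrap whatever GL context the windowing library has bound
    static pl_opengl init_gl(pl_log log)
    {
        pl_opengl gl = pl_opengl_create(log, pl_opengl_params(
            .debug          = true,   // installs the GL_KHR_debug callback above
            .allow_software = false,  // reject suspected software rasterizers
        ));
        if (!gl)
            return NULL;              // the reason was logged via PL_FATAL above

        // gl->gpu is the pl_gpu abstraction backed by this context; destroy
        // everything later with pl_opengl_destroy(&gl).
        return gl;
    }
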
PL_INFO(p, " GL_RENDERER: %s", (char *) gl->GetString(GL_RENDERER)); + + ext_arr_t exts = {0}; + if (pl_gl->major >= 3) { + gl->GetIntegerv(GL_NUM_EXTENSIONS, &exts.num); + PL_ARRAY_RESIZE(pl_gl, exts, exts.num); + for (int i = 0; i < exts.num; i++) + exts.elem[i] = (const char *) gl->GetStringi(GL_EXTENSIONS, i); + } else { + add_exts_str(pl_gl, &exts, (const char *) gl->GetString(GL_EXTENSIONS)); + } + + if (pl_msg_test(log, PL_LOG_DEBUG)) { + PL_DEBUG(p, " GL_EXTENSIONS:"); + for (int i = 0; i < exts.num; i++) + PL_DEBUG(p, " %s", exts.elem[i]); + } + + if (params->egl_display) { + pl_static_mutex_lock(&glad_egl_mutex); + if (params->get_proc_addr_ex) { + ok = gladLoadEGLUserPtr(params->egl_display, params->get_proc_addr_ex, + params->proc_ctx); + } else if (params->get_proc_addr) { + ok = gladLoadEGL(params->egl_display, params->get_proc_addr); + } else { +#ifdef PL_HAVE_GL_PROC_ADDR + ok = gladLoaderLoadEGL(params->egl_display); +#else + pl_unreachable(); +#endif + } + pl_static_mutex_unlock(&glad_egl_mutex); + + if (!ok) { + PL_FATAL(p, "Failed loading EGL functions - double check EGLDisplay?"); + goto error; + } + + int start = exts.num; + add_exts_str(pl_gl, &exts, eglQueryString(params->egl_display, + EGL_EXTENSIONS)); + if (exts.num > start) { + PL_DEBUG(p, " EGL_EXTENSIONS:"); + for (int i = start; i < exts.num; i++) + PL_DEBUG(p, " %s", exts.elem[i]); + } + } + + pl_gl->extensions = exts.elem; + pl_gl->num_extensions = exts.num; + + if (!params->allow_software && gl_is_software(pl_gl)) { + PL_FATAL(p, "OpenGL context is suspected to be a software rasterizer, " + "but `allow_software` is false."); + goto error; + } + + if (params->debug) { + if (pl_opengl_has_ext(pl_gl, "GL_KHR_debug")) { + gl->DebugMessageCallback(debug_cb, log); + gl->Enable(GL_DEBUG_OUTPUT); + p->is_debug = true; + } else { + PL_WARN(p, "OpenGL debugging requested, but GL_KHR_debug is not " + "available... ignoring!"); + } + + if (params->egl_display && pl_opengl_has_ext(pl_gl, "EGL_KHR_debug")) { + static const EGLAttrib attribs[] = { + // Enable everything under the sun, because the `pl_ctx` log + // level may change at runtime. + EGL_DEBUG_MSG_CRITICAL_KHR, EGL_TRUE, + EGL_DEBUG_MSG_ERROR_KHR, EGL_TRUE, + EGL_DEBUG_MSG_WARN_KHR, EGL_TRUE, + EGL_DEBUG_MSG_INFO_KHR, EGL_TRUE, + EGL_NONE, + }; + + eglDebugMessageControlKHR(debug_cb_egl, attribs); + eglLabelObjectKHR(NULL, EGL_OBJECT_THREAD_KHR, NULL, (void *) log); + p->is_debug_egl = true; + } + } + + pl_gl->gpu = pl_gpu_create_gl(log, pl_gl, params); + if (!pl_gl->gpu) + goto error; + + gl_release_current(pl_gl); + return pl_gl; + +error: + PL_FATAL(p, "Failed initializing opengl context!"); + gl_release_current(pl_gl); + pl_opengl_destroy((pl_opengl *) &pl_gl); + return NULL; +} + +bool gl_make_current(pl_opengl pl_gl) +{ + struct gl_ctx *p = PL_PRIV(pl_gl); + pl_mutex_lock(&p->lock); + if (!p->count && p->params.make_current) { + if (!p->params.make_current(p->params.priv)) { + PL_ERR(p, "Failed making OpenGL context current on calling thread!"); + pl_mutex_unlock(&p->lock); + return false; + } + } + + p->count++; + return true; +} + +void gl_release_current(pl_opengl pl_gl) +{ + struct gl_ctx *p = PL_PRIV(pl_gl); + p->count--; + if (!p->count && p->params.release_current) + p->params.release_current(p->params.priv); + pl_mutex_unlock(&p->lock); +} diff --git a/src/opengl/formats.c b/src/opengl/formats.c new file mode 100644 index 0000000..6604835 --- /dev/null +++ b/src/opengl/formats.c @@ -0,0 +1,485 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "common.h" +#include "formats.h" +#include "utils.h" + +#ifdef PL_HAVE_UNIX +static bool supported_fourcc(struct pl_gl *p, EGLint fourcc) +{ + for (int i = 0; i < p->egl_formats.num; ++i) + if (fourcc == p->egl_formats.elem[i]) + return true; + return false; +} +#endif + +#define FMT(_name, bits, ftype, _caps) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_##ftype, \ + .caps = (enum pl_fmt_caps) (_caps), \ + .sample_order = {0, 1, 2, 3}, \ + .component_depth = {bits, bits, bits, bits}, \ + } + +// Convenience to make the names simpler +enum { + // Type aliases + U8 = GL_UNSIGNED_BYTE, + U16 = GL_UNSIGNED_SHORT, + U32 = GL_UNSIGNED_INT, + I8 = GL_BYTE, + I16 = GL_SHORT, + I32 = GL_INT, + FLT = GL_FLOAT, + + // Component aliases + R = GL_RED, + RG = GL_RG, + RGB = GL_RGB, + RGBA = GL_RGBA, + BGRA = GL_BGRA, + RI = GL_RED_INTEGER, + RGI = GL_RG_INTEGER, + RGBI = GL_RGB_INTEGER, + RGBAI = GL_RGBA_INTEGER, + + // Capability aliases + S = PL_FMT_CAP_SAMPLEABLE, + L = PL_FMT_CAP_LINEAR, + F = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE, // FBO support + V = PL_FMT_CAP_VERTEX, +}; + +// Basic 8-bit formats +const struct gl_format formats_norm8[] = { + {GL_R8, R, U8, FMT("r8", 8, UNORM, S|L|F|V)}, + {GL_RG8, RG, U8, FMT("rg8", 8, UNORM, S|L|F|V)}, + {GL_RGB8, RGB, U8, FMT("rgb8", 8, UNORM, S|L|F|V)}, + {GL_RGBA8, RGBA, U8, FMT("rgba8", 8, UNORM, S|L|F|V)}, +}; + +// Signed variants +/* TODO: these are broken in mesa +const struct gl_format formats_snorm8[] = { + {GL_R8_SNORM, R, I8, FMT("r8s", 8, SNORM, S|L|F|V)}, + {GL_RG8_SNORM, RG, I8, FMT("rg8s", 8, SNORM, S|L|F|V)}, + {GL_RGB8_SNORM, RGB, I8, FMT("rgb8s", 8, SNORM, S|L|F|V)}, + {GL_RGBA8_SNORM, RGBA, I8, FMT("rgba8s", 8, SNORM, S|L|F|V)}, +}; +*/ + +// BGRA 8-bit +const struct gl_format formats_bgra8[] = { + {GL_RGBA8, BGRA, U8, { + .name = "bgra8", + .type = PL_FMT_UNORM, + .caps = S|L|F|V, + .sample_order = {2, 1, 0, 3}, + .component_depth = {8, 8, 8, 8}, + }}, +}; + +// Basic 16-bit formats, excluding rgb16 (special cased below) +const struct gl_format formats_norm16[] = { + {GL_R16, R, U16, FMT("r16", 16, UNORM, S|L|F|V)}, + {GL_RG16, RG, U16, FMT("rg16", 16, UNORM, S|L|F|V)}, + {GL_RGBA16, RGBA, U16, FMT("rgba16", 16, UNORM, S|L|F|V)}, +}; + +// Renderable version of rgb16 +const struct gl_format formats_rgb16_fbo[] = { + {GL_RGB16, RGB, U16, FMT("rgb16", 16, UNORM, S|L|F|V)}, +}; + +// Non-renderable version of rgb16 +const struct gl_format formats_rgb16_fallback[] = { + {GL_RGB16, RGB, U16, FMT("rgb16", 16, UNORM, S|L|V)}, +}; + +// Signed 16-bit variants +/* TODO: these are broken in mesa and nvidia +const struct gl_format formats_snorm16[] = { + {GL_R16_SNORM, R, I16, FMT("r16s", 16, SNORM, S|L|F|V)}, + {GL_RG16_SNORM, RG, I16, FMT("rg16s", 16, SNORM, S|L|F|V)}, + {GL_RGB16_SNORM, RGB, I16, FMT("rgb16s", 16, SNORM, 
S|L|F|V)}, + {GL_RGBA16_SNORM, RGBA, I16, FMT("rgba16s", 16, SNORM, S|L|F|V)}, +}; +*/ + +// Floating point texture formats +const struct gl_format formats_float[] = { + {GL_R16F, R, FLT, FMT("r16f", 16, FLOAT, S|L|F)}, + {GL_RG16F, RG, FLT, FMT("rg16f", 16, FLOAT, S|L|F)}, + {GL_RGB16F, RGB, FLT, FMT("rgb16f", 16, FLOAT, S|L|F)}, + {GL_RGBA16F, RGBA, FLT, FMT("rgba16f", 16, FLOAT, S|L|F)}, + {GL_R32F, R, FLT, FMT("r32f", 32, FLOAT, S|L|F|V)}, + {GL_RG32F, RG, FLT, FMT("rg32f", 32, FLOAT, S|L|F|V)}, + {GL_RGB32F, RGB, FLT, FMT("rgb32f", 32, FLOAT, S|L|F|V)}, + {GL_RGBA32F, RGBA, FLT, FMT("rgba32f", 32, FLOAT, S|L|F|V)}, +}; + +// Renderable 16-bit float formats (excluding rgb16f) +const struct gl_format formats_float16_fbo[] = { + {GL_R16F, R, FLT, FMT("r16f", 16, FLOAT, S|L|F)}, + {GL_RG16F, RG, FLT, FMT("rg16f", 16, FLOAT, S|L|F)}, + {GL_RGB16F, RGB, FLT, FMT("rgb16f", 16, FLOAT, S|L)}, + {GL_RGBA16F, RGBA, FLT, FMT("rgba16f", 16, FLOAT, S|L|F)}, +}; + +// Non-renderable 16-bit float formats +const struct gl_format formats_float16_fallback[] = { + {GL_R16F, R, FLT, FMT("r16f", 16, FLOAT, S|L)}, + {GL_RG16F, RG, FLT, FMT("rg16f", 16, FLOAT, S|L)}, + {GL_RGB16F, RGB, FLT, FMT("rgb16f", 16, FLOAT, S|L)}, + {GL_RGBA16F, RGBA, FLT, FMT("rgba16f", 16, FLOAT, S|L)}, +}; + +// (Unsigned) integer formats +const struct gl_format formats_uint[] = { + {GL_R8UI, RI, U8, FMT("r8u", 8, UINT, S|F|V)}, + {GL_RG8UI, RGI, U8, FMT("rg8u", 8, UINT, S|F|V)}, + {GL_RGB8UI, RGBI, U8, FMT("rgb8u", 8, UINT, S|V)}, + {GL_RGBA8UI, RGBAI, U8, FMT("rgba8u", 8, UINT, S|F|V)}, + {GL_R16UI, RI, U16, FMT("r16u", 16, UINT, S|F|V)}, + {GL_RG16UI, RGI, U16, FMT("rg16u", 16, UINT, S|F|V)}, + {GL_RGB16UI, RGBI, U16, FMT("rgb16u", 16, UINT, S|V)}, + {GL_RGBA16UI, RGBAI, U16, FMT("rgba16u", 16, UINT, S|F|V)}, +}; + +/* TODO + {GL_R32UI, RI, U32, FMT("r32u", 32, UINT)}, + {GL_RG32UI, RGI, U32, FMT("rg32u", 32, UINT)}, + {GL_RGB32UI, RGBI, U32, FMT("rgb32u", 32, UINT)}, + {GL_RGBA32UI, RGBAI, U32, FMT("rgba32u", 32, UINT)}, + + {GL_R8I, RI, I8, FMT("r8i", 8, SINT)}, + {GL_RG8I, RGI, I8, FMT("rg8i", 8, SINT)}, + {GL_RGB8I, RGBI, I8, FMT("rgb8i", 8, SINT)}, + {GL_RGBA8I, RGBAI, I8, FMT("rgba8i", 8, SINT)}, + {GL_R16I, RI, I16, FMT("r16i", 16, SINT)}, + {GL_RG16I, RGI, I16, FMT("rg16i", 16, SINT)}, + {GL_RGB16I, RGBI, I16, FMT("rgb16i", 16, SINT)}, + {GL_RGBA16I, RGBAI, I16, FMT("rgba16i", 16, SINT)}, + {GL_R32I, RI, I32, FMT("r32i", 32, SINT)}, + {GL_RG32I, RGI, I32, FMT("rg32i", 32, SINT)}, + {GL_RGB32I, RGBI, I32, FMT("rgb32i", 32, SINT)}, + {GL_RGBA32I, RGBAI, I32, FMT("rgba32i", 32, SINT)}, +*/ + +// GL2 legacy formats +const struct gl_format formats_legacy_gl2[] = { + {GL_RGB8, RGB, U8, FMT("rgb8", 8, UNORM, S|L|V)}, + {GL_RGBA8, RGBA, U8, FMT("rgba8", 8, UNORM, S|L|V)}, + {GL_RGB16, RGB, U16, FMT("rgb16", 16, UNORM, S|L|V)}, + {GL_RGBA16, RGBA, U16, FMT("rgba16", 16, UNORM, S|L|V)}, +}; + +// GLES2 legacy formats +const struct gl_format formats_legacy_gles2[] = { + {GL_RGB, RGB, U8, FMT("rgb", 8, UNORM, S|L)}, + {GL_RGBA, RGBA, U8, FMT("rgba", 8, UNORM, S|L)}, +}; + +// GLES BGRA +const struct gl_format formats_bgra_gles[] = { + {GL_BGRA, BGRA, U8, { + .name = "bgra8", + .type = PL_FMT_UNORM, + .caps = S|L|F|V, + .sample_order = {2, 1, 0, 3}, + .component_depth = {8, 8, 8, 8}, + }}, +}; + +// Fallback for vertex-only formats, as a last resort +const struct gl_format formats_basic_vertex[] = { + {GL_R32F, R, FLT, FMT("r32f", 32, FLOAT, V)}, + {GL_RG32F, RG, FLT, FMT("rg32f", 32, FLOAT, V)}, + {GL_RGB32F, RGB, FLT, 
FMT("rgb32f", 32, FLOAT, V)}, + {GL_RGBA32F, RGBA, FLT, FMT("rgba32f", 32, FLOAT, V)}, +}; + +static void add_format(pl_gpu pgpu, const struct gl_format *gl_fmt) +{ + struct pl_gpu_t *gpu = (struct pl_gpu_t *) pgpu; + struct pl_gl *p = PL_PRIV(gpu); + + struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, gl_fmt); + const struct gl_format **fmtp = PL_PRIV(fmt); + *fmt = gl_fmt->tmpl; + *fmtp = gl_fmt; + + // Calculate the host size and number of components + switch (gl_fmt->fmt) { + case GL_RED: + case GL_RED_INTEGER: + fmt->num_components = 1; + break; + case GL_RG: + case GL_RG_INTEGER: + fmt->num_components = 2; + break; + case GL_RGB: + case GL_RGB_INTEGER: + fmt->num_components = 3; + break; + case GL_RGBA: + case GL_RGBA_INTEGER: + case GL_BGRA: + fmt->num_components = 4; + break; + default: + pl_unreachable(); + } + + int size; + switch (gl_fmt->type) { + case GL_BYTE: + case GL_UNSIGNED_BYTE: + size = 1; + break; + case GL_SHORT: + case GL_UNSIGNED_SHORT: + size = 2; + break; + case GL_INT: + case GL_UNSIGNED_INT: + case GL_FLOAT: + size = 4; + break; + default: + pl_unreachable(); + } + + // Host visible representation + fmt->texel_size = fmt->num_components * size; + fmt->texel_align = 1; + for (int i = 0; i < fmt->num_components; i++) + fmt->host_bits[i] = size * 8; + + // Compute internal size by summing up the depth + int ibits = 0; + for (int i = 0; i < fmt->num_components; i++) + ibits += fmt->component_depth[i]; + fmt->internal_size = (ibits + 7) / 8; + + // We're not the ones actually emulating these texture format - the + // driver is - but we might as well set the hint. + fmt->emulated = fmt->texel_size != fmt->internal_size; + + // 3-component formats are almost surely also emulated + if (fmt->num_components == 3) + fmt->emulated = true; + + // Older OpenGL most likely emulates 32-bit float formats as well + if (p->gl_ver < 30 && fmt->component_depth[0] >= 32) + fmt->emulated = true; + + // For sanity, clear the superfluous fields + for (int i = fmt->num_components; i < 4; i++) { + fmt->component_depth[i] = 0; + fmt->sample_order[i] = 0; + fmt->host_bits[i] = 0; + } + + fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, "")); + fmt->glsl_format = pl_fmt_glsl_format(fmt, fmt->num_components); + fmt->fourcc = pl_fmt_fourcc(fmt); + pl_assert(fmt->glsl_type); + +#ifdef PL_HAVE_UNIX + if (p->has_modifiers && fmt->fourcc && supported_fourcc(p, fmt->fourcc)) { + int num_mods = 0; + bool ok = eglQueryDmaBufModifiersEXT(p->egl_dpy, fmt->fourcc, + 0, NULL, NULL, &num_mods); + if (ok && num_mods) { + // On my system eglQueryDmaBufModifiersEXT seems to never return + // MOD_INVALID even though eglExportDMABUFImageQueryMESA happily + // returns such modifiers. Since we handle INVALID by not + // requiring modifiers at all, always add this value to the + // list of supported modifiers. May result in duplicates, but + // whatever. 
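As a concrete illustration of the bookkeeping above, two representative entries work out as follows (the numbers follow directly from the code; only the presentation is new):

    // GL_RGBA8   (RGBA, U8):  4 components x 1 byte -> texel_size = 4;
    //                         component_depth = 4 x 8 = 32 bits -> internal_size = 4;
    //                         emulated = false
    // GL_RGBA16F (RGBA, FLT): 4 components x 4 bytes -> texel_size = 16 (host side
    //                         transfers 32-bit floats); component_depth = 4 x 16 =
    //                         64 bits -> internal_size = 8; emulated = true
    // GL_RGB8    (RGB, U8):   3-component, so it is marked emulated regardless
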
+ uint64_t *mods = pl_calloc(fmt, num_mods + 1, sizeof(uint64_t)); + mods[0] = DRM_FORMAT_MOD_INVALID; + ok = eglQueryDmaBufModifiersEXT(p->egl_dpy, fmt->fourcc, num_mods, + &mods[1], NULL, &num_mods); + + if (ok) { + fmt->modifiers = mods; + fmt->num_modifiers = num_mods + 1; + } else { + pl_free(mods); + } + } + + eglGetError(); // ignore probing errors + } + + if (!fmt->num_modifiers) { + // Hacky fallback for older drivers that don't support properly + // querying modifiers + static const uint64_t static_mods[] = { + DRM_FORMAT_MOD_INVALID, + DRM_FORMAT_MOD_LINEAR, + }; + + fmt->num_modifiers = PL_ARRAY_SIZE(static_mods); + fmt->modifiers = static_mods; + } +#endif + + // Gathering requires checking the format type (and extension presence) + if (fmt->caps & PL_FMT_CAP_SAMPLEABLE) + fmt->gatherable = p->gather_comps >= fmt->num_components; + + // Reading from textures on GLES requires FBO support for this fmt + if (p->has_readback && (p->gl_ver || (fmt->caps & PL_FMT_CAP_RENDERABLE))) + fmt->caps |= PL_FMT_CAP_HOST_READABLE; + + if (gpu->glsl.compute && fmt->glsl_format && p->has_storage) + fmt->caps |= PL_FMT_CAP_STORABLE | PL_FMT_CAP_READWRITE; + + // Only float-type formats are considered blendable in OpenGL + switch (fmt->type) { + case PL_FMT_UNKNOWN: + case PL_FMT_UINT: + case PL_FMT_SINT: + break; + case PL_FMT_FLOAT: + case PL_FMT_UNORM: + case PL_FMT_SNORM: + if (fmt->caps & PL_FMT_CAP_RENDERABLE) + fmt->caps |= PL_FMT_CAP_BLENDABLE; + break; + case PL_FMT_TYPE_COUNT: + pl_unreachable(); + } + + // TODO: Texel buffers + + PL_ARRAY_APPEND_RAW(gpu, gpu->formats, gpu->num_formats, fmt); +} + +#define DO_FORMATS(formats) \ + do { \ + for (int i = 0; i < PL_ARRAY_SIZE(formats); i++) \ + add_format(gpu, &formats[i]); \ + } while (0) + +bool gl_setup_formats(struct pl_gpu_t *gpu) +{ + struct pl_gl *p = PL_PRIV(gpu); + +#ifdef PL_HAVE_UNIX + if (p->has_modifiers) { + EGLint num_formats = 0; + bool ok = eglQueryDmaBufFormatsEXT(p->egl_dpy, 0, NULL, + &num_formats); + if (ok && num_formats) { + p->egl_formats.elem = pl_calloc(gpu, num_formats, sizeof(EGLint)); + p->egl_formats.num = num_formats; + ok = eglQueryDmaBufFormatsEXT(p->egl_dpy, num_formats, + p->egl_formats.elem, &num_formats); + pl_assert(ok); + + PL_DEBUG(gpu, "EGL formats supported:"); + for (int i = 0; i < num_formats; ++i) { + PL_DEBUG(gpu, " 0x%08x(%.4s)", p->egl_formats.elem[i], + PRINT_FOURCC(p->egl_formats.elem[i])); + } + } + } +#endif + + if (p->gl_ver >= 30) { + // Desktop GL3+ has everything + DO_FORMATS(formats_norm8); + DO_FORMATS(formats_bgra8); + DO_FORMATS(formats_norm16); + DO_FORMATS(formats_rgb16_fbo); + DO_FORMATS(formats_float); + DO_FORMATS(formats_uint); + goto done; + } + + if (p->gl_ver >= 21) { + // If we have a reasonable set of extensions, we can enable most + // things. 
Otherwise, pick simple fallback formats + if (pl_opengl_has_ext(p->gl, "GL_ARB_texture_float") && + pl_opengl_has_ext(p->gl, "GL_ARB_texture_rg") && + pl_opengl_has_ext(p->gl, "GL_ARB_framebuffer_object")) + { + DO_FORMATS(formats_norm8); + DO_FORMATS(formats_bgra8); + DO_FORMATS(formats_norm16); + DO_FORMATS(formats_rgb16_fbo); + DO_FORMATS(formats_float); + } else { + // Fallback for GL2 + DO_FORMATS(formats_legacy_gl2); + DO_FORMATS(formats_basic_vertex); + } + goto done; + } + + if (p->gles_ver >= 30) { + // GLES 3.0 has some basic formats, with framebuffers for float16 + // depending on GL_EXT_color_buffer_(half_)float support + DO_FORMATS(formats_norm8); + if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_norm16")) { + DO_FORMATS(formats_norm16); + DO_FORMATS(formats_rgb16_fallback); + } + if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_format_BGRA8888")) + DO_FORMATS(formats_bgra_gles); + if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_integer")) + DO_FORMATS(formats_uint); + DO_FORMATS(formats_basic_vertex); + if (p->gles_ver >= 32 || pl_opengl_has_ext(p->gl, "GL_EXT_color_buffer_half_float") + || pl_opengl_has_ext(p->gl, "GL_EXT_color_buffer_float")) { + DO_FORMATS(formats_float16_fbo); + } else { + DO_FORMATS(formats_float16_fallback); + } + goto done; + } + + if (p->gles_ver >= 20) { + // GLES 2.0 only has some legacy fallback formats, with support for + // float16 depending on GL_EXT_texture_norm16 being present + DO_FORMATS(formats_legacy_gles2); + DO_FORMATS(formats_basic_vertex); + if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_rg")) { + DO_FORMATS(formats_norm8); + } + if (pl_opengl_has_ext(p->gl, "GL_EXT_texture_format_BGRA8888")) { + DO_FORMATS(formats_bgra_gles); + } + goto done; + } + + // Last resort fallback. Probably not very useful + DO_FORMATS(formats_basic_vertex); + goto done; + +done: + return gl_check_err(gpu, "gl_setup_formats"); +} diff --git a/src/opengl/formats.h b/src/opengl/formats.h new file mode 100644 index 0000000..b98c872 --- /dev/null +++ b/src/opengl/formats.h @@ -0,0 +1,32 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +struct gl_format { + GLint ifmt; // sized internal format (e.g. GL_RGBA16F) + GLenum fmt; // base internal format (e.g. GL_RGBA) + GLenum type; // host-visible type (e.g. GL_FLOAT) + struct pl_fmt_t tmpl; // pl_fmt template +}; + +typedef void (gl_format_cb)(pl_gpu gpu, const struct gl_format *glfmt); + +// Add all supported formats to the `pl_gpu` format list. +bool gl_setup_formats(struct pl_gpu_t *gpu); diff --git a/src/opengl/gpu.c b/src/opengl/gpu.c new file mode 100644 index 0000000..b711ac5 --- /dev/null +++ b/src/opengl/gpu.c @@ -0,0 +1,645 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "common.h" +#include "formats.h" +#include "utils.h" + +#ifdef PL_HAVE_UNIX +#include <unistd.h> +#endif + +#ifdef PL_HAVE_WIN32 +#include <windows.h> +#include <sysinfoapi.h> +#endif + +static const struct pl_gpu_fns pl_fns_gl; + +static void gl_gpu_destroy(pl_gpu gpu) +{ + struct pl_gl *p = PL_PRIV(gpu); + + pl_gpu_finish(gpu); + while (p->callbacks.num > 0) + gl_poll_callbacks(gpu); + + pl_free((void *) gpu); +} + +pl_opengl pl_opengl_get(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->destroy == gl_gpu_destroy) { + struct pl_gl *p = (struct pl_gl *) impl; + return p->gl; + } + + return NULL; +} + +static pl_handle_caps tex_handle_caps(pl_gpu gpu, bool import) +{ + pl_handle_caps caps = 0; + struct pl_gl *p = PL_PRIV(gpu); + + if (!p->egl_dpy || (!p->has_egl_storage && !p->has_egl_import)) + return 0; + + if (import) { + if (pl_opengl_has_ext(p->gl, "EGL_EXT_image_dma_buf_import")) + caps |= PL_HANDLE_DMA_BUF; + } else if (!import && p->egl_ctx) { + if (pl_opengl_has_ext(p->gl, "EGL_MESA_image_dma_buf_export")) + caps |= PL_HANDLE_DMA_BUF; + } + + return caps; +} + +static inline size_t get_page_size(void) +{ + +#ifdef PL_HAVE_UNIX + return sysconf(_SC_PAGESIZE); +#endif + +#ifdef PL_HAVE_WIN32 + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + return sysInfo.dwAllocationGranularity; +#endif + + pl_assert(!"Unsupported platform!"); +} + +#define get(pname, field) \ + do { \ + GLint tmp = 0; \ + gl->GetIntegerv((pname), &tmp); \ + *(field) = tmp; \ + } while (0) + +#define geti(pname, i, field) \ + do { \ + GLint tmp = 0; \ + gl->GetIntegeri_v((pname), i, &tmp);\ + *(field) = tmp; \ + } while (0) + +pl_gpu pl_gpu_create_gl(pl_log log, pl_opengl pl_gl, const struct pl_opengl_params *params) +{ + struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_gl); + gpu->log = log; + + struct pl_gl *p = PL_PRIV(gpu); + p->impl = pl_fns_gl; + p->gl = pl_gl; + + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_glsl_version *glsl = &gpu->glsl; + glsl->gles = gl_is_gles(pl_gl); + int ver = pl_gl->major * 10 + pl_gl->minor; + p->gl_ver = glsl->gles ? 0 : ver; + p->gles_ver = glsl->gles ? ver : 0; + + // If possible, query the GLSL version from the implementation + const char *glslver = (char *) gl->GetString(GL_SHADING_LANGUAGE_VERSION); + if (glslver) { + PL_INFO(gpu, " GL_SHADING_LANGUAGE_VERSION: %s", glslver); + int major = 0, minor = 0; + if (sscanf(glslver, "%d.%d", &major, &minor) == 2) + glsl->version = major * 100 + minor; + } + + if (!glsl->version) { + // Otherwise, use the fixed magic versions 100 and 300 for GLES. 
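+        // (i.e. GLSL ES 1.00 for GLES 2.0 contexts and GLSL ES 3.00 for
+        //  GLES 3.0+, matching the #version directives those contexts accept)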
+ if (p->gles_ver >= 30) { + glsl->version = 300; + } else if (p->gles_ver >= 20) { + glsl->version = 100; + } else { + goto error; + } + } + + static const int glsl_ver_req = 130; + if (glsl->version < glsl_ver_req) { + PL_FATAL(gpu, "GLSL version too old (%d < %d), please use a newer " + "OpenGL implementation or downgrade libplacebo!", + glsl->version, glsl_ver_req); + goto error; + } + + if (params->max_glsl_version && params->max_glsl_version >= glsl_ver_req) { + glsl->version = PL_MIN(glsl->version, params->max_glsl_version); + PL_INFO(gpu, "Restricting GLSL version to %d... new version is %d", + params->max_glsl_version, glsl->version); + } + + if (gl_test_ext(gpu, "GL_ARB_compute_shader", 43, 0) && glsl->version >= 420) { + glsl->compute = true; + get(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &glsl->max_shmem_size); + get(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &glsl->max_group_threads); + for (int i = 0; i < 3; i++) + geti(GL_MAX_COMPUTE_WORK_GROUP_SIZE, i, &glsl->max_group_size[i]); + } + + if (gl_test_ext(gpu, "GL_ARB_texture_gather", 40, 0)) { + get(GL_MAX_PROGRAM_TEXTURE_GATHER_COMPONENTS_ARB, &p->gather_comps); + get(GL_MIN_PROGRAM_TEXTURE_GATHER_OFFSET_ARB, &glsl->min_gather_offset); + get(GL_MAX_PROGRAM_TEXTURE_GATHER_OFFSET_ARB, &glsl->max_gather_offset); + } + + // Query all device limits + struct pl_gpu_limits *limits = &gpu->limits; + limits->thread_safe = params->make_current; + limits->callbacks = gl_test_ext(gpu, "GL_ARB_sync", 32, 30); + limits->align_vertex_stride = 1; + if (gl_test_ext(gpu, "GL_ARB_pixel_buffer_object", 31, 0)) { + limits->max_buf_size = SIZE_MAX; // no restriction imposed by GL + if (gl_test_ext(gpu, "GL_ARB_uniform_buffer_object", 31, 0)) + get(GL_MAX_UNIFORM_BLOCK_SIZE, &limits->max_ubo_size); + if (gl_test_ext(gpu, "GL_ARB_shader_storage_buffer_object", 43, 0) && + gpu->glsl.version >= 140) + { + get(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &limits->max_ssbo_size); + } + limits->max_vbo_size = limits->max_buf_size; // No additional restrictions + if (gl_test_ext(gpu, "GL_ARB_buffer_storage", 44, 0)) { + const char *vendor = (char *) gl->GetString(GL_VENDOR); + limits->max_mapped_size = limits->max_buf_size; + limits->host_cached = strcmp(vendor, "AMD") == 0 || + strcmp(vendor, "NVIDIA Corporation") == 0; + } + } + + get(GL_MAX_TEXTURE_SIZE, &limits->max_tex_2d_dim); + if (gl_test_ext(gpu, "GL_EXT_texture3D", 21, 30)) + get(GL_MAX_3D_TEXTURE_SIZE, &limits->max_tex_3d_dim); + // There's no equivalent limit for 1D textures for whatever reason, so + // just set it to the same as the 2D limit + if (p->gl_ver >= 21) + limits->max_tex_1d_dim = limits->max_tex_2d_dim; + limits->buf_transfer = true; + + if (p->gl_ver || p->gles_ver >= 30) { + get(GL_MAX_FRAGMENT_UNIFORM_COMPONENTS, &limits->max_variable_comps); + } else { + // fallback for GLES 2.0, which doesn't have max_comps + get(GL_MAX_FRAGMENT_UNIFORM_VECTORS, &limits->max_variable_comps); + limits->max_variable_comps *= 4; + } + + if (glsl->compute) { + for (int i = 0; i < 3; i++) + geti(GL_MAX_COMPUTE_WORK_GROUP_COUNT, i, &limits->max_dispatch[i]); + } + + // Query import/export support + p->egl_dpy = params->egl_display; + p->egl_ctx = params->egl_context; + p->has_egl_storage = pl_opengl_has_ext(p->gl, "GL_EXT_EGL_image_storage"); + p->has_egl_import = pl_opengl_has_ext(p->gl, "GL_OES_EGL_image_external"); + gpu->export_caps.tex = tex_handle_caps(gpu, false); + gpu->import_caps.tex = tex_handle_caps(gpu, true); + + if (p->egl_dpy) { + p->has_modifiers = pl_opengl_has_ext(p->gl, + 
"EGL_EXT_image_dma_buf_import_modifiers"); + } + + if (pl_opengl_has_ext(pl_gl, "GL_AMD_pinned_memory")) { + gpu->import_caps.buf |= PL_HANDLE_HOST_PTR; + gpu->limits.align_host_ptr = get_page_size(); + } + + // Cache some internal capability checks + p->has_vao = gl_test_ext(gpu, "GL_ARB_vertex_array_object", 30, 0); + p->has_invalidate_fb = gl_test_ext(gpu, "GL_ARB_invalidate_subdata", 43, 30); + p->has_invalidate_tex = gl_test_ext(gpu, "GL_ARB_invalidate_subdata", 43, 0); + p->has_queries = gl_test_ext(gpu, "GL_ARB_timer_query", 33, 0); + p->has_storage = gl_test_ext(gpu, "GL_ARB_shader_image_load_store", 42, 0); + p->has_readback = true; + + if (p->has_readback && p->gles_ver) { + GLuint fbo = 0, tex = 0; + GLint read_type = 0, read_fmt = 0; + gl->GenTextures(1, &tex); + gl->BindTexture(GL_TEXTURE_2D, tex); + gl->GenFramebuffers(1, &fbo); + gl->TexImage2D(GL_TEXTURE_2D, 0, GL_R8, 64, 64, 0, GL_RED, + GL_UNSIGNED_BYTE, NULL); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo); + gl->FramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, tex, 0); + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_TYPE, &read_type); + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_FORMAT, &read_fmt); + if (read_type != GL_UNSIGNED_BYTE || read_fmt != GL_RED) { + PL_INFO(gpu, "GPU does not seem to support lossless texture " + "readback, restricting readback capabilities! This is a " + "GLES/driver limitation, there is little we can do to " + "work around it."); + p->has_readback = false; + } + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + gl->BindTexture(GL_TEXTURE_2D, 0); + gl->DeleteFramebuffers(1, &fbo); + gl->DeleteTextures(1, &tex); + } + + // We simply don't know, so make up some values + limits->align_tex_xfer_offset = 32; + limits->align_tex_xfer_pitch = 4; + limits->fragment_queues = 1; + limits->compute_queues = glsl->compute ? 1 : 0; + + if (!gl_check_err(gpu, "pl_gpu_create_gl")) { + PL_WARN(gpu, "Encountered errors while detecting GPU capabilities... 
" + "ignoring, but expect limitations/issues"); + p->failed = false; + } + + // Filter out error messages during format probing + pl_log_level_cap(gpu->log, PL_LOG_INFO); + bool formats_ok = gl_setup_formats(gpu); + pl_log_level_cap(gpu->log, PL_LOG_NONE); + if (!formats_ok) + goto error; + + return pl_gpu_finalize(gpu); + +error: + gl_gpu_destroy(gpu); + return NULL; +} + +void gl_buf_destroy(pl_gpu gpu, pl_buf buf) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) { + PL_ERR(gpu, "Failed uninitializing buffer, leaking resources!"); + return; + } + + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + if (buf_gl->fence) + gl->DeleteSync(buf_gl->fence); + + if (buf_gl->mapped) { + gl->BindBuffer(GL_COPY_WRITE_BUFFER, buf_gl->buffer); + gl->UnmapBuffer(GL_COPY_WRITE_BUFFER); + gl->BindBuffer(GL_COPY_WRITE_BUFFER, 0); + } + + gl->DeleteBuffers(1, &buf_gl->buffer); + gl_check_err(gpu, "gl_buf_destroy"); + RELEASE_CURRENT(); + pl_free((void *) buf); +} + +pl_buf gl_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return NULL; + + struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_gl); + buf->params = *params; + buf->params.initial_data = NULL; + + struct pl_gl *p = PL_PRIV(gpu); + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + buf_gl->id = ++p->buf_id; + + // Just use this since the generic GL_BUFFER doesn't work + GLenum target = GL_ARRAY_BUFFER; + const void *data = params->initial_data; + size_t total_size = params->size; + bool import = false; + + if (params->import_handle == PL_HANDLE_HOST_PTR) { + const struct pl_shared_mem *shmem = ¶ms->shared_mem; + target = GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD; + + data = shmem->handle.ptr; + buf_gl->offset = shmem->offset; + total_size = shmem->size; + import = true; + + if (params->host_mapped) + buf->data = (uint8_t *) data + buf_gl->offset; + + if (buf_gl->offset > 0 && params->drawable) { + PL_ERR(gpu, "Cannot combine non-aligned host pointer imports with " + "drawable (vertex) buffers! 
This is a design limitation, " + "open an issue if you absolutely need this."); + goto error; + } + } + + gl->GenBuffers(1, &buf_gl->buffer); + gl->BindBuffer(target, buf_gl->buffer); + + if (gl_test_ext(gpu, "GL_ARB_buffer_storage", 44, 0) && !import) { + + GLbitfield mapflags = 0, storflags = 0; + if (params->host_writable) + storflags |= GL_DYNAMIC_STORAGE_BIT; + if (params->host_mapped) { + mapflags |= GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | + GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; + } + if (params->memory_type == PL_BUF_MEM_HOST) + storflags |= GL_CLIENT_STORAGE_BIT; // hopefully this works + + gl->BufferStorage(target, total_size, data, storflags | mapflags); + + if (params->host_mapped) { + buf_gl->mapped = true; + buf->data = gl->MapBufferRange(target, buf_gl->offset, params->size, + mapflags); + if (!buf->data) { + gl->BindBuffer(target, 0); + if (!gl_check_err(gpu, "gl_buf_create: map")) + PL_ERR(gpu, "Failed mapping buffer: unknown reason"); + goto error; + } + } + + } else { + + // Make a random guess based on arbitrary criteria we can't know + GLenum hint = GL_STREAM_DRAW; + if (params->initial_data && !params->host_writable && !params->host_mapped) + hint = GL_STATIC_DRAW; + if (params->host_readable && !params->host_writable && !params->host_mapped) + hint = GL_STREAM_READ; + if (params->storable) + hint = GL_DYNAMIC_COPY; + + gl->BufferData(target, total_size, data, hint); + + if (import && gl->GetError() == GL_INVALID_OPERATION) { + PL_ERR(gpu, "Failed importing host pointer!"); + goto error; + } + + } + + gl->BindBuffer(target, 0); + if (!gl_check_err(gpu, "gl_buf_create")) + goto error; + + if (params->storable) { + buf_gl->barrier = GL_BUFFER_UPDATE_BARRIER_BIT | // for buf_copy etc. + GL_PIXEL_BUFFER_BARRIER_BIT | // for tex_upload + GL_SHADER_STORAGE_BARRIER_BIT; + + if (params->host_mapped) + buf_gl->barrier |= GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT; + if (params->uniform) + buf_gl->barrier |= GL_UNIFORM_BARRIER_BIT; + if (params->drawable) + buf_gl->barrier |= GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT; + } + + RELEASE_CURRENT(); + return buf; + +error: + gl_buf_destroy(gpu, buf); + RELEASE_CURRENT(); + return NULL; +} + +bool gl_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + + // Non-persistently mapped buffers are always implicitly reusable in OpenGL, + // the implementation will create more buffers under the hood if needed. + if (!buf->data) + return false; + + if (!MAKE_CURRENT()) + return true; // conservative guess + + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + if (buf_gl->fence) { + GLenum res = gl->ClientWaitSync(buf_gl->fence, + timeout ? 
GL_SYNC_FLUSH_COMMANDS_BIT : 0, + timeout); + if (res == GL_ALREADY_SIGNALED || res == GL_CONDITION_SATISFIED) { + gl->DeleteSync(buf_gl->fence); + buf_gl->fence = NULL; + } + } + + gl_poll_callbacks(gpu); + RELEASE_CURRENT(); + return !!buf_gl->fence; +} + +void gl_buf_write(pl_gpu gpu, pl_buf buf, size_t offset, + const void *data, size_t size) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + gl->BindBuffer(GL_ARRAY_BUFFER, buf_gl->buffer); + gl->BufferSubData(GL_ARRAY_BUFFER, buf_gl->offset + offset, size, data); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); + gl_check_err(gpu, "gl_buf_write"); + RELEASE_CURRENT(); +} + +bool gl_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, + void *dest, size_t size) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return false; + + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + gl->BindBuffer(GL_ARRAY_BUFFER, buf_gl->buffer); + gl->GetBufferSubData(GL_ARRAY_BUFFER, buf_gl->offset + offset, size, dest); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); + bool ok = gl_check_err(gpu, "gl_buf_read"); + RELEASE_CURRENT(); + return ok; +} + +void gl_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + struct pl_buf_gl *src_gl = PL_PRIV(src); + struct pl_buf_gl *dst_gl = PL_PRIV(dst); + gl->BindBuffer(GL_COPY_READ_BUFFER, src_gl->buffer); + gl->BindBuffer(GL_COPY_WRITE_BUFFER, dst_gl->buffer); + gl->CopyBufferSubData(GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, + src_gl->offset + src_offset, + dst_gl->offset + dst_offset, size); + gl_check_err(gpu, "gl_buf_copy"); + RELEASE_CURRENT(); +} + +#define QUERY_OBJECT_NUM 8 + +struct pl_timer_t { + GLuint query[QUERY_OBJECT_NUM]; + int index_write; // next index to write to + int index_read; // next index to read from +}; + +static pl_timer gl_timer_create(pl_gpu gpu) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + if (!p->has_queries || !MAKE_CURRENT()) + return NULL; + + pl_timer timer = pl_zalloc_ptr(NULL, timer); + gl->GenQueries(QUERY_OBJECT_NUM, timer->query); + RELEASE_CURRENT(); + return timer; +} + +static void gl_timer_destroy(pl_gpu gpu, pl_timer timer) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) { + PL_ERR(gpu, "Failed uninitializing timer, leaking resources!"); + return; + } + + gl->DeleteQueries(QUERY_OBJECT_NUM, timer->query); + gl_check_err(gpu, "gl_timer_destroy"); + RELEASE_CURRENT(); + pl_free(timer); +} + +static uint64_t gl_timer_query(pl_gpu gpu, pl_timer timer) +{ + if (timer->index_read == timer->index_write) + return 0; // no more unprocessed results + + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return 0; + + uint64_t res = 0; + GLuint query = timer->query[timer->index_read]; + int avail = 0; + gl->GetQueryObjectiv(query, GL_QUERY_RESULT_AVAILABLE, &avail); + if (!avail) + goto done; + gl->GetQueryObjectui64v(query, GL_QUERY_RESULT, &res); + + timer->index_read = (timer->index_read + 1) % QUERY_OBJECT_NUM; + // fall through + +done: + RELEASE_CURRENT(); + return res; +} + +void gl_timer_begin(pl_gpu gpu, pl_timer timer) +{ + if (!timer) + return; + + const gl_funcs *gl = gl_funcs_get(gpu); + gl->BeginQuery(GL_TIME_ELAPSED, timer->query[timer->index_write]); +} + +void gl_timer_end(pl_gpu gpu, pl_timer timer) +{ + if (!timer) + return; + + const gl_funcs *gl = gl_funcs_get(gpu); + gl->EndQuery(GL_TIME_ELAPSED); + + 
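+    // Advance the write cursor through the fixed-size ring of query objects;
+    // if it catches up with the read cursor, the oldest unread result is dropped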
timer->index_write = (timer->index_write + 1) % QUERY_OBJECT_NUM; + if (timer->index_write == timer->index_read) { + // forcibly drop the least recent result to make space + timer->index_read = (timer->index_read + 1) % QUERY_OBJECT_NUM; + } +} + +static void gl_gpu_flush(pl_gpu gpu) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + gl->Flush(); + gl_check_err(gpu, "gl_gpu_flush"); + RELEASE_CURRENT(); +} + +static void gl_gpu_finish(pl_gpu gpu) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + gl->Finish(); + gl_check_err(gpu, "gl_gpu_finish"); + RELEASE_CURRENT(); +} + +static bool gl_gpu_is_failed(pl_gpu gpu) +{ + struct pl_gl *gl = PL_PRIV(gpu); + return gl->failed; +} + +static const struct pl_gpu_fns pl_fns_gl = { + .destroy = gl_gpu_destroy, + .tex_create = gl_tex_create, + .tex_destroy = gl_tex_destroy, + .tex_invalidate = gl_tex_invalidate, + .tex_clear_ex = gl_tex_clear_ex, + .tex_blit = gl_tex_blit, + .tex_upload = gl_tex_upload, + .tex_download = gl_tex_download, + .buf_create = gl_buf_create, + .buf_destroy = gl_buf_destroy, + .buf_write = gl_buf_write, + .buf_read = gl_buf_read, + .buf_copy = gl_buf_copy, + .buf_poll = gl_buf_poll, + .desc_namespace = gl_desc_namespace, + .pass_create = gl_pass_create, + .pass_destroy = gl_pass_destroy, + .pass_run = gl_pass_run, + .timer_create = gl_timer_create, + .timer_destroy = gl_timer_destroy, + .timer_query = gl_timer_query, + .gpu_flush = gl_gpu_flush, + .gpu_finish = gl_gpu_finish, + .gpu_is_failed = gl_gpu_is_failed, +}; diff --git a/src/opengl/gpu.h b/src/opengl/gpu.h new file mode 100644 index 0000000..50741d0 --- /dev/null +++ b/src/opengl/gpu.h @@ -0,0 +1,141 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "../gpu.h" +#include "common.h" + +// Thread safety: Unsafe, same as pl_gpu_destroy +pl_gpu pl_gpu_create_gl(pl_log log, pl_opengl gl, const struct pl_opengl_params *params); + +// --- pl_gpu internal structs and functions + +struct pl_gl { + struct pl_gpu_fns impl; + pl_opengl gl; + bool failed; + + // For import/export + EGLDisplay egl_dpy; + EGLContext egl_ctx; + bool egl_storage; +#ifdef PL_HAVE_UNIX + // List of formats supported by EGL_EXT_image_dma_buf_import + PL_ARRAY(EGLint) egl_formats; +#endif + + // Sync objects and associated callbacks + PL_ARRAY(struct gl_cb) callbacks; + + + // Incrementing counters to keep track of object uniqueness + int buf_id; + + // Cached capabilities + int gl_ver; + int gles_ver; + bool has_storage; + bool has_invalidate_fb; + bool has_invalidate_tex; + bool has_vao; + bool has_queries; + bool has_modifiers; + bool has_readback; + bool has_egl_storage; + bool has_egl_import; + int gather_comps; +}; + +static inline const gl_funcs *gl_funcs_get(pl_gpu gpu) +{ + struct pl_gl *p = PL_PRIV(gpu); + struct gl_ctx *glctx = PL_PRIV(p->gl); + return &glctx->func; +} + +void gl_timer_begin(pl_gpu gpu, pl_timer timer); +void gl_timer_end(pl_gpu gpu, pl_timer timer); + +static inline bool _make_current(pl_gpu gpu) +{ + struct pl_gl *p = PL_PRIV(gpu); + if (!gl_make_current(p->gl)) { + p->failed = true; + return false; + } + + return true; +} + +static inline void _release_current(pl_gpu gpu) +{ + struct pl_gl *p = PL_PRIV(gpu); + gl_release_current(p->gl); +} + +#define MAKE_CURRENT() _make_current(gpu) +#define RELEASE_CURRENT() _release_current(gpu) + +struct pl_tex_gl { + GLenum target; + GLuint texture; + bool wrapped_tex; + GLuint fbo; // or 0 + bool wrapped_fb; + GLbitfield barrier; + + // GL format fields + GLenum format; + GLint iformat; + GLenum type; + + // For imported/exported textures + EGLImageKHR image; + int fd; +}; + +pl_tex gl_tex_create(pl_gpu, const struct pl_tex_params *); +void gl_tex_destroy(pl_gpu, pl_tex); +void gl_tex_invalidate(pl_gpu, pl_tex); +void gl_tex_clear_ex(pl_gpu, pl_tex, const union pl_clear_color); +void gl_tex_blit(pl_gpu, const struct pl_tex_blit_params *); +bool gl_tex_upload(pl_gpu, const struct pl_tex_transfer_params *); +bool gl_tex_download(pl_gpu, const struct pl_tex_transfer_params *); + +struct pl_buf_gl { + uint64_t id; // unique per buffer + GLuint buffer; + size_t offset; + GLsync fence; + GLbitfield barrier; + bool mapped; +}; + +pl_buf gl_buf_create(pl_gpu, const struct pl_buf_params *); +void gl_buf_destroy(pl_gpu, pl_buf); +void gl_buf_write(pl_gpu, pl_buf, size_t offset, const void *src, size_t size); +bool gl_buf_read(pl_gpu, pl_buf, size_t offset, void *dst, size_t size); +void gl_buf_copy(pl_gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size); +bool gl_buf_poll(pl_gpu, pl_buf, uint64_t timeout); + +struct pl_pass_gl; +int gl_desc_namespace(pl_gpu, enum pl_desc_type type); +pl_pass gl_pass_create(pl_gpu, const struct pl_pass_params *); +void gl_pass_destroy(pl_gpu, pl_pass); +void gl_pass_run(pl_gpu, const struct pl_pass_run_params *); diff --git a/src/opengl/gpu_pass.c b/src/opengl/gpu_pass.c new file mode 100644 index 0000000..58e69a5 --- /dev/null +++ b/src/opengl/gpu_pass.c @@ -0,0 +1,707 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "cache.h" +#include "formats.h" +#include "utils.h" + +int gl_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + return (int) type; +} + +struct gl_cache_header { + GLenum format; +}; + +static GLuint load_cached_program(pl_gpu gpu, pl_cache cache, pl_cache_obj *obj) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!gl_test_ext(gpu, "GL_ARB_get_program_binary", 41, 30)) + return 0; + + if (!pl_cache_get(cache, obj)) + return 0; + + if (obj->size < sizeof(struct gl_cache_header)) + return 0; + + GLuint prog = gl->CreateProgram(); + if (!gl_check_err(gpu, "load_cached_program: glCreateProgram")) + return 0; + + struct gl_cache_header *header = (struct gl_cache_header *) obj->data; + pl_str rest = (pl_str) { obj->data, obj->size }; + rest = pl_str_drop(rest, sizeof(*header)); + gl->ProgramBinary(prog, header->format, rest.buf, rest.len); + gl->GetError(); // discard potential useless error + + GLint status = 0; + gl->GetProgramiv(prog, GL_LINK_STATUS, &status); + if (status) + return prog; + + gl->DeleteProgram(prog); + gl_check_err(gpu, "load_cached_program: glProgramBinary"); + return 0; +} + +static enum pl_log_level gl_log_level(GLint status, GLint log_length) +{ + if (!status) { + return PL_LOG_ERR; + } else if (log_length > 0) { + return PL_LOG_INFO; + } else { + return PL_LOG_DEBUG; + } +} + +static bool gl_attach_shader(pl_gpu gpu, GLuint program, GLenum type, const char *src) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + GLuint shader = gl->CreateShader(type); + gl->ShaderSource(shader, 1, &src, NULL); + gl->CompileShader(shader); + + GLint status = 0; + gl->GetShaderiv(shader, GL_COMPILE_STATUS, &status); + GLint log_length = 0; + gl->GetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_length); + + enum pl_log_level level = gl_log_level(status, log_length); + if (pl_msg_test(gpu->log, level)) { + GLchar *logstr = pl_zalloc(NULL, log_length + 1); + gl->GetShaderInfoLog(shader, log_length, NULL, logstr); + PL_MSG(gpu, level, "shader compile log (status=%d): %s", status, logstr); + pl_free(logstr); + } + + if (!status || !gl_check_err(gpu, "gl_attach_shader")) + goto error; + + gl->AttachShader(program, shader); + gl->DeleteShader(shader); + return true; + +error: + gl->DeleteShader(shader); + return false; +} + +static GLuint gl_compile_program(pl_gpu gpu, const struct pl_pass_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + GLuint prog = gl->CreateProgram(); + bool ok = true; + + switch (params->type) { + case PL_PASS_COMPUTE: + ok &= gl_attach_shader(gpu, prog, GL_COMPUTE_SHADER, params->glsl_shader); + break; + case PL_PASS_RASTER: + ok &= gl_attach_shader(gpu, prog, GL_VERTEX_SHADER, params->vertex_shader); + ok &= gl_attach_shader(gpu, prog, GL_FRAGMENT_SHADER, params->glsl_shader); + for (int i = 0; i < params->num_vertex_attribs; i++) + gl->BindAttribLocation(prog, 
i, params->vertex_attribs[i].name); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + if (!ok || !gl_check_err(gpu, "gl_compile_program: attach shader")) + goto error; + + gl->LinkProgram(prog); + GLint status = 0; + gl->GetProgramiv(prog, GL_LINK_STATUS, &status); + GLint log_length = 0; + gl->GetProgramiv(prog, GL_INFO_LOG_LENGTH, &log_length); + + enum pl_log_level level = gl_log_level(status, log_length); + if (pl_msg_test(gpu->log, level)) { + GLchar *logstr = pl_zalloc(NULL, log_length + 1); + gl->GetProgramInfoLog(prog, log_length, NULL, logstr); + PL_MSG(gpu, level, "shader link log (status=%d): %s", status, logstr); + pl_free(logstr); + } + + if (!gl_check_err(gpu, "gl_compile_program: link program")) + goto error; + + return prog; + +error: + gl->DeleteProgram(prog); + PL_ERR(gpu, "Failed compiling/linking GLSL program"); + return 0; +} + +// For pl_pass.priv +struct pl_pass_gl { + GLuint program; + GLuint vao; // the VAO object + uint64_t vao_id; // buf_gl.id of VAO + size_t vao_offset; // VBO offset of VAO + GLuint buffer; // VBO for raw vertex pointers + GLuint index_buffer; + GLint *var_locs; +}; + +void gl_pass_destroy(pl_gpu gpu, pl_pass pass) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) { + PL_ERR(gpu, "Failed uninitializing pass, leaking resources!"); + return; + } + + struct pl_pass_gl *pass_gl = PL_PRIV(pass); + if (pass_gl->vao) + gl->DeleteVertexArrays(1, &pass_gl->vao); + gl->DeleteBuffers(1, &pass_gl->index_buffer); + gl->DeleteBuffers(1, &pass_gl->buffer); + gl->DeleteProgram(pass_gl->program); + + gl_check_err(gpu, "gl_pass_destroy"); + RELEASE_CURRENT(); + pl_free((void *) pass); +} + +static void gl_update_va(pl_gpu gpu, pl_pass pass, size_t vbo_offset) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + for (int i = 0; i < pass->params.num_vertex_attribs; i++) { + const struct pl_vertex_attrib *va = &pass->params.vertex_attribs[i]; + const struct gl_format **glfmtp = PL_PRIV(va->fmt); + const struct gl_format *glfmt = *glfmtp; + + bool norm = false; + switch (va->fmt->type) { + case PL_FMT_UNORM: + case PL_FMT_SNORM: + norm = true; + break; + + case PL_FMT_UNKNOWN: + case PL_FMT_FLOAT: + case PL_FMT_UINT: + case PL_FMT_SINT: + break; + case PL_FMT_TYPE_COUNT: + pl_unreachable(); + } + + gl->EnableVertexAttribArray(i); + gl->VertexAttribPointer(i, va->fmt->num_components, glfmt->type, norm, + pass->params.vertex_stride, + (void *) (va->offset + vbo_offset)); + } +} + +pl_pass gl_pass_create(pl_gpu gpu, const struct pl_pass_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return NULL; + + struct pl_gl *p = PL_PRIV(gpu); + struct pl_pass_t *pass = pl_zalloc_obj(NULL, pass, struct pl_pass_gl); + struct pl_pass_gl *pass_gl = PL_PRIV(pass); + pl_cache cache = pl_gpu_cache(gpu); + pass->params = pl_pass_params_copy(pass, params); + + pl_cache_obj obj = { .key = CACHE_KEY_GL_PROG }; + if (cache) { + pl_hash_merge(&obj.key, pl_str0_hash(params->glsl_shader)); + if (params->type == PL_PASS_RASTER) + pl_hash_merge(&obj.key, pl_str0_hash(params->vertex_shader)); + } + + // Load/Compile program + if ((pass_gl->program = load_cached_program(gpu, cache, &obj))) { + PL_DEBUG(gpu, "Using cached GL program"); + } else { + pl_clock_t start = pl_clock_now(); + pass_gl->program = gl_compile_program(gpu, params); + pl_log_cpu_time(gpu->log, start, pl_clock_now(), "compiling shader"); + } + + if (!pass_gl->program) + goto error; + + // Update program cache if possible + if (cache && 
gl_test_ext(gpu, "GL_ARB_get_program_binary", 41, 30)) { + GLint buf_size = 0; + gl->GetProgramiv(pass_gl->program, GL_PROGRAM_BINARY_LENGTH, &buf_size); + if (buf_size > 0) { + buf_size += sizeof(struct gl_cache_header); + pl_cache_obj_resize(NULL, &obj, buf_size); + struct gl_cache_header *header = obj.data; + void *buffer = &header[1]; + GLsizei binary_size = 0; + gl->GetProgramBinary(pass_gl->program, buf_size, &binary_size, + &header->format, buffer); + bool ok = gl_check_err(gpu, "gl_pass_create: get program binary"); + if (ok) { + obj.size = sizeof(*header) + binary_size; + pl_assert(obj.size <= buf_size); + pl_cache_set(cache, &obj); + } + } + } + + gl->UseProgram(pass_gl->program); + pass_gl->var_locs = pl_calloc(pass, params->num_variables, sizeof(GLint)); + + for (int i = 0; i < params->num_variables; i++) { + pass_gl->var_locs[i] = gl->GetUniformLocation(pass_gl->program, + params->variables[i].name); + + // Due to OpenGL API restrictions, we need to ensure that this is a + // variable type we can actually *update*. Fortunately, this is easily + // checked by virtue of the fact that all legal combinations of + // parameters will have a valid GLSL type name + if (!pl_var_glsl_type_name(params->variables[i])) { + gl->UseProgram(0); + PL_ERR(gpu, "Input variable '%s' does not match any known type!", + params->variables[i].name); + goto error; + } + } + + for (int i = 0; i < params->num_descriptors; i++) { + const struct pl_desc *desc = ¶ms->descriptors[i]; + switch (desc->type) { + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: { + // For compatibility with older OpenGL, we need to explicitly + // update the texture/image unit bindings after creating the shader + // program, since specifying it directly requires GLSL 4.20+ + GLint loc = gl->GetUniformLocation(pass_gl->program, desc->name); + gl->Uniform1i(loc, desc->binding); + break; + } + case PL_DESC_BUF_UNIFORM: { + GLuint idx = gl->GetUniformBlockIndex(pass_gl->program, desc->name); + gl->UniformBlockBinding(pass_gl->program, idx, desc->binding); + break; + } + case PL_DESC_BUF_STORAGE: { + GLuint idx = gl->GetProgramResourceIndex(pass_gl->program, + GL_SHADER_STORAGE_BLOCK, + desc->name); + gl->ShaderStorageBlockBinding(pass_gl->program, idx, desc->binding); + break; + } + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + assert(!"unimplemented"); // TODO + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + + gl->UseProgram(0); + + // Initialize the VAO and single vertex buffer + gl->GenBuffers(1, &pass_gl->buffer); + if (p->has_vao) { + gl->GenVertexArrays(1, &pass_gl->vao); + gl->BindBuffer(GL_ARRAY_BUFFER, pass_gl->buffer); + gl->BindVertexArray(pass_gl->vao); + gl_update_va(gpu, pass, 0); + gl->BindVertexArray(0); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); + } + + if (!gl_check_err(gpu, "gl_pass_create")) + goto error; + + pl_cache_obj_free(&obj); + RELEASE_CURRENT(); + return pass; + +error: + PL_ERR(gpu, "Failed creating pass"); + pl_cache_obj_free(&obj); + gl_pass_destroy(gpu, pass); + RELEASE_CURRENT(); + return NULL; +} + +static void update_var(pl_gpu gpu, pl_pass pass, + const struct pl_var_update *vu) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_pass_gl *pass_gl = PL_PRIV(pass); + const struct pl_var *var = &pass->params.variables[vu->index]; + GLint loc = pass_gl->var_locs[vu->index]; + + switch (var->type) { + case PL_VAR_SINT: { + const int *i = vu->data; + pl_assert(var->dim_m == 1); + switch (var->dim_v) { + case 1: gl->Uniform1iv(loc, var->dim_a, 
i); break; + case 2: gl->Uniform2iv(loc, var->dim_a, i); break; + case 3: gl->Uniform3iv(loc, var->dim_a, i); break; + case 4: gl->Uniform4iv(loc, var->dim_a, i); break; + default: pl_unreachable(); + } + return; + } + case PL_VAR_UINT: { + const unsigned int *u = vu->data; + pl_assert(var->dim_m == 1); + switch (var->dim_v) { + case 1: gl->Uniform1uiv(loc, var->dim_a, u); break; + case 2: gl->Uniform2uiv(loc, var->dim_a, u); break; + case 3: gl->Uniform3uiv(loc, var->dim_a, u); break; + case 4: gl->Uniform4uiv(loc, var->dim_a, u); break; + default: pl_unreachable(); + } + return; + } + case PL_VAR_FLOAT: { + const float *f = vu->data; + if (var->dim_m == 1) { + switch (var->dim_v) { + case 1: gl->Uniform1fv(loc, var->dim_a, f); break; + case 2: gl->Uniform2fv(loc, var->dim_a, f); break; + case 3: gl->Uniform3fv(loc, var->dim_a, f); break; + case 4: gl->Uniform4fv(loc, var->dim_a, f); break; + default: pl_unreachable(); + } + } else if (var->dim_m == 2 && var->dim_v == 2) { + gl->UniformMatrix2fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 3 && var->dim_v == 3) { + gl->UniformMatrix3fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 4 && var->dim_v == 4) { + gl->UniformMatrix4fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 2 && var->dim_v == 3) { + gl->UniformMatrix2x3fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 3 && var->dim_v == 2) { + gl->UniformMatrix3x2fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 2 && var->dim_v == 4) { + gl->UniformMatrix2x4fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 4 && var->dim_v == 2) { + gl->UniformMatrix4x2fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 3 && var->dim_v == 4) { + gl->UniformMatrix3x4fv(loc, var->dim_a, GL_FALSE, f); + } else if (var->dim_m == 4 && var->dim_v == 3) { + gl->UniformMatrix4x3fv(loc, var->dim_a, GL_FALSE, f); + } else { + pl_unreachable(); + } + return; + } + + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void update_desc(pl_gpu gpu, pl_pass pass, int index, + const struct pl_desc_binding *db) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + const struct pl_desc *desc = &pass->params.descriptors[index]; + + static const GLenum access[] = { + [PL_DESC_ACCESS_READWRITE] = GL_READ_WRITE, + [PL_DESC_ACCESS_READONLY] = GL_READ_ONLY, + [PL_DESC_ACCESS_WRITEONLY] = GL_WRITE_ONLY, + }; + + static const GLint wraps[PL_TEX_ADDRESS_MODE_COUNT] = { + [PL_TEX_ADDRESS_CLAMP] = GL_CLAMP_TO_EDGE, + [PL_TEX_ADDRESS_REPEAT] = GL_REPEAT, + [PL_TEX_ADDRESS_MIRROR] = GL_MIRRORED_REPEAT, + }; + + static const GLint filters[PL_TEX_SAMPLE_MODE_COUNT] = { + [PL_TEX_SAMPLE_NEAREST] = GL_NEAREST, + [PL_TEX_SAMPLE_LINEAR] = GL_LINEAR, + }; + + switch (desc->type) { + case PL_DESC_SAMPLED_TEX: { + pl_tex tex = db->object; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + gl->ActiveTexture(GL_TEXTURE0 + desc->binding); + gl->BindTexture(tex_gl->target, tex_gl->texture); + + GLint filter = filters[db->sample_mode]; + GLint wrap = wraps[db->address_mode]; + gl->TexParameteri(tex_gl->target, GL_TEXTURE_MIN_FILTER, filter); + gl->TexParameteri(tex_gl->target, GL_TEXTURE_MAG_FILTER, filter); + switch (pl_tex_params_dimension(tex->params)) { + case 3: gl->TexParameteri(tex_gl->target, GL_TEXTURE_WRAP_R, wrap); // fall through + case 2: gl->TexParameteri(tex_gl->target, GL_TEXTURE_WRAP_T, wrap); // fall through + case 1: gl->TexParameteri(tex_gl->target, GL_TEXTURE_WRAP_S, wrap); break; + } + return; + } + case 
PL_DESC_STORAGE_IMG: { + pl_tex tex = db->object; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + gl->BindImageTexture(desc->binding, tex_gl->texture, 0, GL_FALSE, 0, + access[desc->access], tex_gl->iformat); + return; + } + case PL_DESC_BUF_UNIFORM: { + pl_buf buf = db->object; + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + gl->BindBufferRange(GL_UNIFORM_BUFFER, desc->binding, buf_gl->buffer, + buf_gl->offset, buf->params.size); + return; + } + case PL_DESC_BUF_STORAGE: { + pl_buf buf = db->object; + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + gl->BindBufferRange(GL_SHADER_STORAGE_BUFFER, desc->binding, buf_gl->buffer, + buf_gl->offset, buf->params.size); + return; + } + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + assert(!"unimplemented"); // TODO + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void unbind_desc(pl_gpu gpu, pl_pass pass, int index, + const struct pl_desc_binding *db) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + const struct pl_desc *desc = &pass->params.descriptors[index]; + + switch (desc->type) { + case PL_DESC_SAMPLED_TEX: { + pl_tex tex = db->object; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + gl->ActiveTexture(GL_TEXTURE0 + desc->binding); + gl->BindTexture(tex_gl->target, 0); + return; + } + case PL_DESC_STORAGE_IMG: { + pl_tex tex = db->object; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + gl->BindImageTexture(desc->binding, 0, 0, GL_FALSE, 0, + GL_WRITE_ONLY, GL_R32F); + if (desc->access != PL_DESC_ACCESS_READONLY) + gl->MemoryBarrier(tex_gl->barrier); + return; + } + case PL_DESC_BUF_UNIFORM: + gl->BindBufferBase(GL_UNIFORM_BUFFER, desc->binding, 0); + return; + case PL_DESC_BUF_STORAGE: { + pl_buf buf = db->object; + struct pl_buf_gl *buf_gl = PL_PRIV(buf); + gl->BindBufferBase(GL_SHADER_STORAGE_BUFFER, desc->binding, 0); + if (desc->access != PL_DESC_ACCESS_READONLY) + gl->MemoryBarrier(buf_gl->barrier); + return; + } + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + assert(!"unimplemented"); // TODO + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +void gl_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + pl_pass pass = params->pass; + struct pl_pass_gl *pass_gl = PL_PRIV(pass); + struct pl_gl *p = PL_PRIV(gpu); + + gl->UseProgram(pass_gl->program); + + for (int i = 0; i < params->num_var_updates; i++) + update_var(gpu, pass, ¶ms->var_updates[i]); + for (int i = 0; i < pass->params.num_descriptors; i++) + update_desc(gpu, pass, i, ¶ms->desc_bindings[i]); + gl->ActiveTexture(GL_TEXTURE0); + + if (!gl_check_err(gpu, "gl_pass_run: updating uniforms")) { + RELEASE_CURRENT(); + return; + } + + switch (pass->params.type) { + case PL_PASS_RASTER: { + struct pl_tex_gl *target_gl = PL_PRIV(params->target); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, target_gl->fbo); + if (!pass->params.load_target && p->has_invalidate_fb) { + GLenum fb = target_gl->fbo ? 
GL_COLOR_ATTACHMENT0 : GL_COLOR; + gl->InvalidateFramebuffer(GL_DRAW_FRAMEBUFFER, 1, &fb); + } + + gl->Viewport(params->viewport.x0, params->viewport.y0, + pl_rect_w(params->viewport), pl_rect_h(params->viewport)); + gl->Scissor(params->scissors.x0, params->scissors.y0, + pl_rect_w(params->scissors), pl_rect_h(params->scissors)); + gl->Enable(GL_SCISSOR_TEST); + gl->Disable(GL_DEPTH_TEST); + gl->Disable(GL_CULL_FACE); + gl_check_err(gpu, "gl_pass_run: enabling viewport/scissor"); + + const struct pl_blend_params *blend = pass->params.blend_params; + if (blend) { + static const GLenum map_blend[] = { + [PL_BLEND_ZERO] = GL_ZERO, + [PL_BLEND_ONE] = GL_ONE, + [PL_BLEND_SRC_ALPHA] = GL_SRC_ALPHA, + [PL_BLEND_ONE_MINUS_SRC_ALPHA] = GL_ONE_MINUS_SRC_ALPHA, + }; + + gl->BlendFuncSeparate(map_blend[blend->src_rgb], + map_blend[blend->dst_rgb], + map_blend[blend->src_alpha], + map_blend[blend->dst_alpha]); + gl->Enable(GL_BLEND); + gl_check_err(gpu, "gl_pass_run: enabling blend"); + } + + // Update VBO and VAO + pl_buf vert = params->vertex_buf; + struct pl_buf_gl *vert_gl = vert ? PL_PRIV(vert) : NULL; + gl->BindBuffer(GL_ARRAY_BUFFER, vert ? vert_gl->buffer : pass_gl->buffer); + + if (!vert) { + // Update the buffer directly. In theory we could also do a memcmp + // cache here to avoid unnecessary updates. + gl->BufferData(GL_ARRAY_BUFFER, pl_vertex_buf_size(params), + params->vertex_data, GL_STREAM_DRAW); + } + + if (pass_gl->vao) + gl->BindVertexArray(pass_gl->vao); + + uint64_t vert_id = vert ? vert_gl->id : 0; + size_t vert_offset = vert ? params->buf_offset : 0; + if (!pass_gl->vao || pass_gl->vao_id != vert_id || + pass_gl->vao_offset != vert_offset) + { + // We need to update the VAO when the buffer ID or offset changes + gl_update_va(gpu, pass, vert_offset); + pass_gl->vao_id = vert_id; + pass_gl->vao_offset = vert_offset; + } + + gl_check_err(gpu, "gl_pass_run: update/bind vertex buffer"); + + static const GLenum map_prim[PL_PRIM_TYPE_COUNT] = { + [PL_PRIM_TRIANGLE_LIST] = GL_TRIANGLES, + [PL_PRIM_TRIANGLE_STRIP] = GL_TRIANGLE_STRIP, + }; + GLenum mode = map_prim[pass->params.vertex_type]; + + gl_timer_begin(gpu, params->timer); + + if (params->index_data) { + + static const GLenum index_fmts[PL_INDEX_FORMAT_COUNT] = { + [PL_INDEX_UINT16] = GL_UNSIGNED_SHORT, + [PL_INDEX_UINT32] = GL_UNSIGNED_INT, + }; + + // Upload indices to temporary buffer object + if (!pass_gl->index_buffer) + gl->GenBuffers(1, &pass_gl->index_buffer); // lazily allocated + gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, pass_gl->index_buffer); + gl->BufferData(GL_ELEMENT_ARRAY_BUFFER, pl_index_buf_size(params), + params->index_data, GL_STREAM_DRAW); + gl->DrawElements(mode, params->vertex_count, + index_fmts[params->index_fmt], 0); + gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); + + } else if (params->index_buf) { + + // The pointer argument becomes the index buffer offset + struct pl_buf_gl *index_gl = PL_PRIV(params->index_buf); + gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_gl->buffer); + gl->DrawElements(mode, params->vertex_count, GL_UNSIGNED_SHORT, + (void *) params->index_offset); + gl->BindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0); + + } else { + + // Note: the VBO offset is handled in the VAO + gl->DrawArrays(mode, 0, params->vertex_count); + } + + gl_timer_end(gpu, params->timer); + gl_check_err(gpu, "gl_pass_run: drawing"); + + if (pass_gl->vao) { + gl->BindVertexArray(0); + } else { + for (int i = 0; i < pass->params.num_vertex_attribs; i++) + gl->DisableVertexAttribArray(i); + } + + gl->BindBuffer(GL_ARRAY_BUFFER, 0); 
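+        // Restore the fixed-function state touched by this raster pass
+        // (scissor test, blending and the draw framebuffer binding)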
+ gl->Disable(GL_SCISSOR_TEST); + gl->Disable(GL_BLEND); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + break; + } + + case PL_PASS_COMPUTE: + gl_timer_begin(gpu, params->timer); + gl->DispatchCompute(params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + gl_timer_end(gpu, params->timer); + break; + + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + for (int i = 0; i < pass->params.num_descriptors; i++) + unbind_desc(gpu, pass, i, ¶ms->desc_bindings[i]); + gl->ActiveTexture(GL_TEXTURE0); + + gl->UseProgram(0); + gl_check_err(gpu, "gl_pass_run"); + RELEASE_CURRENT(); +} diff --git a/src/opengl/gpu_tex.c b/src/opengl/gpu_tex.c new file mode 100644 index 0000000..02eda77 --- /dev/null +++ b/src/opengl/gpu_tex.c @@ -0,0 +1,1078 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "formats.h" +#include "utils.h" + +#ifdef PL_HAVE_UNIX +#include <unistd.h> +#include <errno.h> +#endif + +void gl_tex_destroy(pl_gpu gpu, pl_tex tex) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) { + PL_ERR(gpu, "Failed uninitializing texture, leaking resources!"); + return; + } + + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + if (tex_gl->fbo && !tex_gl->wrapped_fb) + gl->DeleteFramebuffers(1, &tex_gl->fbo); + if (tex_gl->image) { + struct pl_gl *p = PL_PRIV(gpu); + eglDestroyImageKHR(p->egl_dpy, tex_gl->image); + } + if (!tex_gl->wrapped_tex) + gl->DeleteTextures(1, &tex_gl->texture); + +#ifdef PL_HAVE_UNIX + if (tex_gl->fd != -1) + close(tex_gl->fd); +#endif + + gl_check_err(gpu, "gl_tex_destroy"); + RELEASE_CURRENT(); + pl_free((void *) tex); +} + +static GLbitfield tex_barrier(pl_tex tex) +{ + GLbitfield barrier = 0; + const struct pl_tex_params *params = &tex->params; + + if (params->sampleable) + barrier |= GL_TEXTURE_FETCH_BARRIER_BIT; + if (params->renderable || params->blit_src || params->blit_dst) + barrier |= GL_FRAMEBUFFER_BARRIER_BIT; + if (params->storable) + barrier |= GL_SHADER_IMAGE_ACCESS_BARRIER_BIT; + if (params->host_writable || params->host_readable) + barrier |= GL_TEXTURE_UPDATE_BARRIER_BIT; + + return barrier; +} + +#define ADD_ATTRIB(name, value) \ + do { \ + assert(num_attribs + 3 < PL_ARRAY_SIZE(attribs)); \ + attribs[num_attribs++] = (name); \ + attribs[num_attribs++] = (value); \ + } while (0) + +#define ADD_DMABUF_PLANE_ATTRIBS(plane, fd, offset, stride) \ + do { \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _FD_EXT, \ + fd); \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _OFFSET_EXT, \ + offset); \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _PITCH_EXT, \ + stride); \ + } while (0) + +#define ADD_DMABUF_PLANE_MODIFIERS(plane, mod) \ + do { \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _MODIFIER_LO_EXT, \ + (uint32_t) ((mod) & 0xFFFFFFFFlu)); \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _MODIFIER_HI_EXT, \ + (uint32_t) 
(((mod) >> 32u) & 0xFFFFFFFFlu)); \ + } while (0) + +static bool gl_tex_import(pl_gpu gpu, + enum pl_handle_type handle_type, + const struct pl_shared_mem *shared_mem, + struct pl_tex_t *tex) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + if (!MAKE_CURRENT()) + return false; + + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + const struct pl_tex_params *params = &tex->params; + + int attribs[20] = {}; + int num_attribs = 0; + ADD_ATTRIB(EGL_WIDTH, params->w); + ADD_ATTRIB(EGL_HEIGHT, params->h); + + switch (handle_type) { + +#ifdef PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: + if (shared_mem->handle.fd == -1) { + PL_ERR(gpu, "%s: invalid fd", __func__); + goto error; + } + + tex_gl->fd = dup(shared_mem->handle.fd); + if (tex_gl->fd == -1) { + PL_ERR(gpu, "%s: cannot duplicate fd %d for importing: %s", + __func__, shared_mem->handle.fd, strerror(errno)); + goto error; + } + + ADD_ATTRIB(EGL_LINUX_DRM_FOURCC_EXT, params->format->fourcc); + ADD_DMABUF_PLANE_ATTRIBS(0, tex_gl->fd, shared_mem->offset, + PL_DEF(shared_mem->stride_w, params->w)); + if (p->has_modifiers) + ADD_DMABUF_PLANE_MODIFIERS(0, shared_mem->drm_format_mod); + + attribs[num_attribs] = EGL_NONE; + + // EGL_LINUX_DMA_BUF_EXT requires EGL_NO_CONTEXT + tex_gl->image = eglCreateImageKHR(p->egl_dpy, + EGL_NO_CONTEXT, + EGL_LINUX_DMA_BUF_EXT, + (EGLClientBuffer) NULL, + attribs); + + break; +#else // !PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: + pl_unreachable(); +#endif + + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_FD: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + pl_unreachable(); + + } + + if (!egl_check_err(gpu, "eglCreateImageKHR") || !tex_gl->image) + goto error; + + // tex_gl->image should be already bound + if (p->has_egl_storage) { + gl->EGLImageTargetTexStorageEXT(GL_TEXTURE_2D, tex_gl->image, NULL); + } else { + gl->EGLImageTargetTexture2DOES(GL_TEXTURE_2D, tex_gl->image); + } + if (!egl_check_err(gpu, "EGLImageTargetTexture2DOES")) + goto error; + + RELEASE_CURRENT(); + return true; + +error: + PL_ERR(gpu, "Failed importing GL texture!"); + RELEASE_CURRENT(); + return false; +} + +static EGLenum egl_from_gl_target(pl_gpu gpu, int target) +{ + switch(target) { + case GL_TEXTURE_2D: return EGL_GL_TEXTURE_2D; + case GL_TEXTURE_3D: return EGL_GL_TEXTURE_3D; + default: + PL_ERR(gpu, "%s: unsupported texture target 0x%x", __func__, target); + return 0; + } +} + +static bool gl_tex_export(pl_gpu gpu, enum pl_handle_type handle_type, + bool preserved, struct pl_tex_t *tex) +{ + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + struct pl_gl *p = PL_PRIV(gpu); + + EGLenum egltarget = egl_from_gl_target(gpu, tex_gl->target); + if (!egltarget) + goto error; + + int attribs[] = { + EGL_IMAGE_PRESERVED, preserved, + EGL_NONE, + }; + + // We assume that tex_gl->texture is already bound + tex_gl->image = eglCreateImageKHR(p->egl_dpy, + p->egl_ctx, + egltarget, + (EGLClientBuffer) (uintptr_t) tex_gl->texture, + attribs); + if (!egl_check_err(gpu, "eglCreateImageKHR") || !tex_gl->image) + goto error; + + switch (handle_type) { + +#ifdef PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: { + int fourcc = 0; + int num_planes = 0; + EGLuint64KHR modifier = 0; + bool ok; + ok = eglExportDMABUFImageQueryMESA(p->egl_dpy, + tex_gl->image, + &fourcc, + &num_planes, + &modifier); + if (!egl_check_err(gpu, "eglExportDMABUFImageQueryMESA") || !ok) + goto error; + + if (fourcc != tex->params.format->fourcc) { + PL_ERR(gpu, "Exported DRM format %s does not match fourcc of " + "specified 
pl_fmt %s? Please open a bug.", + PRINT_FOURCC(fourcc), PRINT_FOURCC(tex->params.format->fourcc)); + goto error; + } + + if (num_planes != 1) { + PL_ERR(gpu, "Unsupported number of planes: %d", num_planes); + goto error; + } + + int offset = 0, stride = 0; + ok = eglExportDMABUFImageMESA(p->egl_dpy, + tex_gl->image, + &tex_gl->fd, + &stride, + &offset); + if (!egl_check_err(gpu, "eglExportDMABUFImageMesa") || !ok) + goto error; + + off_t fdsize = lseek(tex_gl->fd, 0, SEEK_END); + off_t err = fdsize > 0 && lseek(tex_gl->fd, 0, SEEK_SET); + if (fdsize <= 0 || err < 0) { + PL_ERR(gpu, "Failed querying FD size: %s", strerror(errno)); + goto error; + } + + tex->shared_mem = (struct pl_shared_mem) { + .handle.fd = tex_gl->fd, + .size = fdsize, + .offset = offset, + .drm_format_mod = modifier, + .stride_w = stride, + }; + break; + } +#else // !PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: + pl_unreachable(); +#endif + + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_FD: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + pl_unreachable(); + + } + + return true; + +error: + PL_ERR(gpu, "Failed exporting GL texture!"); + return false; +} + +static const char *fb_err_str(GLenum err) +{ + switch (err) { +#define CASE(name) case name: return #name + CASE(GL_FRAMEBUFFER_COMPLETE); + CASE(GL_FRAMEBUFFER_UNDEFINED); + CASE(GL_FRAMEBUFFER_INCOMPLETE_ATTACHMENT); + CASE(GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT); + CASE(GL_FRAMEBUFFER_INCOMPLETE_DIMENSIONS); + CASE(GL_FRAMEBUFFER_INCOMPLETE_DRAW_BUFFER); + CASE(GL_FRAMEBUFFER_INCOMPLETE_READ_BUFFER); + CASE(GL_FRAMEBUFFER_UNSUPPORTED); + CASE(GL_FRAMEBUFFER_INCOMPLETE_MULTISAMPLE); + CASE(GL_FRAMEBUFFER_INCOMPLETE_LAYER_TARGETS); +#undef CASE + + default: return "unknown error"; + } +} + +pl_tex gl_tex_create(pl_gpu gpu, const struct pl_tex_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return NULL; + + struct pl_gl *p = PL_PRIV(gpu); + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_gl); + tex->params = *params; + tex->params.initial_data = NULL; + tex->sampler_type = PL_SAMPLER_NORMAL; + + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + + const struct gl_format **fmtp = PL_PRIV(params->format); + const struct gl_format *fmt = *fmtp; + *tex_gl = (struct pl_tex_gl) { + .format = fmt->fmt, + .iformat = fmt->ifmt, + .type = fmt->type, + .barrier = tex_barrier(tex), + .fd = -1, + }; + + static const GLint targets[] = { + [1] = GL_TEXTURE_1D, + [2] = GL_TEXTURE_2D, + [3] = GL_TEXTURE_3D, + }; + + int dims = pl_tex_params_dimension(*params); + pl_assert(dims >= 1 && dims <= 3); + tex_gl->target = targets[dims]; + + gl->GenTextures(1, &tex_gl->texture); + gl->BindTexture(tex_gl->target, tex_gl->texture); + + if (params->import_handle) { + if (!gl_tex_import(gpu, params->import_handle, ¶ms->shared_mem, tex)) + goto error; + } else { + gl->PixelStorei(GL_UNPACK_ALIGNMENT, 1); + + switch (dims) { + case 1: + gl->TexImage1D(tex_gl->target, 0, tex_gl->iformat, params->w, 0, + tex_gl->format, tex_gl->type, params->initial_data); + break; + case 2: + gl->TexImage2D(tex_gl->target, 0, tex_gl->iformat, params->w, params->h, + 0, tex_gl->format, tex_gl->type, params->initial_data); + break; + case 3: + gl->TexImage3D(tex_gl->target, 0, tex_gl->iformat, params->w, params->h, + params->d, 0, tex_gl->format, tex_gl->type, + params->initial_data); + break; + } + + gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4); + } + + if (params->export_handle) { + if (!gl_tex_export(gpu, 
params->export_handle, params->initial_data, tex)) + goto error; + } + + gl->BindTexture(tex_gl->target, 0); + + if (!gl_check_err(gpu, "gl_tex_create: texture")) + goto error; + + bool need_fbo = tex->params.renderable; + if (tex->params.blit_src || tex->params.blit_dst) { + if (dims != 2) { + PL_ERR(gpu, "Blittable textures may only be 2D!"); + goto error; + } + + need_fbo = true; + } + + bool can_fbo = tex->params.format->caps & PL_FMT_CAP_RENDERABLE && + tex->params.d == 0; + + // Try creating an FBO for host-readable textures, since this allows + // reading back with glReadPixels instead of glGetTexImage. (Additionally, + // GLES does not support glGetTexImage) + if (tex->params.host_readable && (can_fbo || p->gles_ver)) + need_fbo = true; + + if (need_fbo) { + if (!can_fbo) { + PL_ERR(gpu, "Trying to create a renderable/blittable/readable " + "texture with an incompatible (non-renderable) format!"); + goto error; + } + + gl->GenFramebuffers(1, &tex_gl->fbo); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo); + switch (dims) { + case 1: + gl->FramebufferTexture1D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_1D, tex_gl->texture, 0); + break; + case 2: + gl->FramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, tex_gl->texture, 0); + break; + case 3: pl_unreachable(); + } + + GLenum err = gl->CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER); + if (err != GL_FRAMEBUFFER_COMPLETE) { + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + PL_ERR(gpu, "Failed creating framebuffer: %s", fb_err_str(err)); + goto error; + } + + if (params->host_readable && p->gles_ver) { + GLint read_type = 0, read_fmt = 0; + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_TYPE, &read_type); + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_FORMAT, &read_fmt); + if (read_type != tex_gl->type || read_fmt != tex_gl->format) { + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + PL_ERR(gpu, "Trying to create host_readable texture whose " + "implementation-defined pixel read format " + "(type=0x%X, fmt=0x%X) does not match the texture's " + "internal format (type=0x%X, fmt=0x%X)! This is a " + "GLES/driver limitation, there's little we can do " + "about it.", + read_type, read_fmt, tex_gl->type, tex_gl->format); + goto error; + } + } + + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + if (!gl_check_err(gpu, "gl_tex_create: fbo")) + goto error; + } + + RELEASE_CURRENT(); + return tex; + +error: + gl_tex_destroy(gpu, tex); + RELEASE_CURRENT(); + return NULL; +} + +static bool gl_fb_query(pl_gpu gpu, int fbo, struct pl_fmt_t *fmt, + struct gl_format *glfmt) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + *fmt = (struct pl_fmt_t) { + .name = "fbo", + .type = PL_FMT_UNKNOWN, + .caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE | PL_FMT_CAP_BLENDABLE, + .num_components = 4, + .component_depth = {8, 8, 8, 8}, // default to rgba8 + .sample_order = {0, 1, 2, 3}, + }; + + *glfmt = (struct gl_format) { + .fmt = GL_RGBA, + }; + + bool can_query = gl_test_ext(gpu, "GL_ARB_framebuffer_object", 30, 20); + if (!fbo && p->gles_ver && p->gles_ver < 30) + can_query = false; // can't query default framebuffer on GLES 2.0 + + if (can_query) { + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo); + + GLenum obj = p->gles_ver ? 
GL_BACK : GL_BACK_LEFT; + if (fbo != 0) + obj = GL_COLOR_ATTACHMENT0; + + GLint type = 0; + gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE, &type); + switch (type) { + case GL_FLOAT: fmt->type = PL_FMT_FLOAT; break; + case GL_INT: fmt->type = PL_FMT_SINT; break; + case GL_UNSIGNED_INT: fmt->type = PL_FMT_UINT; break; + case GL_SIGNED_NORMALIZED: fmt->type = PL_FMT_SNORM; break; + case GL_UNSIGNED_NORMALIZED: fmt->type = PL_FMT_UNORM; break; + default: fmt->type = PL_FMT_UNKNOWN; break; + } + + gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE, &fmt->component_depth[0]); + gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE, &fmt->component_depth[1]); + gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE, &fmt->component_depth[2]); + gl->GetFramebufferAttachmentParameteriv(GL_DRAW_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE, &fmt->component_depth[3]); + + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + gl_check_err(gpu, "gl_fb_query"); + + if (!fmt->component_depth[0]) { + PL_INFO(gpu, "OpenGL framebuffer did not export depth information," + "assuming 8-bit framebuffer"); + for (int i = 0; i < PL_ARRAY_SIZE(fmt->component_depth); i++) + fmt->component_depth[i] = 8; + } + + // Strip missing components from component map + while (!fmt->component_depth[fmt->num_components - 1]) { + fmt->num_components--; + pl_assert(fmt->num_components); + } + } + + int gpu_bits = 0; + for (int i = 0; i < 4; i++) + gpu_bits += fmt->component_depth[i]; + fmt->internal_size = (gpu_bits + 7) / 8; + + size_t host_size = 0; + switch (fmt->type) { + case PL_FMT_UNKNOWN: + fmt->opaque = true; + return true; + case PL_FMT_FLOAT: + glfmt->type = GL_FLOAT; + host_size = sizeof(float); + break; + case PL_FMT_UNORM: + case PL_FMT_UINT: + if (gpu_bits > 32) { + glfmt->type = GL_UNSIGNED_SHORT; + host_size = sizeof(uint16_t); + } else { + glfmt->type = GL_UNSIGNED_BYTE; + host_size = sizeof(uint8_t); + } + break; + case PL_FMT_SNORM: + case PL_FMT_SINT: + if (gpu_bits > 32) { + glfmt->type = GL_SHORT; + host_size = sizeof(int16_t); + } else { + glfmt->type = GL_BYTE; + host_size = sizeof(int8_t); + } + break; + case PL_FMT_TYPE_COUNT: + pl_unreachable(); + } + + fmt->texel_size = fmt->num_components * host_size; + for (int i = 0; i < fmt->num_components; i++) + fmt->host_bits[i] = 8 * host_size; + fmt->caps |= PL_FMT_CAP_HOST_READABLE; + + return true; +} + +pl_tex pl_opengl_wrap(pl_gpu gpu, const struct pl_opengl_wrap_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return NULL; + + struct pl_gl *p = PL_PRIV(gpu); + struct pl_tex_t *tex = pl_alloc_obj(NULL, tex, struct pl_tex_gl); + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + *tex = (struct pl_tex_t) { + .params = { + .w = params->width, + .h = params->height, + .d = params->depth, + }, + }; + + pl_fmt fmt = NULL; + const struct gl_format *glfmt = NULL; + + if (params->texture) { + // Wrapping texture: Require matching iformat + pl_assert(params->iformat); + for (int i = 0; i < gpu->num_formats; i++) { + const struct gl_format **glfmtp = PL_PRIV(gpu->formats[i]); + if ((*glfmtp)->ifmt == params->iformat) { + fmt = gpu->formats[i]; + glfmt = *glfmtp; + break; + } + } + + if (!fmt) { + PL_ERR(gpu, "Failed mapping iformat %d to any equivalent `pl_fmt`", + params->iformat); + goto error; + } + } else { + // 
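        /* Worked example of gl_fb_query()'s read-back type selection above:
         * an RGBA8 framebuffer reports component_depth = {8,8,8,8}, so
         * gpu_bits = 32 and the host read-back type stays GL_UNSIGNED_BYTE
         * (texel_size = 4); an RGBA16 framebuffer reports 64 bits and is read
         * back as GL_UNSIGNED_SHORT (texel_size = 8). */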
Wrapping framebuffer: Allocate/infer generic FBO format + fmt = pl_alloc_obj((void *) gpu, fmt, const struct gl_format *); + glfmt = pl_alloc_ptr((void *) fmt, glfmt); + const struct gl_format **glfmtp = PL_PRIV(fmt); + *glfmtp = glfmt; + if (!gl_fb_query(gpu, params->framebuffer, + (struct pl_fmt_t *) fmt, + (struct gl_format *) glfmt)) + { + PL_ERR(gpu, "Failed querying framebuffer specifics!"); + pl_free((void *) fmt); + goto error; + } + } + + *tex_gl = (struct pl_tex_gl) { + .target = params->target, + .texture = params->texture, + .fbo = params->framebuffer, + .wrapped_tex = !!params->texture, + .wrapped_fb = params->framebuffer || !params->texture, + .iformat = glfmt->ifmt, + .format = glfmt->fmt, + .type = glfmt->type, + .fd = -1, + }; + + int dims = pl_tex_params_dimension(tex->params); + if (!tex_gl->target) { + switch (dims) { + case 1: tex_gl->target = GL_TEXTURE_1D; break; + case 2: tex_gl->target = GL_TEXTURE_2D; break; + case 3: tex_gl->target = GL_TEXTURE_3D; break; + } + } + + // Map texture-specific sampling metadata + if (params->texture) { + switch (params->target) { + case GL_TEXTURE_1D: + if (params->width || params->depth) { + PL_ERR(gpu, "Invalid texture dimensions for GL_TEXTURE_1D"); + goto error; + } + // fall through + case GL_TEXTURE_2D: + if (params->depth) { + PL_ERR(gpu, "Invalid texture dimensions for GL_TEXTURE_2D"); + goto error; + } + // fall through + case 0: + case GL_TEXTURE_3D: + tex->sampler_type = PL_SAMPLER_NORMAL; + break; + + case GL_TEXTURE_RECTANGLE: tex->sampler_type = PL_SAMPLER_RECT; break; + case GL_TEXTURE_EXTERNAL_OES: tex->sampler_type = PL_SAMPLER_EXTERNAL; break; + + default: + PL_ERR(gpu, "Failed mapping texture target %u to any equivalent " + "`pl_sampler_type`", params->target); + goto error; + } + } + + // Create optional extra fbo if needed/possible + bool can_fbo = tex_gl->texture && + (fmt->caps & PL_FMT_CAP_RENDERABLE) && + tex->sampler_type != PL_SAMPLER_EXTERNAL && + dims < 3; + + if (can_fbo && !tex_gl->fbo) { + gl->GenFramebuffers(1, &tex_gl->fbo); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo); + switch (dims) { + case 1: + gl->FramebufferTexture1D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + tex_gl->target, tex_gl->texture, 0); + break; + case 2: + gl->FramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + tex_gl->target, tex_gl->texture, 0); + break; + } + + GLenum err = gl->CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER); + if (err != GL_FRAMEBUFFER_COMPLETE) { + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + PL_ERR(gpu, "Failed creating framebuffer: error code %d", err); + goto error; + } + + if (p->gles_ver) { + GLint read_type = 0, read_fmt = 0; + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_TYPE, &read_type); + gl->GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_FORMAT, &read_fmt); + tex->params.host_readable = read_type == tex_gl->type && + read_fmt == tex_gl->format; + } else { + tex->params.host_readable = true; + } + + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + if (!gl_check_err(gpu, "pl_opengl_wrap: fbo")) + goto error; + } + + // Complete the process of inferring the texture capabilities + tex->params.format = fmt; + if (tex_gl->texture) { + tex->params.sampleable = fmt->caps & PL_FMT_CAP_SAMPLEABLE; + tex->params.storable = fmt->caps & PL_FMT_CAP_STORABLE; + tex->params.host_writable = !fmt->opaque; + tex->params.host_readable |= fmt->caps & PL_FMT_CAP_HOST_READABLE; + } + if (tex_gl->fbo || tex_gl->wrapped_fb) { + tex->params.renderable = fmt->caps & PL_FMT_CAP_RENDERABLE; + 
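    /* A typical use of pl_opengl_wrap() is wrapping the window's default
     * framebuffer, exactly as the swapchain code further below does (sketch;
     * `w` and `h` stand for the current drawable size):
     *
     *   pl_tex fb = pl_opengl_wrap(gpu, pl_opengl_wrap_params(
     *       .framebuffer = 0,
     *       .width       = w,
     *       .height      = h,
     *   ));
     */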
tex->params.host_readable |= fmt->caps & PL_FMT_CAP_HOST_READABLE; + if (dims == 2 && (fmt->caps & PL_FMT_CAP_BLITTABLE)) { + tex->params.blit_src = true; + tex->params.blit_dst = true; + } + } + + tex_gl->barrier = tex_barrier(tex); + RELEASE_CURRENT(); + return tex; + +error: + gl_tex_destroy(gpu, tex); + RELEASE_CURRENT(); + return NULL; +} + +unsigned int pl_opengl_unwrap(pl_gpu gpu, pl_tex tex, + unsigned int *out_target, int *out_iformat, + unsigned int *out_fbo) +{ + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + if (!tex_gl->texture) { + PL_ERR(gpu, "Trying to call `pl_opengl_unwrap` on a pseudo-texture " + "(perhaps obtained by `pl_swapchain_start_frame`?)"); + return 0; + } + + if (out_target) + *out_target = tex_gl->target; + if (out_iformat) + *out_iformat = tex_gl->iformat; + if (out_fbo) + *out_fbo = tex_gl->fbo; + + return tex_gl->texture; +} + +void gl_tex_invalidate(pl_gpu gpu, pl_tex tex) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + if (!MAKE_CURRENT()) + return; + + if (tex_gl->texture && p->has_invalidate_tex) + gl->InvalidateTexImage(tex_gl->texture, 0); + + if ((tex_gl->wrapped_fb || tex_gl->fbo) && p->has_invalidate_fb) { + GLenum attachment = tex_gl->fbo ? GL_COLOR_ATTACHMENT0 : GL_COLOR; + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo); + gl->InvalidateFramebuffer(GL_DRAW_FRAMEBUFFER, 1, &attachment); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + } + + gl_check_err(gpu, "gl_tex_invalidate"); + RELEASE_CURRENT(); +} + +void gl_tex_clear_ex(pl_gpu gpu, pl_tex tex, const union pl_clear_color color) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + pl_assert(tex_gl->fbo || tex_gl->wrapped_fb); + + switch (tex->params.format->type) { + case PL_FMT_UNKNOWN: + case PL_FMT_FLOAT: + case PL_FMT_UNORM: + case PL_FMT_SNORM: + gl->ClearColor(color.f[0], color.f[1], color.f[2], color.f[3]); + break; + + case PL_FMT_UINT: + gl->ClearColorIuiEXT(color.u[0], color.u[1], color.u[2], color.u[3]); + break; + + case PL_FMT_SINT: + gl->ClearColorIiEXT(color.i[0], color.i[1], color.i[2], color.i[3]); + break; + + case PL_FMT_TYPE_COUNT: + pl_unreachable(); + } + + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, tex_gl->fbo); + gl->Clear(GL_COLOR_BUFFER_BIT); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + gl_check_err(gpu, "gl_tex_clear"); + RELEASE_CURRENT(); +} + +void gl_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + if (!MAKE_CURRENT()) + return; + + struct pl_tex_gl *src_gl = PL_PRIV(params->src); + struct pl_tex_gl *dst_gl = PL_PRIV(params->dst); + + pl_assert(src_gl->fbo || src_gl->wrapped_fb); + pl_assert(dst_gl->fbo || dst_gl->wrapped_fb); + gl->BindFramebuffer(GL_READ_FRAMEBUFFER, src_gl->fbo); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, dst_gl->fbo); + + static const GLint filters[PL_TEX_SAMPLE_MODE_COUNT] = { + [PL_TEX_SAMPLE_NEAREST] = GL_NEAREST, + [PL_TEX_SAMPLE_LINEAR] = GL_LINEAR, + }; + + pl_rect3d src_rc = params->src_rc, dst_rc = params->dst_rc; + gl->BlitFramebuffer(src_rc.x0, src_rc.y0, src_rc.x1, src_rc.y1, + dst_rc.x0, dst_rc.y0, dst_rc.x1, dst_rc.y1, + GL_COLOR_BUFFER_BIT, filters[params->sample_mode]); + + gl->BindFramebuffer(GL_READ_FRAMEBUFFER, 0); + gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); + gl_check_err(gpu, "gl_tex_blit"); + RELEASE_CURRENT(); +} + +static int get_alignment(size_t pitch) +{ + if (pitch % 8 == 0) + return 8; + if (pitch % 4 
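    /* e.g. a 1920-pixel RGBA8 row (pitch 7680) gets alignment 8, while an
     * oddly sized 1001-byte row falls through to alignment 1 */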
== 0) + return 4; + if (pitch % 2 == 0) + return 2; + return 1; +} + +bool gl_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_buf buf = params->buf; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + struct pl_buf_gl *buf_gl = buf ? PL_PRIV(buf) : NULL; + + // If the user requests asynchronous uploads, it's more efficient to do + // them via a PBO - this allows us to skip blocking the caller, especially + // when the host pointer can be imported directly. + if (params->callback && !buf) { + size_t buf_size = pl_tex_transfer_size(params); + const size_t min_size = 32*1024; // 32 KiB + if (buf_size >= min_size && buf_size <= gpu->limits.max_buf_size) + return pl_tex_upload_pbo(gpu, params); + } + + if (!MAKE_CURRENT()) + return false; + + uintptr_t src = (uintptr_t) params->ptr; + if (buf) { + gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, buf_gl->buffer); + src = buf_gl->offset + params->buf_offset; + } + + bool misaligned = params->row_pitch % fmt->texel_size; + int stride_w = params->row_pitch / fmt->texel_size; + int stride_h = params->depth_pitch / params->row_pitch; + + int dims = pl_tex_params_dimension(tex->params); + if (dims > 1) + gl->PixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(params->row_pitch)); + + int rows = pl_rect_h(params->rc); + if (misaligned) { + rows = 1; + } else if (stride_w != pl_rect_w(params->rc)) { + gl->PixelStorei(GL_UNPACK_ROW_LENGTH, stride_w); + } + + int imgs = pl_rect_d(params->rc); + if (stride_h != pl_rect_h(params->rc) || rows < stride_h) + gl->PixelStorei(GL_UNPACK_IMAGE_HEIGHT, stride_h); + + gl->BindTexture(tex_gl->target, tex_gl->texture); + gl_timer_begin(gpu, params->timer); + + switch (dims) { + case 1: + gl->TexSubImage1D(tex_gl->target, 0, params->rc.x0, pl_rect_w(params->rc), + tex_gl->format, tex_gl->type, (void *) src); + break; + case 2: + for (int y = params->rc.y0; y < params->rc.y1; y += rows) { + gl->TexSubImage2D(tex_gl->target, 0, params->rc.x0, y, + pl_rect_w(params->rc), rows, tex_gl->format, + tex_gl->type, (void *) src); + src += params->row_pitch * rows; + } + break; + case 3: + for (int z = params->rc.z0; z < params->rc.z1; z += imgs) { + uintptr_t row_src = src; + for (int y = params->rc.y0; y < params->rc.y1; y += rows) { + gl->TexSubImage3D(tex_gl->target, 0, params->rc.x0, y, z, + pl_rect_w(params->rc), rows, imgs, + tex_gl->format, tex_gl->type, (void *) row_src); + row_src = (uintptr_t) row_src + params->row_pitch * rows; + } + src += params->depth_pitch * imgs; + } + break; + } + + gl_timer_end(gpu, params->timer); + gl->BindTexture(tex_gl->target, 0); + gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4); + gl->PixelStorei(GL_UNPACK_ROW_LENGTH, 0); + gl->PixelStorei(GL_UNPACK_IMAGE_HEIGHT, 0); + + if (buf) { + gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + if (buf->params.host_mapped) { + // Make sure the PBO is not reused until GL is done with it. If a + // previous operation is pending, "update" it by creating a new + // fence that will cover the previous operation as well. 
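            // (glDeleteSync() silently ignores a zero sync object, so no NULL
            // check is needed on the first upload; the new fence only signals
            // once all previously submitted GL commands, including any still
            // pending transfer from this buffer, have completed.)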
+ gl->DeleteSync(buf_gl->fence); + buf_gl->fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + } + } + + if (params->callback) { + PL_ARRAY_APPEND(gpu, p->callbacks, (struct gl_cb) { + .sync = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0), + .callback = params->callback, + .priv = params->priv, + }); + } + + bool ok = gl_check_err(gpu, "gl_tex_upload"); + RELEASE_CURRENT(); + return ok; +} + +bool gl_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + pl_buf buf = params->buf; + struct pl_tex_gl *tex_gl = PL_PRIV(tex); + struct pl_buf_gl *buf_gl = buf ? PL_PRIV(buf) : NULL; + bool ok = true; + + if (params->callback && !buf) { + size_t buf_size = pl_tex_transfer_size(params); + const size_t min_size = 32*1024; // 32 KiB + if (buf_size >= min_size && buf_size <= gpu->limits.max_buf_size) + return pl_tex_download_pbo(gpu, params); + } + + if (!MAKE_CURRENT()) + return false; + + uintptr_t dst = (uintptr_t) params->ptr; + if (buf) { + gl->BindBuffer(GL_PIXEL_PACK_BUFFER, buf_gl->buffer); + dst = buf_gl->offset + params->buf_offset; + } + + pl_rect3d full = { + 0, 0, 0, + tex->params.w, + PL_DEF(tex->params.h, 1), + PL_DEF(tex->params.d, 1), + }; + + bool misaligned = params->row_pitch % fmt->texel_size; + int stride_w = params->row_pitch / fmt->texel_size; + int stride_h = params->depth_pitch / params->row_pitch; + + int dims = pl_tex_params_dimension(tex->params); + bool is_copy = pl_rect3d_eq(params->rc, full) && + stride_w == tex->params.w && + stride_h == PL_DEF(tex->params.h, 1) && + !misaligned; + + gl_timer_begin(gpu, params->timer); + + if (tex_gl->fbo || tex_gl->wrapped_fb) { + // We can use a more efficient path when we have an FBO available + if (dims > 1) + gl->PixelStorei(GL_PACK_ALIGNMENT, get_alignment(params->row_pitch)); + + int rows = pl_rect_h(params->rc); + if (misaligned) { + rows = 1; + } else if (stride_w != tex->params.w) { + gl->PixelStorei(GL_PACK_ROW_LENGTH, stride_w); + } + + // No 3D framebuffers + pl_assert(pl_rect_d(params->rc) == 1); + + gl->BindFramebuffer(GL_READ_FRAMEBUFFER, tex_gl->fbo); + for (int y = params->rc.y0; y < params->rc.y1; y += rows) { + gl->ReadPixels(params->rc.x0, y, pl_rect_w(params->rc), rows, + tex_gl->format, tex_gl->type, (void *) dst); + dst += params->row_pitch * rows; + } + gl->BindFramebuffer(GL_READ_FRAMEBUFFER, 0); + gl->PixelStorei(GL_PACK_ALIGNMENT, 4); + gl->PixelStorei(GL_PACK_ROW_LENGTH, 0); + } else if (is_copy) { + // We're downloading the entire texture + gl->BindTexture(tex_gl->target, tex_gl->texture); + gl->GetTexImage(tex_gl->target, 0, tex_gl->format, tex_gl->type, (void *) dst); + gl->BindTexture(tex_gl->target, 0); + } else { + PL_ERR(gpu, "Partial downloads of 3D textures not implemented!"); + ok = false; + } + + gl_timer_end(gpu, params->timer); + + if (buf) { + gl->BindBuffer(GL_PIXEL_PACK_BUFFER, 0); + if (ok && buf->params.host_mapped) { + gl->DeleteSync(buf_gl->fence); + buf_gl->fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + } + } + + if (params->callback) { + PL_ARRAY_APPEND(gpu, p->callbacks, (struct gl_cb) { + .sync = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0), + .callback = params->callback, + .priv = params->priv, + }); + } + + ok &= gl_check_err(gpu, "gl_tex_download"); + RELEASE_CURRENT(); + return ok; +} diff --git a/src/opengl/include/glad/meson.build b/src/opengl/include/glad/meson.build new file mode 100644 index 
0000000..05b3f02 --- /dev/null +++ b/src/opengl/include/glad/meson.build @@ -0,0 +1,29 @@ +glad_check = run_command([ python, '-c', 'import glad; print(glad.__version__)' ], + env: python_env, + capture: true, + check: false, +) + +glad_ver = glad_check.returncode() == 0 ? glad_check.stdout().strip() : 'none' +glad_req = '>= 2.0' + +if not glad_ver.version_compare(glad_req) + error(f'glad (required: @glad_req@, found: @glad_ver@) was not found in ' + + 'PYTHONPATH or `3rdparty`. Please run `git submodule update --init` ' + + 'followed by `meson --wipe`.') +endif + +glad = custom_target('gl.h', + output: 'gl.h', + env: python_env, + command: [ + python, '-m', 'glad', '--out-path=@OUTDIR@/../../', + '--reproducible', '--merge', '--api=gl:core,gles2,egl', + '--extensions=' + ','.join(gl_extensions), 'c', '--header-only', '--mx' + ] + (opengl_link.allowed() ? ['--loader'] : []) +) + +glad_dep = declare_dependency( + include_directories: include_directories('..'), + sources: glad, +) diff --git a/src/opengl/loader_egl.c b/src/opengl/loader_egl.c new file mode 100644 index 0000000..0e04c71 --- /dev/null +++ b/src/opengl/loader_egl.c @@ -0,0 +1,2 @@ +#define GLAD_EGL_IMPLEMENTATION +#include "common.h" diff --git a/src/opengl/loader_gl.c b/src/opengl/loader_gl.c new file mode 100644 index 0000000..26b8bef --- /dev/null +++ b/src/opengl/loader_gl.c @@ -0,0 +1,2 @@ +#define GLAD_GL_IMPLEMENTATION +#include "common.h" diff --git a/src/opengl/meson.build b/src/opengl/meson.build new file mode 100644 index 0000000..59ba921 --- /dev/null +++ b/src/opengl/meson.build @@ -0,0 +1,76 @@ +opengl_build = get_option('opengl') +opengl_link = get_option('gl-proc-addr') + +if host_machine.system() == 'windows' or host_machine.system().endswith('bsd') or \ + host_machine.system() == 'dragonfly' + libdl = declare_dependency() +else + libdl = cc.find_library('dl', required : opengl_link) +endif +opengl_link = opengl_link.require(libdl.found()) +components.set('opengl', opengl_build.allowed()) +components.set('gl-proc-addr', opengl_link.allowed()) + +if opengl_build.allowed() + sources += [ + 'opengl/context.c', + 'opengl/formats.c', + 'opengl/loader_gl.c', + 'opengl/loader_egl.c', + 'opengl/gpu.c', + 'opengl/gpu_tex.c', + 'opengl/gpu_pass.c', + 'opengl/swapchain.c', + 'opengl/utils.c', + ] + + if opengl_link.allowed() + build_deps += libdl + tests += 'opengl_surfaceless.c' + endif + + gl_extensions = [ + 'GL_AMD_pinned_memory', + 'GL_ARB_buffer_storage', + 'GL_ARB_compute_shader', + 'GL_ARB_framebuffer_object', + 'GL_ARB_get_program_binary', + 'GL_ARB_invalidate_subdata', + 'GL_ARB_pixel_buffer_object', + 'GL_ARB_program_interface_query', + 'GL_ARB_shader_image_load_store', + 'GL_ARB_shader_storage_buffer_object', + 'GL_ARB_sync', + 'GL_ARB_texture_float', + 'GL_ARB_texture_gather', + 'GL_ARB_texture_rg', + 'GL_ARB_timer_query', + 'GL_ARB_uniform_buffer_object', + 'GL_ARB_vertex_array_object', + 'GL_EXT_EGL_image_storage', + 'GL_EXT_color_buffer_float', + 'GL_EXT_color_buffer_half_float', + 'GL_EXT_texture3D', + 'GL_EXT_texture_format_BGRA8888', + 'GL_EXT_texture_integer', + 'GL_EXT_texture_norm16', + 'GL_EXT_texture_rg', + 'GL_EXT_unpack_subimage', + 'GL_KHR_debug', + 'GL_OES_EGL_image', + 'GL_OES_EGL_image_external', + 'EGL_EXT_image_dma_buf_import', + 'EGL_EXT_image_dma_buf_import_modifiers', + 'EGL_EXT_platform_base', + 'EGL_KHR_debug', + 'EGL_KHR_image_base', + 'EGL_MESA_image_dma_buf_export', + 'EGL_MESA_platform_surfaceless', + ] + + # Generate GL loader + subdir('include/glad') +else + glad_dep = [] + 
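    # When OpenGL support is disabled, stubs.c (below) still provides the
    # public pl_opengl_* entry points; they log a fatal error or hit
    # pl_unreachable(), so dependent code keeps linking unchanged.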
sources += 'opengl/stubs.c' +endif diff --git a/src/opengl/stubs.c b/src/opengl/stubs.c new file mode 100644 index 0000000..20395f9 --- /dev/null +++ b/src/opengl/stubs.c @@ -0,0 +1,63 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "../common.h" +#include "log.h" + +#include <libplacebo/opengl.h> + +const struct pl_opengl_params pl_opengl_default_params = {0}; + +pl_opengl pl_opengl_create(pl_log log, const struct pl_opengl_params *params) +{ + pl_fatal(log, "libplacebo compiled without OpenGL support!"); + return NULL; +} + +void pl_opengl_destroy(pl_opengl *pgl) +{ + pl_opengl gl = *pgl; + pl_assert(!gl); +} + +pl_opengl pl_opengl_get(pl_gpu gpu) +{ + return NULL; +} + +pl_swapchain pl_opengl_create_swapchain(pl_opengl gl, + const struct pl_opengl_swapchain_params *params) +{ + pl_unreachable(); +} + +void pl_opengl_swapchain_update_fb(pl_swapchain sw, + const struct pl_opengl_framebuffer *fb) +{ + pl_unreachable(); +} + +pl_tex pl_opengl_wrap(pl_gpu gpu, const struct pl_opengl_wrap_params *params) +{ + pl_unreachable(); +} + +unsigned int pl_opengl_unwrap(pl_gpu gpu, pl_tex tex, unsigned int *out_target, + int *out_iformat, unsigned int *out_fbo) +{ + pl_unreachable(); +} diff --git a/src/opengl/swapchain.c b/src/opengl/swapchain.c new file mode 100644 index 0000000..46d5f9e --- /dev/null +++ b/src/opengl/swapchain.c @@ -0,0 +1,278 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" +#include "formats.h" +#include "gpu.h" +#include "swapchain.h" +#include "utils.h" +#include "pl_thread.h" + +struct priv { + struct pl_sw_fns impl; + + struct pl_opengl_swapchain_params params; + pl_opengl gl; + pl_mutex lock; + bool has_sync; + + // current parameters + pl_tex fb; + bool frame_started; + + // vsync fences + int swapchain_depth; + PL_ARRAY(GLsync) vsync_fences; +}; + +static const struct pl_sw_fns opengl_swapchain; + +pl_swapchain pl_opengl_create_swapchain(pl_opengl pl_gl, + const struct pl_opengl_swapchain_params *params) +{ + pl_gpu gpu = pl_gl->gpu; + + if (params->max_swapchain_depth < 0) { + PL_ERR(gpu, "Tried specifying negative swapchain depth?"); + return NULL; + } + + if (!gl_make_current(pl_gl)) + return NULL; + + struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv); + sw->log = gpu->log; + sw->gpu = gpu; + + struct priv *p = PL_PRIV(sw); + pl_mutex_init(&p->lock); + p->impl = opengl_swapchain; + p->params = *params; + p->has_sync = pl_opengl_has_ext(pl_gl, "GL_ARB_sync"); + p->gl = pl_gl; + + gl_release_current(pl_gl); + return sw; +} + +static void gl_sw_destroy(pl_swapchain sw) +{ + pl_gpu gpu = sw->gpu; + struct priv *p = PL_PRIV(sw); + + pl_gpu_flush(gpu); + pl_tex_destroy(gpu, &p->fb); + pl_mutex_destroy(&p->lock); + pl_free((void *) sw); +} + +static int gl_sw_latency(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + return p->params.max_swapchain_depth; +} + +static bool gl_sw_resize(pl_swapchain sw, int *width, int *height) +{ + struct priv *p = PL_PRIV(sw); + const int w = *width, h = *height; + + pl_mutex_lock(&p->lock); + if (p->fb && w == p->fb->params.w && h == p->fb->params.h) { + pl_mutex_unlock(&p->lock); + return true; + } + + if (p->frame_started && (w || h)) { + PL_ERR(sw, "Tried resizing the swapchain while a frame was in progress! " + "Please submit the current frame first."); + pl_mutex_unlock(&p->lock); + return false; + } + + if (w && h) { + pl_tex_destroy(sw->gpu, &p->fb); + p->fb = pl_opengl_wrap(sw->gpu, pl_opengl_wrap_params( + .framebuffer = p->params.framebuffer.id, + .width = w, + .height = h, + )); + if (!p->fb) { + PL_ERR(sw, "Failed wrapping OpenGL framebuffer!"); + pl_mutex_unlock(&p->lock); + return false; + } + } + + if (!p->fb) { + PL_ERR(sw, "Tried calling `pl_swapchain_resize` with unknown size! " + "This is forbidden for OpenGL. The first call to " + "`pl_swapchain_resize` must include the width and height of the " + "swapchain, because there's no way to figure this out from " + "within the API."); + pl_mutex_unlock(&p->lock); + return false; + } + + *width = p->fb->params.w; + *height = p->fb->params.h; + pl_mutex_unlock(&p->lock); + return true; +} + +void pl_opengl_swapchain_update_fb(pl_swapchain sw, + const struct pl_opengl_framebuffer *fb) +{ + struct priv *p = PL_PRIV(sw); + pl_mutex_lock(&p->lock); + if (p->frame_started) { + PL_ERR(sw,"Tried calling `pl_opengl_swapchain_update_fb` while a frame " + "was in progress! Please submit the current frame first."); + pl_mutex_unlock(&p->lock); + return; + } + + if (p->params.framebuffer.id != fb->id) + pl_tex_destroy(sw->gpu, &p->fb); + + p->params.framebuffer = *fb; + pl_mutex_unlock(&p->lock); +} + +static bool gl_sw_start_frame(pl_swapchain sw, + struct pl_swapchain_frame *out_frame) +{ + struct priv *p = PL_PRIV(sw); + pl_mutex_lock(&p->lock); + bool ok = false; + + if (!p->fb) { + PL_ERR(sw, "Unknown framebuffer size. 
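    /* The intended driving sequence for this swapchain, as implied by the
     * checks in these callbacks (sketch; `win_w`/`win_h` and the rendering
     * itself are placeholders):
     *
     *   int w = win_w, h = win_h;
     *   pl_swapchain_resize(sw, &w, &h);    // size must be supplied up front
     *
     *   struct pl_swapchain_frame frame;
     *   if (pl_swapchain_start_frame(sw, &frame)) {
     *       // ... render into frame.fbo ...
     *       pl_swapchain_submit_frame(sw);
     *       pl_swapchain_swap_buffers(sw);  // invokes params.swap_buffers()
     *   }
     */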
Please call `pl_swapchain_resize` " + "before `pl_swapchain_start_frame` for OpenGL swapchains!"); + goto error; + } + + if (p->frame_started) { + PL_ERR(sw, "Attempted calling `pl_swapchain_start` while a frame was " + "already in progress! Call `pl_swapchain_submit_frame` first."); + goto error; + } + + if (!gl_make_current(p->gl)) + goto error; + + *out_frame = (struct pl_swapchain_frame) { + .fbo = p->fb, + .flipped = !p->params.framebuffer.flipped, + .color_repr = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, + .alpha = p->fb->params.format->num_components == 4 + ? PL_ALPHA_PREMULTIPLIED + : PL_ALPHA_UNKNOWN, + .bits = { + // Just use the red channel in the absence of anything more + // sane to do, because the red channel is both guaranteed to + // exist and also typically has the minimum number of bits + // (which is arguably what matters for dithering) + .sample_depth = p->fb->params.format->component_depth[0], + .color_depth = p->fb->params.format->component_depth[0], + }, + }, + .color_space = pl_color_space_monitor, + }; + + p->frame_started = gl_check_err(sw->gpu, "gl_sw_start_frame"); + if (!p->frame_started) + goto error; + + // keep p->lock held + gl_release_current(p->gl); + return true; + +error: + gl_release_current(p->gl); + pl_mutex_unlock(&p->lock); + return ok; +} + +static bool gl_sw_submit_frame(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct gl_ctx *glctx = PL_PRIV(p->gl); + const gl_funcs *gl = &glctx->func; + if (!gl_make_current(p->gl)) { + p->frame_started = false; + pl_mutex_unlock(&p->lock); + return false; + } + + pl_assert(p->frame_started); + if (p->has_sync && p->params.max_swapchain_depth) { + GLsync fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + if (fence) + PL_ARRAY_APPEND(sw, p->vsync_fences, fence); + } + + gl->Flush(); + p->frame_started = false; + bool ok = gl_check_err(sw->gpu, "gl_sw_submit_frame"); + gl_release_current(p->gl); + pl_mutex_unlock(&p->lock); + + return ok; +} + +static void gl_sw_swap_buffers(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + struct gl_ctx *glctx = PL_PRIV(p->gl); + const gl_funcs *gl = &glctx->func; + if (!p->params.swap_buffers) { + PL_ERR(sw, "`pl_swapchain_swap_buffers` called but no " + "`params.swap_buffers` callback set!"); + return; + } + + pl_mutex_lock(&p->lock); + if (!gl_make_current(p->gl)) { + pl_mutex_unlock(&p->lock); + return; + } + + p->params.swap_buffers(p->params.priv); + + const int max_depth = p->params.max_swapchain_depth; + while (max_depth && p->vsync_fences.num >= max_depth) { + gl->ClientWaitSync(p->vsync_fences.elem[0], GL_SYNC_FLUSH_COMMANDS_BIT, 1e9); + gl->DeleteSync(p->vsync_fences.elem[0]); + PL_ARRAY_REMOVE_AT(p->vsync_fences, 0); + } + + gl_check_err(sw->gpu, "gl_sw_swap_buffers"); + gl_release_current(p->gl); + pl_mutex_unlock(&p->lock); +} + +static const struct pl_sw_fns opengl_swapchain = { + .destroy = gl_sw_destroy, + .latency = gl_sw_latency, + .resize = gl_sw_resize, + .start_frame = gl_sw_start_frame, + .submit_frame = gl_sw_submit_frame, + .swap_buffers = gl_sw_swap_buffers, +}; diff --git a/src/opengl/utils.c b/src/opengl/utils.c new file mode 100644 index 0000000..d96a3e7 --- /dev/null +++ b/src/opengl/utils.c @@ -0,0 +1,158 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "common.h" +#include "gpu.h" +#include "utils.h" + +const char *gl_err_str(GLenum err) +{ + switch (err) { +#define CASE(name) case name: return #name + CASE(GL_NO_ERROR); + CASE(GL_INVALID_ENUM); + CASE(GL_INVALID_VALUE); + CASE(GL_INVALID_OPERATION); + CASE(GL_INVALID_FRAMEBUFFER_OPERATION); + CASE(GL_OUT_OF_MEMORY); + CASE(GL_STACK_UNDERFLOW); + CASE(GL_STACK_OVERFLOW); +#undef CASE + + default: return "unknown error"; + } +} + +void gl_poll_callbacks(pl_gpu gpu) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + while (p->callbacks.num) { + struct gl_cb cb = p->callbacks.elem[0]; + GLenum res = gl->ClientWaitSync(cb.sync, 0, 0); + switch (res) { + case GL_ALREADY_SIGNALED: + case GL_CONDITION_SATISFIED: + PL_ARRAY_REMOVE_AT(p->callbacks, 0); + cb.callback(cb.priv); + continue; + + case GL_WAIT_FAILED: + PL_ARRAY_REMOVE_AT(p->callbacks, 0); + gl->DeleteSync(cb.sync); + p->failed = true; + gl_check_err(gpu, "gl_poll_callbacks"); // NOTE: will recurse! + return; + + case GL_TIMEOUT_EXPIRED: + return; + + default: + pl_unreachable(); + } + } +} + +bool gl_check_err(pl_gpu gpu, const char *fun) +{ + const gl_funcs *gl = gl_funcs_get(gpu); + struct pl_gl *p = PL_PRIV(gpu); + bool ret = true; + + while (true) { + GLenum error = gl->GetError(); + if (error == GL_NO_ERROR) + break; + PL_ERR(gpu, "%s: OpenGL error: %s", fun, gl_err_str(error)); + ret = false; + p->failed = true; + } + + gl_poll_callbacks(gpu); + return ret; +} + +bool gl_is_software(pl_opengl pl_gl) +{ + struct gl_ctx *glctx = PL_PRIV(pl_gl); + const gl_funcs *gl = &glctx->func; + const char *renderer = (char *) gl->GetString(GL_RENDERER); + return !renderer || + strcmp(renderer, "Software Rasterizer") == 0 || + strstr(renderer, "llvmpipe") || + strstr(renderer, "softpipe") || + strcmp(renderer, "Mesa X11") == 0 || + strcmp(renderer, "Apple Software Renderer") == 0; +} + +bool gl_is_gles(pl_opengl pl_gl) +{ + struct gl_ctx *glctx = PL_PRIV(pl_gl); + const gl_funcs *gl = &glctx->func; + const char *version = (char *) gl->GetString(GL_VERSION); + return pl_str_startswith0(pl_str0(version), "OpenGL ES"); +} + +bool gl_test_ext(pl_gpu gpu, const char *ext, int gl_ver, int gles_ver) +{ + struct pl_gl *p = PL_PRIV(gpu); + if (gl_ver && p->gl_ver >= gl_ver) + return true; + if (gles_ver && p->gles_ver >= gles_ver) + return true; + + return ext ? 
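    /* e.g. gl_test_ext(gpu, "GL_ARB_framebuffer_object", 30, 20), as used by
     * gl_fb_query() above, passes on desktop GL >= 3.0, on GLES >= 2.0, or
     * (failing that) if the extension string itself is present */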
pl_opengl_has_ext(p->gl, ext) : false; +} + +const char *egl_err_str(EGLenum err) +{ + switch (err) { +#define CASE(name) case name: return #name + CASE(EGL_SUCCESS); + CASE(EGL_NOT_INITIALIZED); + CASE(EGL_BAD_ACCESS); + CASE(EGL_BAD_ALLOC); + CASE(EGL_BAD_ATTRIBUTE); + CASE(EGL_BAD_CONFIG); + CASE(EGL_BAD_CONTEXT); + CASE(EGL_BAD_CURRENT_SURFACE); + CASE(EGL_BAD_DISPLAY); + CASE(EGL_BAD_MATCH); + CASE(EGL_BAD_NATIVE_PIXMAP); + CASE(EGL_BAD_NATIVE_WINDOW); + CASE(EGL_BAD_PARAMETER); + CASE(EGL_BAD_SURFACE); +#undef CASE + + default: return "unknown error"; + } +} + +bool egl_check_err(pl_gpu gpu, const char *fun) +{ + struct pl_gl *p = PL_PRIV(gpu); + bool ret = true; + + while (true) { + GLenum error = eglGetError(); + if (error == EGL_SUCCESS) + return ret; + PL_ERR(gpu, "%s: EGL error: %s", fun, egl_err_str(error)); + ret = false; + p->failed = true; + } +} diff --git a/src/opengl/utils.h b/src/opengl/utils.h new file mode 100644 index 0000000..0be229d --- /dev/null +++ b/src/opengl/utils.h @@ -0,0 +1,57 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +// Iterate through callbacks attached to the `pl_gl` and execute all of the +// ones that have completed. +// +// Thread-safety: Unsafe +void gl_poll_callbacks(pl_gpu gpu); + +// Return a human-readable name for various OpenGL errors +// +// Thread-safety: Safe +const char *gl_err_str(GLenum err); + +// Check for errors and log them + return false if detected +// +// Thread-safety: Unsafe +bool gl_check_err(pl_gpu gpu, const char *fun); + +// Returns true if the context is a suspected software rasterizer +// +// Thread-safety: Unsafe +bool gl_is_software(pl_opengl gl); + +// Returns true if the context is detected as OpenGL ES +// +// Thread-safety: Unsafe +bool gl_is_gles(pl_opengl gl); + +// Check for presence of an extension, alternatively a minimum GL version +// +// Thread-safety: Unsafe +bool gl_test_ext(pl_gpu gpu, const char *ext, int gl_ver, int gles_ver); + +// Thread-safety: Safe +const char *egl_err_str(EGLenum err); + +// Thread-safety: Unsafe +bool egl_check_err(pl_gpu gpu, const char *fun); diff --git a/src/options.c b/src/options.c new file mode 100644 index 0000000..1db53bf --- /dev/null +++ b/src/options.c @@ -0,0 +1,1166 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" +#include "log.h" + +#include <libplacebo/options.h> + +struct priv { + pl_log log; + + // for pl_options_get + struct pl_opt_data_t data; + pl_str data_text; + + // for pl_options_save + pl_str saved; + + // internally managed hooks array + PL_ARRAY(const struct pl_hook *) hooks; +}; + +static const struct pl_options_t defaults = { + .params = { PL_RENDER_DEFAULTS }, + .deband_params = { PL_DEBAND_DEFAULTS }, + .sigmoid_params = { PL_SIGMOID_DEFAULTS }, + .color_adjustment = { PL_COLOR_ADJUSTMENT_NEUTRAL }, + .peak_detect_params = { PL_PEAK_DETECT_DEFAULTS }, + .color_map_params = { PL_COLOR_MAP_DEFAULTS }, + .dither_params = { PL_DITHER_DEFAULTS }, + .icc_params = { PL_ICC_DEFAULTS }, + .cone_params = { PL_CONE_NONE, 1.0 }, + .deinterlace_params = { PL_DEINTERLACE_DEFAULTS }, + .distort_params = { PL_DISTORT_DEFAULTS }, + .upscaler = { + .name = "custom", + .description = "Custom upscaler", + .allowed = PL_FILTER_UPSCALING, + }, + .downscaler = { + .name = "custom", + .description = "Custom downscaler", + .allowed = PL_FILTER_DOWNSCALING, + }, + .plane_upscaler = { + .name = "custom", + .description = "Custom plane upscaler", + .allowed = PL_FILTER_UPSCALING, + }, + .plane_downscaler = { + .name = "custom", + .description = "Custom plane downscaler", + .allowed = PL_FILTER_DOWNSCALING, + }, + .frame_mixer = { + .name = "custom", + .description = "Custom frame mixer", + .allowed = PL_FILTER_FRAME_MIXING, + }, +}; + +// Copies only whitelisted fields +static inline void copy_filter(struct pl_filter_config *dst, + const struct pl_filter_config *src) +{ + dst->kernel = src->kernel; + dst->window = src->window; + dst->radius = src->radius; + dst->clamp = src->clamp; + dst->blur = src->blur; + dst->taper = src->taper; + dst->polar = src->polar; + for (int i = 0; i < PL_FILTER_MAX_PARAMS; i++) { + dst->params[i] = src->params[i]; + dst->wparams[i] = src->wparams[i]; + } +} + +static inline void redirect_params(pl_options opts) +{ + // Copy all non-NULL params structs into pl_options and redirect them +#define REDIRECT_PARAMS(field) do \ +{ \ + if (opts->params.field) { \ + opts->field = *opts->params.field; \ + opts->params.field = &opts->field; \ + } \ +} while (0) + + REDIRECT_PARAMS(deband_params); + REDIRECT_PARAMS(sigmoid_params); + REDIRECT_PARAMS(color_adjustment); + REDIRECT_PARAMS(peak_detect_params); + REDIRECT_PARAMS(color_map_params); + REDIRECT_PARAMS(dither_params); + REDIRECT_PARAMS(icc_params); + REDIRECT_PARAMS(cone_params); + REDIRECT_PARAMS(deinterlace_params); + REDIRECT_PARAMS(distort_params); +} + +void pl_options_reset(pl_options opts, const struct pl_render_params *preset) +{ + *opts = defaults; + if (preset) + opts->params = *preset; + redirect_params(opts); + + // Make a copy of all scaler configurations that aren't built-in filters + struct { + bool upscaler; + bool downscaler; + bool plane_upscaler; + bool plane_downscaler; + bool frame_mixer; + } fixed = {0}; + + for (int i = 0; i < pl_num_filter_configs; i++) { + const struct pl_filter_config *f = pl_filter_configs[i]; + fixed.upscaler |= f == opts->params.upscaler; + fixed.downscaler |= f == opts->params.downscaler; + fixed.plane_upscaler |= f == opts->params.plane_upscaler; + fixed.plane_downscaler |= f == opts->params.plane_downscaler; + fixed.frame_mixer |= f == opts->params.frame_mixer; + } + +#define 
REDIRECT_SCALER(scaler) do \ +{ \ + if (opts->params.scaler && !fixed.scaler) { \ + copy_filter(&opts->scaler, opts->params.scaler); \ + opts->params.scaler = &opts->scaler; \ + } \ +} while (0) + + REDIRECT_SCALER(upscaler); + REDIRECT_SCALER(downscaler); + REDIRECT_SCALER(plane_upscaler); + REDIRECT_SCALER(plane_downscaler); + REDIRECT_SCALER(frame_mixer); +} + +pl_options pl_options_alloc(pl_log log) +{ + struct pl_options_t *opts = pl_zalloc_obj(NULL, opts, struct priv); + struct priv *p = PL_PRIV(opts); + pl_options_reset(opts, NULL); + p->log = log; + return opts; +} + +void pl_options_free(pl_options *popts) +{ + pl_free_ptr((void **) popts); +} + +static void make_hooks_internal(pl_options opts) +{ + struct priv *p = PL_PRIV(opts); + struct pl_render_params *params = &opts->params; + if (params->num_hooks && params->hooks != p->hooks.elem) { + PL_ARRAY_MEMDUP(opts, p->hooks, params->hooks, params->num_hooks); + params->hooks = p->hooks.elem; + } +} + +void pl_options_add_hook(pl_options opts, const struct pl_hook *hook) +{ + struct priv *p = PL_PRIV(opts); + make_hooks_internal(opts); + PL_ARRAY_APPEND(opts, p->hooks, hook); + opts->params.hooks = p->hooks.elem; +} + +void pl_options_insert_hook(pl_options opts, const struct pl_hook *hook, int idx) +{ + struct priv *p = PL_PRIV(opts); + make_hooks_internal(opts); + PL_ARRAY_INSERT_AT(opts, p->hooks, idx, hook); + opts->params.hooks = p->hooks.elem; +} + +void pl_options_remove_hook_at(pl_options opts, int idx) +{ + struct priv *p = PL_PRIV(opts); + make_hooks_internal(opts); + PL_ARRAY_REMOVE_AT(p->hooks, idx); + opts->params.hooks = p->hooks.elem; +} + +// Options printing/parsing context +typedef const struct opt_ctx_t { + pl_log log; // as a convenience, only needed when parsing + pl_opt opt; + void *alloc; // for printing only + pl_options opts; // current base ptr +} *opt_ctx; + +struct enum_val { + const char *name; + unsigned val; +}; + +struct preset { + const char *name; + const void *val; +}; + +struct named { + const char *name; +}; + +typedef const struct opt_priv_t { + int (*compare)(opt_ctx p, const void *a, const void *b); // optional + void (*print)(opt_ctx p, pl_str *out, const void *val); // apends to `out` + bool (*parse)(opt_ctx p, pl_str str, void *out_val); + const struct enum_val *values; // for enums, terminated by {0} + const struct preset *presets; // for preset lists, terminated by {0} + const struct named * const *names; // for array-backed options, terminated by NULL + + // Offset and size of option in `struct pl_options_t` + size_t offset; + size_t size; + size_t offset_params; // offset of actual struct (for params toggles) +} *opt_priv; + +static pl_opt_data get_opt_data(opt_ctx ctx) +{ + pl_options opts = ctx->opts; + struct priv *p = PL_PRIV(opts); + opt_priv priv = ctx->opt->priv; + const void *val = (void *) ((uintptr_t) opts + priv->offset); + + p->data_text.len = 0; + priv->print(ctx, &p->data_text, val); + p->data = (struct pl_opt_data_t) { + .opts = opts, + .opt = ctx->opt, + .value = val, + .text = (char *) p->data_text.buf, + }; + + return &p->data; +} + +pl_opt_data pl_options_get(pl_options opts, const char *key) +{ + struct priv *p = PL_PRIV(opts); + + pl_opt opt = pl_find_option(key); + if (!opt || opt->preset) { + PL_ERR(p, "Unrecognized or invalid option '%s'", key); + return NULL; + } + + return get_opt_data(&(struct opt_ctx_t) { + .alloc = opts, + .opts = opts, + .opt = opt, + }); +} + +void pl_options_iterate(pl_options opts, + void (*cb)(void *priv, pl_opt_data data), + void 
*cb_priv) +{ + for (pl_opt opt = pl_option_list; opt->key; opt++) { + if (opt->preset) + continue; + + struct opt_ctx_t ctx = { + .alloc = opts, + .opts = opts, + .opt = opt, + }; + + opt_priv priv = opt->priv; + const void *val = (void *) ((uintptr_t) opts + priv->offset); + const void *ref = (void *) ((uintptr_t) &defaults + priv->offset); + int cmp = priv->compare ? priv->compare(&ctx, val, ref) + : memcmp(val, ref, priv->size); + if (cmp != 0) + cb(cb_priv, get_opt_data(&ctx)); + } +} + +static void save_cb(void *priv, pl_opt_data data) +{ + pl_opt opt = data->opt; + void *alloc = data->opts; + pl_str *out = priv; + + if (out->len) + pl_str_append_raw(alloc, out, ",", 1); + pl_str_append_raw(alloc, out, opt->key, strlen(opt->key)); + pl_str_append_raw(alloc, out, "=", 1); + pl_str_append(alloc, out, pl_str0(data->text)); +} + +const char *pl_options_save(pl_options opts) +{ + struct priv *p = PL_PRIV(opts); + + p->saved.len = 0; + pl_options_iterate(opts, save_cb, &p->saved); + return p->saved.len ? (char *) p->saved.buf : ""; +} + +static bool option_set_raw(pl_options opts, pl_str k, pl_str v) +{ + struct priv *p = PL_PRIV(opts); + k = pl_str_strip(k); + v = pl_str_strip(v); + + pl_opt opt; + for (opt = pl_option_list; opt->key; opt++) { + if (pl_str_equals0(k, opt->key)) + goto found; + } + + PL_ERR(p, "Unrecognized option '%.*s', in '%.*s=%.*s'", + PL_STR_FMT(k), PL_STR_FMT(k), PL_STR_FMT(v)); + return false; + +found: + PL_TRACE(p, "Parsing option '%s' = '%.*s'", opt->key, PL_STR_FMT(v)); + if (opt->deprecated) + PL_WARN(p, "Option '%s' is deprecated", opt->key); + + struct opt_ctx_t ctx = { + .log = p->log, + .opts = opts, + .opt = opt, + }; + + opt_priv priv = opt->priv; + void *val = (void *) ((uintptr_t) opts + priv->offset); + return priv->parse(&ctx, v, val); +} + +bool pl_options_set_str(pl_options opts, const char *key, const char *value) +{ + return option_set_raw(opts, pl_str0(key), pl_str0(value)); +} + +bool pl_options_load(pl_options opts, const char *str) +{ + bool ret = true; + + pl_str rest = pl_str0(str); + while (rest.len) { + pl_str kv = pl_str_strip(pl_str_split_chars(rest, " ,;:\n", &rest)); + if (!kv.len) + continue; + pl_str v, k = pl_str_split_char(kv, '=', &v); + ret &= option_set_raw(opts, k, v); + } + + return ret; +} + +// Individual option types + +static void print_bool(opt_ctx p, pl_str *out, const void *ptr) +{ + const bool *val = ptr; + if (*val) { + pl_str_append(p->alloc, out, pl_str0("yes")); + } else { + pl_str_append(p->alloc, out, pl_str0("no")); + } +} + +static bool parse_bool(opt_ctx p, pl_str str, void *out) +{ + bool *res = out; + if (pl_str_equals0(str, "yes") || + pl_str_equals0(str, "y") || + pl_str_equals0(str, "on") || + pl_str_equals0(str, "true") || + pl_str_equals0(str, "enabled") || + !str.len) // accept naked option name as well + { + *res = true; + return true; + } else if (pl_str_equals0(str, "no") || + pl_str_equals0(str, "n") || + pl_str_equals0(str, "off") || + pl_str_equals0(str, "false") || + pl_str_equals0(str, "disabled")) + { + *res = false; + return true; + } + + PL_ERR(p, "Invalid value '%.*s' for option '%s', expected boolean", + PL_STR_FMT(str), p->opt->key); + return false; +} + +static void print_int(opt_ctx p, pl_str *out, const void *ptr) +{ + pl_opt opt = p->opt; + const int *val = ptr; + pl_assert(opt->min == opt->max || (*val >= opt->min && *val <= opt->max)); + pl_str_append_asprintf_c(p->alloc, out, "%d", *val); +} + +static bool parse_int(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + int 
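    /* A minimal usage sketch for the pl_options API above (the option keys
     * come from pl_option_list further down; "lanczos" is assumed to be one
     * of the built-in pl_filter_configs):
     *
     *   pl_options opts = pl_options_alloc(log);
     *   pl_options_load(opts, "preset=high_quality,deband=yes,upscaler=lanczos");
     *   pl_options_set_str(opts, "deband_iterations", "2");
     *   // render with &opts->params, then persist only the non-defaults:
     *   printf("%s\n", pl_options_save(opts));
     *   pl_options_free(&opts);
     */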
val; + if (!pl_str_parse_int(str, &val)) { + PL_ERR(p, "Invalid value '%.*s' for option '%s', expected integer", + PL_STR_FMT(str), opt->key); + return false; + } + + if (opt->min != opt->max) { + if (val < opt->min || val > opt->max) { + PL_ERR(p, "Value of %d out of range for option '%s': [%d, %d]", + val, opt->key, (int) opt->min, (int) opt->max); + return false; + } + } + + *(int *) out = val; + return true; +} + +static void print_float(opt_ctx p, pl_str *out, const void *ptr) +{ + pl_opt opt = p->opt; + const float *val = ptr; + pl_assert(opt->min == opt->max || (*val >= opt->min && *val <= opt->max)); + pl_str_append_asprintf_c(p->alloc, out, "%f", *val); +} + +static bool parse_fraction(pl_str str, float *val) +{ + pl_str denom, num = pl_str_split_char(str, '/', &denom); + float n, d; + bool ok = denom.buf && denom.len && pl_str_parse_float(num, &n) && + pl_str_parse_float(denom, &d); + if (ok) + *val = n / d; + return ok; +} + +static bool parse_float(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + float val; + if (!parse_fraction(str, &val) && !pl_str_parse_float(str, &val)) { + PL_ERR(p, "Invalid value '%.*s' for option '%s', expected floating point " + "or fraction", PL_STR_FMT(str), opt->key); + return false; + } + + switch (fpclassify(val)) { + case FP_NAN: + case FP_INFINITE: + case FP_SUBNORMAL: + PL_ERR(p, "Invalid value '%f' for option '%s', non-normal float", + val, opt->key); + return false; + + case FP_ZERO: + case FP_NORMAL: + break; + } + + if (opt->min != opt->max) { + if (val < opt->min || val > opt->max) { + PL_ERR(p, "Value of %.3f out of range for option '%s': [%.2f, %.2f]", + val, opt->key, opt->min, opt->max); + return false; + } + } + + *(float *) out = val; + return true; +} + +static int compare_params(opt_ctx p, const void *pa, const void *pb) +{ + const bool a = *(const void * const *) pa; + const bool b = *(const void * const *) pb; + return PL_CMP(a, b); +} + +static void print_params(opt_ctx p, pl_str *out, const void *ptr) +{ + const bool value = *(const void * const *) ptr; + print_bool(p, out, &value); +} + +static bool parse_params(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + const void **res = out; + bool set; + if (!parse_bool(p, str, &set)) + return false; + if (set) { + *res = (const void *) ((uintptr_t) p->opts + priv->offset_params); + } else { + *res = NULL; + } + return true; +} + +static void print_enum(opt_ctx p, pl_str *out, const void *ptr) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + const unsigned value = *(const unsigned *) ptr; + for (int i = 0; priv->values[i].name; i++) { + if (priv->values[i].val == value) { + pl_str_append(p->alloc, out, pl_str0(priv->values[i].name)); + return; + } + } + + pl_unreachable(); +} + +static bool parse_enum(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + for (int i = 0; priv->values[i].name; i++) { + if (pl_str_equals0(str, priv->values[i].name)) { + *(unsigned *) out = priv->values[i].val; + return true; + } + } + + PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:", + PL_STR_FMT(str), opt->key); + for (int i = 0; priv->values[i].name; i++) + PL_ERR(p, " %s", priv->values[i].name); + return false; +} + +static bool parse_preset(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + for (int i = 0; priv->presets[i].name; i++) { + if (pl_str_equals0(str, priv->presets[i].name)) { + if (priv->offset == offsetof(struct pl_options_t, 
params)) { + const struct pl_render_params *preset = priv->presets[i].val; + pl_assert(priv->size == sizeof(*preset)); + + // Redirect params structs into internal system after loading + struct pl_render_params *params = out, prev = *params; + *params = *preset; + redirect_params(p->opts); + + // Re-apply excluded options + params->lut = prev.lut; + params->hooks = prev.hooks; + params->num_hooks = prev.num_hooks; + params->info_callback = prev.info_callback; + params->info_priv = prev.info_priv; + } else { + memcpy(out, priv->presets[i].val, priv->size); + } + return true; + } + } + + PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:", + PL_STR_FMT(str), opt->key); + for (int i = 0; priv->presets[i].name; i++) + PL_ERR(p, " %s", priv->presets[i].name); + return false; +} + +static void print_named(opt_ctx p, pl_str *out, const void *ptr) +{ + const struct named *value = *(const struct named **) ptr; + if (value) { + pl_str_append(p->alloc, out, pl_str0(value->name)); + } else { + pl_str_append(p->alloc, out, pl_str0("none")); + } +} + +static bool parse_named(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + const struct named **res = out; + if (pl_str_equals0(str, "none")) { + *res = NULL; + return true; + } + + for (int i = 0; priv->names[i]; i++) { + if (pl_str_equals0(str, priv->names[i]->name)) { + *res = priv->names[i]; + return true; + } + } + + PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:", + PL_STR_FMT(str), opt->key); + PL_ERR(p, " none"); + for (int i = 0; priv->names[i]; i++) + PL_ERR(p, " %s", priv->names[i]->name); + return false; +} + +static void print_scaler(opt_ctx p, pl_str *out, const void *ptr) +{ + const struct pl_filter_config *f = *(const struct pl_filter_config **) ptr; + if (f) { + pl_assert(f->name); // this is either a built-in scaler or ptr to custom + pl_str_append(p->alloc, out, pl_str0(f->name)); + } else { + pl_str_append(p->alloc, out, pl_str0("none")); + } +} + +static enum pl_filter_usage scaler_usage(pl_opt opt) +{ + opt_priv priv = opt->priv; + switch (priv->offset) { + case offsetof(struct pl_options_t, params.upscaler): + case offsetof(struct pl_options_t, params.plane_upscaler): + case offsetof(struct pl_options_t, upscaler): + case offsetof(struct pl_options_t, plane_upscaler): + return PL_FILTER_UPSCALING; + + case offsetof(struct pl_options_t, params.downscaler): + case offsetof(struct pl_options_t, params.plane_downscaler): + case offsetof(struct pl_options_t, downscaler): + case offsetof(struct pl_options_t, plane_downscaler): + return PL_FILTER_DOWNSCALING; + + case offsetof(struct pl_options_t, params.frame_mixer): + case offsetof(struct pl_options_t, frame_mixer): + return PL_FILTER_FRAME_MIXING; + } + + pl_unreachable(); +} + +static bool parse_scaler(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + opt_priv priv = opt->priv; + const struct pl_filter_config **res = out; + if (pl_str_equals0(str, "none")) { + *res = NULL; + return true; + } else if (pl_str_equals0(str, "custom")) { + *res = (void *) ((uintptr_t) p->opts + priv->offset_params); + return true; + } + + const enum pl_filter_usage usage = scaler_usage(opt); + for (int i = 0; i < pl_num_filter_configs; i++) { + if (!(pl_filter_configs[i]->allowed & usage)) + continue; + if (pl_str_equals0(str, pl_filter_configs[i]->name)) { + *res = pl_filter_configs[i]; + return true; + } + } + + PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:", + PL_STR_FMT(str), 
opt->key); + PL_ERR(p, " none"); + PL_ERR(p, " custom"); + for (int i = 0; i < pl_num_filter_configs; i++) { + if (pl_filter_configs[i]->allowed & usage) + PL_ERR(p, " %s", pl_filter_configs[i]->name); + } + return false; +} + +static bool parse_scaler_preset(opt_ctx p, pl_str str, void *out) +{ + pl_opt opt = p->opt; + struct pl_filter_config *res = out; + if (pl_str_equals0(str, "none")) { + *res = (struct pl_filter_config) { .name = "custom" }; + return true; + } + + const enum pl_filter_usage usage = scaler_usage(opt); + for (int i = 0; i < pl_num_filter_configs; i++) { + if (!(pl_filter_configs[i]->allowed & usage)) + continue; + if (pl_str_equals0(str, pl_filter_configs[i]->name)) { + copy_filter(res, pl_filter_configs[i]); + return true; + } + } + + PL_ERR(p, "Value of '%.*s' unrecognized for option '%s', valid values:", + PL_STR_FMT(str), opt->key); + PL_ERR(p, " none"); + for (int i = 0; i < pl_num_filter_configs; i++) { + if (pl_filter_configs[i]->allowed & usage) + PL_ERR(p, " %s", pl_filter_configs[i]->name); + } + return false; +} + +#define OPT_BOOL(KEY, NAME, FIELD, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_BOOL, \ + .priv = &(struct opt_priv_t) { \ + .print = print_bool, \ + .parse = parse_bool, \ + .offset = offsetof(struct pl_options_t, FIELD), \ + .size = sizeof(struct { \ + bool dummy; \ + pl_static_assert(sizeof(defaults.FIELD) == sizeof(bool)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_INT(KEY, NAME, FIELD, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_INT, \ + .priv = &(struct opt_priv_t) { \ + .print = print_int, \ + .parse = parse_int, \ + .offset = offsetof(struct pl_options_t, FIELD), \ + .size = sizeof(struct { \ + int dummy; \ + pl_static_assert(sizeof(defaults.FIELD) == sizeof(int)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_FLOAT(KEY, NAME, FIELD, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_FLOAT, \ + .priv = &(struct opt_priv_t) { \ + .print = print_float, \ + .parse = parse_float, \ + .offset = offsetof(struct pl_options_t, FIELD), \ + .size = sizeof(struct { \ + float dummy; \ + pl_static_assert(sizeof(defaults.FIELD) == sizeof(float)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_ENABLE_PARAMS(KEY, NAME, PARAMS, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_BOOL, \ + .priv = &(struct opt_priv_t) { \ + .compare = compare_params, \ + .print = print_params, \ + .parse = parse_params, \ + .offset = offsetof(struct pl_options_t, params.PARAMS), \ + .offset_params = offsetof(struct pl_options_t, PARAMS), \ + .size = sizeof(struct { \ + void *dummy; \ + pl_static_assert(sizeof(defaults.params.PARAMS) == sizeof(void*));\ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_ENUM(KEY, NAME, FIELD, VALUES, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_STRING, \ + .priv = &(struct opt_priv_t) { \ + .print = print_enum, \ + .parse = parse_enum, \ + .offset = offsetof(struct pl_options_t, FIELD), \ + .size = sizeof(struct { \ + unsigned dummy; \ + pl_static_assert(sizeof(defaults.FIELD) == sizeof(unsigned)); \ + }), \ + .values = (struct enum_val[]) { VALUES } \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_PRESET(KEY, NAME, PARAMS, PRESETS, ...) 
\ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_STRING, \ + .preset = true, \ + .priv = &(struct opt_priv_t) { \ + .parse = parse_preset, \ + .offset = offsetof(struct pl_options_t, PARAMS), \ + .size = sizeof(defaults.PARAMS), \ + .presets = (struct preset[]) { PRESETS }, \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_NAMED(KEY, NAME, FIELD, NAMES, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_STRING, \ + .priv = &(struct opt_priv_t) { \ + .print = print_named, \ + .parse = parse_named, \ + .offset = offsetof(struct pl_options_t, FIELD), \ + .names = (const struct named * const * ) NAMES, \ + .size = sizeof(struct { \ + const struct named *dummy; \ + pl_static_assert(offsetof(__typeof__(*NAMES[0]), name) == 0); \ + pl_static_assert(sizeof(defaults.FIELD) == \ + sizeof(const struct named *)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_SCALER(KEY, NAME, SCALER, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_STRING, \ + .priv = &(struct opt_priv_t) { \ + .print = print_scaler, \ + .parse = parse_scaler, \ + .offset = offsetof(struct pl_options_t, params.SCALER), \ + .offset_params = offsetof(struct pl_options_t, SCALER), \ + .size = sizeof(struct { \ + const struct pl_filter_config *dummy; \ + pl_static_assert(sizeof(defaults.SCALER) == \ + sizeof(struct pl_filter_config)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define OPT_SCALER_PRESET(KEY, NAME, SCALER, ...) \ + { \ + .key = KEY, \ + .name = NAME, \ + .type = PL_OPT_STRING, \ + .preset = true, \ + .priv = &(struct opt_priv_t) { \ + .parse = parse_scaler_preset, \ + .offset = offsetof(struct pl_options_t, SCALER), \ + .size = sizeof(struct { \ + struct pl_filter_config dummy; \ + pl_static_assert(sizeof(defaults.SCALER) == \ + sizeof(struct pl_filter_config)); \ + }), \ + }, \ + __VA_ARGS__ \ + } + +#define LIST(...) 
__VA_ARGS__, {0} + +#define SCALE_OPTS(PREFIX, NAME, FIELD) \ + OPT_SCALER(PREFIX, NAME, FIELD), \ + OPT_SCALER_PRESET(PREFIX"_preset", NAME "preset", FIELD), \ + OPT_NAMED(PREFIX"_kernel", NAME" kernel", FIELD.kernel, pl_filter_functions), \ + OPT_NAMED(PREFIX"_window", NAME" window", FIELD.window, pl_filter_functions), \ + OPT_FLOAT(PREFIX"_radius", NAME" radius", FIELD.radius, .min = 0.0, .max = 16.0), \ + OPT_FLOAT(PREFIX"_clamp", NAME" clamping", FIELD.clamp, .max = 1.0), \ + OPT_FLOAT(PREFIX"_blur", NAME" blur factor", FIELD.blur, .max = 100.0), \ + OPT_FLOAT(PREFIX"_taper", NAME" taper factor", FIELD.taper, .max = 1.0), \ + OPT_FLOAT(PREFIX"_antiring", NAME" antiringing", FIELD.antiring, .max = 1.0), \ + OPT_FLOAT(PREFIX"_param1", NAME" parameter 1", FIELD.params[0]), \ + OPT_FLOAT(PREFIX"_param2", NAME" parameter 2", FIELD.params[1]), \ + OPT_FLOAT(PREFIX"_wparam1", NAME" window parameter 1", FIELD.wparams[0]), \ + OPT_FLOAT(PREFIX"_wparam2", NAME" window parameter 2", FIELD.wparams[1]), \ + OPT_BOOL(PREFIX"_polar", NAME" polar", FIELD.polar) + +const struct pl_opt_t pl_option_list[] = { + OPT_PRESET("preset", "Global preset", params, LIST( + {"default", &pl_render_default_params}, + {"fast", &pl_render_fast_params}, + {"high_quality", &pl_render_high_quality_params})), + + // Scalers + SCALE_OPTS("upscaler", "Upscaler", upscaler), + SCALE_OPTS("downscaler", "Downscaler", downscaler), + SCALE_OPTS("plane_upscaler", "Plane upscaler", plane_upscaler), + SCALE_OPTS("plane_downscaler", "Plane downscaler", plane_downscaler), + SCALE_OPTS("frame_mixer", "Frame mixer", frame_mixer), + OPT_FLOAT("antiringing_strength", "Anti-ringing strength", params.antiringing_strength, .max = 1.0), + + // Debanding + OPT_ENABLE_PARAMS("deband", "Enable debanding", deband_params), + OPT_PRESET("deband_preset", "Debanding preset", deband_params, LIST( + {"default", &pl_deband_default_params})), + OPT_INT("deband_iterations", "Debanding iterations", deband_params.iterations, .max = 16), + OPT_FLOAT("deband_threshold", "Debanding threshold", deband_params.threshold, .max = 1000.0), + OPT_FLOAT("deband_radius", "Debanding radius", deband_params.radius, .max = 1000.0), + OPT_FLOAT("deband_grain", "Debanding grain", deband_params.grain, .max = 1000.0), + OPT_FLOAT("deband_grain_neutral_r", "Debanding grain neutral R", deband_params.grain_neutral[0]), + OPT_FLOAT("deband_grain_neutral_g", "Debanding grain neutral G", deband_params.grain_neutral[1]), + OPT_FLOAT("deband_grain_neutral_b", "Debanding grain neutral B", deband_params.grain_neutral[2]), + + // Sigmodization + OPT_ENABLE_PARAMS("sigmoid", "Enable sigmoidization", sigmoid_params), + OPT_PRESET("sigmoid_preset", "Sigmoidization preset", sigmoid_params, LIST( + {"default", &pl_sigmoid_default_params})), + OPT_FLOAT("sigmoid_center", "Sigmoidization center", sigmoid_params.center, .max = 1.0), + OPT_FLOAT("sigmoid_slope", "Sigmoidization slope", sigmoid_params.slope, .min = 1.0, .max = 20.0), + + // Color adjustment + OPT_ENABLE_PARAMS("color_adjustment", "Enable color adjustment", color_adjustment), + OPT_PRESET("color_adjustment_preset", "Color adjustment preset", color_adjustment, LIST( + {"neutral", &pl_color_adjustment_neutral})), + OPT_FLOAT("brightness", "Brightness boost", color_adjustment.brightness, .min = -1.0, .max = 1.0), + OPT_FLOAT("contrast", "Contrast boost", color_adjustment.contrast, .max = 100.0), + OPT_FLOAT("saturation", "Saturation gain", color_adjustment.saturation, .max = 100.0), + OPT_FLOAT("hue", "Hue shift", 
color_adjustment.hue), + OPT_FLOAT("gamma", "Gamma adjustment", color_adjustment.gamma, .max = 100.0), + OPT_FLOAT("temperature", "Color temperature shift", color_adjustment.temperature, + .min = (2500 - 6500) / 3500.0, // see `pl_white_from_temp` + .max = (25000 - 6500) / 3500.0), + + // Peak detection + OPT_ENABLE_PARAMS("peak_detect", "Enable peak detection", peak_detect_params), + OPT_PRESET("peak_detect_preset", "Peak detection preset", peak_detect_params, LIST( + {"default", &pl_peak_detect_default_params}, + {"high_quality", &pl_peak_detect_high_quality_params})), + OPT_FLOAT("peak_smoothing_period", "Peak detection smoothing coefficient", peak_detect_params.smoothing_period, .max = 1000.0), + OPT_FLOAT("scene_threshold_low", "Scene change threshold low", peak_detect_params.scene_threshold_low, .max = 100.0), + OPT_FLOAT("scene_threshold_high", "Scene change threshold high", peak_detect_params.scene_threshold_high, .max = 100.0), + OPT_FLOAT("minimum_peak", "Minimum detected peak", peak_detect_params.minimum_peak, .max = 100.0, .deprecated = true), + OPT_FLOAT("peak_percentile", "Peak detection percentile", peak_detect_params.percentile, .max = 100.0), + OPT_BOOL("allow_delayed_peak", "Allow delayed peak detection", peak_detect_params.allow_delayed), + + // Color mapping + OPT_ENABLE_PARAMS("color_map", "Enable color mapping", color_map_params), + OPT_PRESET("color_map_preset", "Color mapping preset", color_map_params, LIST( + {"default", &pl_color_map_default_params}, + {"high_quality", &pl_color_map_high_quality_params})), + OPT_NAMED("gamut_mapping", "Gamut mapping function", color_map_params.gamut_mapping, + pl_gamut_map_functions), + OPT_FLOAT("perceptual_deadzone", "Gamut mapping perceptual deadzone", color_map_params.gamut_constants.perceptual_deadzone, .max = 1.0f), + OPT_FLOAT("perceptual_strength", "Gamut mapping perceptual strength", color_map_params.gamut_constants.perceptual_strength, .max = 1.0f), + OPT_FLOAT("colorimetric_gamma", "Gamut mapping colorimetric gamma", color_map_params.gamut_constants.colorimetric_gamma, .max = 10.0f), + OPT_FLOAT("softclip_knee", "Gamut mapping softclip knee point", color_map_params.gamut_constants.softclip_knee, .max = 1.0f), + OPT_FLOAT("softclip_desat", "Gamut mapping softclip desaturation strength", color_map_params.gamut_constants.softclip_desat, .max = 1.0f), + OPT_INT("lut3d_size_I", "Gamut 3DLUT size I", color_map_params.lut3d_size[0], .max = 1024), + OPT_INT("lut3d_size_C", "Gamut 3DLUT size C", color_map_params.lut3d_size[1], .max = 1024), + OPT_INT("lut3d_size_h", "Gamut 3DLUT size h", color_map_params.lut3d_size[2], .max = 1024), + OPT_BOOL("lut3d_tricubic", "Gamut 3DLUT tricubic interpolation", color_map_params.lut3d_tricubic), + OPT_BOOL("gamut_expansion", "Gamut expansion", color_map_params.gamut_expansion), + OPT_NAMED("tone_mapping", "Tone mapping function", color_map_params.tone_mapping_function, + pl_tone_map_functions), + OPT_FLOAT("knee_adaptation", "Tone mapping knee point adaptation", color_map_params.tone_constants.knee_adaptation, .max = 1.0f), + OPT_FLOAT("knee_minimum", "Tone mapping knee point minimum", color_map_params.tone_constants.knee_minimum, .max = 0.5f), + OPT_FLOAT("knee_maximum", "Tone mapping knee point maximum", color_map_params.tone_constants.knee_maximum, .min = 0.5f, .max = 1.0f), + OPT_FLOAT("knee_default", "Tone mapping knee point default", color_map_params.tone_constants.knee_default, .max = 1.0f), + OPT_FLOAT("knee_offset", "BT.2390 knee point offset", 
color_map_params.tone_constants.knee_offset, .min = 0.5f, .max = 2.0f), + OPT_FLOAT("slope_tuning", "Spline slope tuning strength", color_map_params.tone_constants.slope_tuning, .max = 10.0f), + OPT_FLOAT("slope_offset", "Spline slope tuning offset", color_map_params.tone_constants.slope_offset, .max = 1.0f), + OPT_FLOAT("spline_contrast", "Spline slope contrast", color_map_params.tone_constants.spline_contrast, .max = 1.5f), + OPT_FLOAT("reinhard_contrast", "Reinhard contrast", color_map_params.tone_constants.reinhard_contrast, .max = 1.0f), + OPT_FLOAT("linear_knee", "Tone mapping linear knee point", color_map_params.tone_constants.linear_knee, .max = 1.0f), + OPT_FLOAT("exposure", "Tone mapping linear exposure", color_map_params.tone_constants.exposure, .max = 10.0f), + OPT_BOOL("inverse_tone_mapping", "Inverse tone mapping", color_map_params.inverse_tone_mapping), + OPT_ENUM("tone_map_metadata", "Source of HDR metadata to use", color_map_params.metadata, LIST( + {"any", PL_HDR_METADATA_ANY}, + {"none", PL_HDR_METADATA_NONE}, + {"hdr10", PL_HDR_METADATA_HDR10}, + {"hdr10plus", PL_HDR_METADATA_HDR10PLUS}, + {"cie_y", PL_HDR_METADATA_CIE_Y})), + OPT_INT("tone_lut_size", "Tone mapping LUT size", color_map_params.lut_size, .max = 4096), + OPT_FLOAT("contrast_recovery", "HDR contrast recovery strength", color_map_params.contrast_recovery, .max = 2.0), + OPT_FLOAT("contrast_smoothness", "HDR contrast recovery smoothness", color_map_params.contrast_smoothness, .min = 1.0, .max = 32.0), + OPT_BOOL("force_tone_mapping_lut", "Force tone mapping LUT", color_map_params.force_tone_mapping_lut), + OPT_BOOL("visualize_lut", "Visualize tone mapping LUTs", color_map_params.visualize_lut), + OPT_FLOAT("visualize_lut_x0", "Visualization rect x0", color_map_params.visualize_rect.x0), + OPT_FLOAT("visualize_lut_y0", "Visualization rect y0", color_map_params.visualize_rect.y0), + OPT_FLOAT("visualize_lut_x1", "Visualization rect x0", color_map_params.visualize_rect.x1), + OPT_FLOAT("visualize_lut_y1", "Visualization rect y0", color_map_params.visualize_rect.y1), + OPT_FLOAT("visualize_hue", "Visualization hue slice", color_map_params.visualize_hue), + OPT_FLOAT("visualize_theta", "Visualization rotation", color_map_params.visualize_theta), + OPT_BOOL("show_clipping", "Highlight clipped pixels", color_map_params.show_clipping), + OPT_FLOAT("tone_mapping_param", "Tone mapping function parameter", color_map_params.tone_mapping_param, .deprecated = true), + + // Dithering + OPT_ENABLE_PARAMS("dither", "Enable dithering", dither_params), + OPT_PRESET("dither_preset", "Dithering preset", dither_params, LIST( + {"default", &pl_dither_default_params})), + OPT_ENUM("dither_method", "Dither method", dither_params.method, LIST( + {"blue", PL_DITHER_BLUE_NOISE}, + {"ordered_lut", PL_DITHER_ORDERED_LUT}, + {"ordered", PL_DITHER_ORDERED_FIXED}, + {"white", PL_DITHER_WHITE_NOISE})), + OPT_INT("dither_lut_size", "Dither LUT size", dither_params.lut_size, .min = 1, .max = 8), + OPT_BOOL("dither_temporal", "Temporal dithering", dither_params.temporal), + + // ICC + OPT_ENABLE_PARAMS("icc", "Enable ICC settings", icc_params, .deprecated = true), + OPT_PRESET("icc_preset", "ICC preset", icc_params, LIST( + {"default", &pl_icc_default_params}), .deprecated = true), + OPT_ENUM("icc_intent", "ICC rendering intent", icc_params.intent, LIST( + {"auto", PL_INTENT_AUTO}, + {"perceptual", PL_INTENT_PERCEPTUAL}, + {"relative", PL_INTENT_RELATIVE_COLORIMETRIC}, + {"saturation", PL_INTENT_SATURATION}, + {"absolute", 
PL_INTENT_ABSOLUTE_COLORIMETRIC}), .deprecated = true), + OPT_INT("icc_size_r", "ICC 3DLUT size R", icc_params.size_r, .max = 256, .deprecated = true), + OPT_INT("icc_size_g", "ICC 3DLUT size G", icc_params.size_g, .max = 256, .deprecated = true), + OPT_INT("icc_size_b", "ICC 3DLUT size G", icc_params.size_b, .max = 256, .deprecated = true), + OPT_FLOAT("icc_max_luma", "ICC profile luma override", icc_params.max_luma, .max = 10000, .deprecated = true), + OPT_BOOL("icc_force_bpc", "Force ICC black point compensation", icc_params.force_bpc, .deprecated = true), + + // Cone distortion + OPT_ENABLE_PARAMS("cone", "Enable cone distortion", cone_params), + OPT_PRESET("cone_preset", "Cone distortion preset", cone_params, LIST( + {"normal", &pl_vision_normal}, + {"protanomaly", &pl_vision_protanomaly}, + {"protanopia", &pl_vision_protanopia}, + {"deuteranomaly", &pl_vision_deuteranomaly}, + {"deuteranopia", &pl_vision_deuteranopia}, + {"tritanomaly", &pl_vision_tritanomaly}, + {"tritanopia", &pl_vision_tritanopia}, + {"monochromacy", &pl_vision_monochromacy}, + {"achromatopsia", &pl_vision_achromatopsia})), + OPT_ENUM("cones", "Cone selection", cone_params.cones, LIST( + {"none", PL_CONE_NONE}, + {"l", PL_CONE_L}, + {"m", PL_CONE_M}, + {"s", PL_CONE_S}, + {"lm", PL_CONE_LM}, + {"ms", PL_CONE_MS}, + {"ls", PL_CONE_LS}, + {"lms", PL_CONE_LMS})), + OPT_FLOAT("cone_strength", "Cone distortion gain", cone_params.strength), + + // Blending +#define BLEND_VALUES LIST( \ + {"zero", PL_BLEND_ZERO}, \ + {"one", PL_BLEND_ONE}, \ + {"alpha", PL_BLEND_SRC_ALPHA}, \ + {"one_minus_alpha", PL_BLEND_ONE_MINUS_SRC_ALPHA}) + + OPT_ENABLE_PARAMS("blend", "Enable output blending", blend_params), + OPT_PRESET("blend_preset", "Output blending preset", blend_params, LIST( + {"alpha_overlay", &pl_alpha_overlay})), + OPT_ENUM("blend_src_rgb", "Source RGB blend mode", blend_params.src_rgb, BLEND_VALUES), + OPT_ENUM("blend_src_alpha", "Source alpha blend mode", blend_params.src_alpha, BLEND_VALUES), + OPT_ENUM("blend_dst_rgb", "Target RGB blend mode", blend_params.dst_rgb, BLEND_VALUES), + OPT_ENUM("blend_dst_alpha", "Target alpha blend mode", blend_params.dst_alpha, BLEND_VALUES), + + // Deinterlacing + OPT_ENABLE_PARAMS("deinterlace", "Enable deinterlacing", deinterlace_params), + OPT_PRESET("deinterlace_preset", "Deinterlacing preset", deinterlace_params, LIST( + {"default", &pl_deinterlace_default_params})), + OPT_ENUM("deinterlace_algo", "Deinterlacing algorithm", deinterlace_params.algo, LIST( + {"weave", PL_DEINTERLACE_WEAVE}, + {"bob", PL_DEINTERLACE_BOB}, + {"yadif", PL_DEINTERLACE_YADIF})), + OPT_BOOL("deinterlace_skip_spatial", "Skip spatial interlacing check", deinterlace_params.skip_spatial_check), + + // Distortion + OPT_ENABLE_PARAMS("distort", "Enable distortion", distort_params), + OPT_PRESET("distort_preset", "Distortion preset", distort_params, LIST( + {"default", &pl_distort_default_params})), + OPT_FLOAT("distort_scale_x", "Distortion X scale", distort_params.transform.mat.m[0][0]), + OPT_FLOAT("distort_scale_y", "Distortion Y scale", distort_params.transform.mat.m[1][1]), + OPT_FLOAT("distort_shear_x", "Distortion X shear", distort_params.transform.mat.m[0][1]), + OPT_FLOAT("distort_shear_y", "Distortion Y shear", distort_params.transform.mat.m[1][0]), + OPT_FLOAT("distort_offset_x", "Distortion X offset", distort_params.transform.c[0]), + OPT_FLOAT("distort_offset_y", "Distortion Y offset", distort_params.transform.c[1]), + OPT_BOOL("distort_unscaled", "Distortion unscaled", distort_params.unscaled), + 
OPT_BOOL("distort_constrain", "Constrain distortion", distort_params.constrain), + OPT_BOOL("distort_bicubic", "Distortion bicubic interpolation", distort_params.bicubic), + OPT_ENUM("distort_address_mode", "Distortion texture address mode", distort_params.address_mode, LIST( + {"clamp", PL_TEX_ADDRESS_CLAMP}, + {"repeat", PL_TEX_ADDRESS_REPEAT}, + {"mirror", PL_TEX_ADDRESS_MIRROR})), + OPT_ENUM("distort_alpha_mode", "Distortion alpha blending mode", distort_params.alpha_mode, LIST( + {"none", PL_ALPHA_UNKNOWN}, + {"independent", PL_ALPHA_INDEPENDENT}, + {"premultiplied", PL_ALPHA_PREMULTIPLIED})), + + // Misc renderer settings + OPT_NAMED("error_diffusion", "Error diffusion kernel", params.error_diffusion, + pl_error_diffusion_kernels), + OPT_ENUM("lut_type", "Color mapping LUT type", params.lut_type, LIST( + {"unknown", PL_LUT_UNKNOWN}, + {"native", PL_LUT_NATIVE}, + {"normalized", PL_LUT_NORMALIZED}, + {"conversion", PL_LUT_CONVERSION})), + OPT_FLOAT("background_r", "Background color R", params.background_color[0], .max = 1.0), + OPT_FLOAT("background_g", "Background color G", params.background_color[1], .max = 1.0), + OPT_FLOAT("background_b", "Background color B", params.background_color[2], .max = 1.0), + OPT_FLOAT("background_transparency", "Background color transparency", params.background_transparency, .max = 1), + OPT_BOOL("skip_target_clearing", "Skip target clearing", params.skip_target_clearing), + OPT_FLOAT("corner_rounding", "Corner rounding", params.corner_rounding, .max = 1.0), + OPT_BOOL("blend_against_tiles", "Blend against tiles", params.blend_against_tiles), + OPT_FLOAT("tile_color_hi_r", "Bright tile R", params.tile_colors[0][0], .max = 1.0), + OPT_FLOAT("tile_color_hi_g", "Bright tile G", params.tile_colors[0][1], .max = 1.0), + OPT_FLOAT("tile_color_hi_b", "Bright tile B", params.tile_colors[0][2], .max = 1.0), + OPT_FLOAT("tile_color_lo_r", "Dark tile R", params.tile_colors[1][0], .max = 1.0), + OPT_FLOAT("tile_color_lo_g", "Dark tile G", params.tile_colors[1][1], .max = 1.0), + OPT_FLOAT("tile_color_lo_b", "Dark tile B", params.tile_colors[1][2], .max = 1.0), + OPT_INT("tile_size", "Tile size", params.tile_size, .min = 2, .max = 256), + + // Performance / quality trade-offs and debugging options + OPT_BOOL("skip_anti_aliasing", "Skip anti-aliasing", params.skip_anti_aliasing), + OPT_INT("lut_entries", "Scaler LUT entries", params.lut_entries, .max = 256, .deprecated = true), + OPT_FLOAT("polar_cutoff", "Polar LUT cutoff", params.polar_cutoff, .max = 1.0, .deprecated = true), + OPT_BOOL("preserve_mixing_cache", "Preserve mixing cache", params.preserve_mixing_cache), + OPT_BOOL("skip_caching_single_frame", "Skip caching single frame", params.skip_caching_single_frame), + OPT_BOOL("disable_linear_scaling", "Disable linear scaling", params.disable_linear_scaling), + OPT_BOOL("disable_builtin_scalers", "Disable built-in scalers", params.disable_builtin_scalers), + OPT_BOOL("correct_subpixel_offset", "Correct subpixel offsets", params.correct_subpixel_offsets), + OPT_BOOL("ignore_icc_profiles", "Ignore ICC profiles", params.ignore_icc_profiles, .deprecated = true), + OPT_BOOL("force_dither", "Force-enable dithering", params.force_dither), + OPT_BOOL("disable_dither_gamma_correction", "Disable gamma-correct dithering", params.disable_dither_gamma_correction), + OPT_BOOL("disable_fbos", "Disable FBOs", params.disable_fbos), + OPT_BOOL("force_low_bit_depth_fbos", "Force 8-bit FBOs", params.force_low_bit_depth_fbos), + OPT_BOOL("dynamic_constants", "Dynamic constants", 
params.dynamic_constants), + {0}, +}; + +const int pl_option_count = PL_ARRAY_SIZE(pl_option_list) - 1; + +pl_opt pl_find_option(const char *key) +{ + for (int i = 0; i < pl_option_count; i++) { + if (!strcmp(key, pl_option_list[i].key)) + return &pl_option_list[i]; + } + + return NULL; +} diff --git a/src/os.h b/src/os.h new file mode 100644 index 0000000..386f0cb --- /dev/null +++ b/src/os.h @@ -0,0 +1,30 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#ifdef __unix__ +#define PL_HAVE_UNIX +#endif + +#ifdef _WIN32 +#define PL_HAVE_WIN32 +#endif + +#ifdef __APPLE__ +#define PL_HAVE_APPLE +#endif diff --git a/src/pl_alloc.c b/src/pl_alloc.c new file mode 100644 index 0000000..64eeda7 --- /dev/null +++ b/src/pl_alloc.c @@ -0,0 +1,313 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" + +struct header { +#ifndef NDEBUG +#define MAGIC 0x20210119LU + uint32_t magic; +#endif + size_t size; + struct header *parent; + struct ext *ext; + + // Pointer to actual data, for alignment purposes + max_align_t data[]; +}; + +// Lazily allocated, to save space for leaf allocations and allocations which +// don't need fancy requirements +struct ext { + size_t num_children; + size_t children_size; // total allocated size of `children` + struct header *children[]; +}; + +#define PTR_OFFSET offsetof(struct header, data) +#define MAX_ALLOC (SIZE_MAX - PTR_OFFSET) +#define MINIMUM_CHILDREN 4 + +static inline struct header *get_header(void *ptr) +{ + if (!ptr) + return NULL; + + struct header *hdr = (struct header *) ((uintptr_t) ptr - PTR_OFFSET); +#ifndef NDEBUG + assert(hdr->magic == MAGIC); +#endif + + return hdr; +} + +static inline void *oom(void) +{ + fprintf(stderr, "out of memory\n"); + abort(); +} + +static inline struct ext *alloc_ext(struct header *h) +{ + if (!h) + return NULL; + + if (!h->ext) { + h->ext = malloc(sizeof(struct ext) + MINIMUM_CHILDREN * sizeof(void *)); + if (!h->ext) + oom(); + h->ext->num_children = 0; + h->ext->children_size = MINIMUM_CHILDREN; + } + + return h->ext; +} + +static inline void attach_child(struct header *parent, struct header *child) +{ + child->parent = parent; + if (!parent) + return; + + + struct ext *ext = alloc_ext(parent); + if (ext->num_children == ext->children_size) { + size_t new_size = ext->children_size * 2; + ext = realloc(ext, sizeof(struct ext) + new_size * sizeof(void *)); + if (!ext) + oom(); + ext->children_size = new_size; + parent->ext = ext; + } + + ext->children[ext->num_children++] = child; +} + +static inline void unlink_child(struct header *parent, struct header *child) +{ + child->parent = NULL; + if (!parent) + return; + + struct ext *ext = parent->ext; + for (size_t i = 0; i < ext->num_children; i++) { + if (ext->children[i] == child) { + memmove(&ext->children[i], &ext->children[i + 1], + (--ext->num_children - i) * sizeof(ext->children[0])); + return; + } + } + + assert(!"unlinking orphaned child?"); +} + +void *pl_alloc(void *parent, size_t size) +{ + if (size >= MAX_ALLOC) + return oom(); + + struct header *h = malloc(PTR_OFFSET + size); + if (!h) + return oom(); + +#ifndef NDEBUG + h->magic = MAGIC; +#endif + h->size = size; + h->ext = NULL; + + attach_child(get_header(parent), h); + return h->data; +} + +void *pl_zalloc(void *parent, size_t size) +{ + if (size >= MAX_ALLOC) + return oom(); + + struct header *h = calloc(1, PTR_OFFSET + size); + if (!h) + return oom(); + +#ifndef NDEBUG + h->magic = MAGIC; +#endif + h->size = size; + + attach_child(get_header(parent), h); + return h->data; +} + +void *pl_realloc(void *parent, void *ptr, size_t size) +{ + if (size >= MAX_ALLOC) + return oom(); + if (!ptr) + return pl_alloc(parent, size); + + struct header *h = get_header(ptr); + assert(get_header(parent) == h->parent); + if (h->size == size) + return ptr; + + struct header *old_h = h; + h = realloc(h, PTR_OFFSET + size); + if (!h) + return oom(); + + h->size = size; + + if (h != old_h) { + if (h->parent) { + struct ext *ext = h->parent->ext; + for (size_t i = 0; i < ext->num_children; i++) { + if (ext->children[i] == old_h) { + ext->children[i] = h; + goto done_reparenting; + } + } + assert(!"reallocating orphaned child?"); + } +done_reparenting: + + if (h->ext) { + for (size_t i = 0; i < h->ext->num_children; i++) + h->ext->children[i]->parent = h; + } + } + + return h->data; +} + 
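Editor's note (illustrative only, not part of the upstream commit): the allocator above attaches every allocation to an optional parent, so freeing the parent releases the whole subtree. A minimal usage sketch, assuming only the pl_alloc.h declarations added later in this diff; the context struct and function names are hypothetical:

// Hypothetical example type and constructor; `parent` may be NULL for a root.
struct example_ctx {
    char *name;
    int  *values;
};

static struct example_ctx *example_create(void *parent)
{
    struct example_ctx *ctx = pl_zalloc_ptr(parent, ctx);
    ctx->name   = pl_str0dup0(ctx, "example");     // child of ctx
    ctx->values = pl_calloc(ctx, 16, sizeof(int)); // also a child of ctx
    return ctx;
}

// pl_free(ctx) also releases ctx->name and ctx->values, since they were
// allocated with ctx as their parent; pl_steal(new_parent, ctx) would move
// the whole subtree onto another parent instead.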
+void pl_free(void *ptr) +{ + struct header *h = get_header(ptr); + if (!h) + return; + + pl_free_children(ptr); + unlink_child(h->parent, h); + + free(h->ext); + free(h); +} + +void pl_free_children(void *ptr) +{ + struct header *h = get_header(ptr); + if (!h || !h->ext) + return; + +#ifndef NDEBUG + // this detects recursive hierarchies + h->magic = 0; +#endif + + for (size_t i = 0; i < h->ext->num_children; i++) { + h->ext->children[i]->parent = NULL; // prevent recursive access + pl_free(h->ext->children[i]->data); + } + h->ext->num_children = 0; + +#ifndef NDEBUG + h->magic = MAGIC; +#endif +} + +size_t pl_get_size(const void *ptr) +{ + const struct header *h = get_header((void *) ptr); + return h ? h->size : 0; +} + +void *pl_steal(void *parent, void *ptr) +{ + struct header *h = get_header(ptr); + if (!h) + return NULL; + + struct header *new_par = get_header(parent); + if (new_par != h->parent) { + unlink_child(h->parent, h); + attach_child(new_par, h); + } + + return h->data; +} + +void *pl_memdup(void *parent, const void *ptr, size_t size) +{ + if (!size) + return NULL; + + void *new = pl_alloc(parent, size); + if (!new) + return oom(); + + assert(ptr); + memcpy(new, ptr, size); + return new; +} + +char *pl_str0dup0(void *parent, const char *str) +{ + if (!str) + return NULL; + + return pl_memdup(parent, str, strlen(str) + 1); +} + +char *pl_strndup0(void *parent, const char *str, size_t size) +{ + if (!str) + return NULL; + + size_t str_size = strnlen(str, size); + char *new = pl_alloc(parent, str_size + 1); + if (!new) + return oom(); + memcpy(new, str, str_size); + new[str_size] = '\0'; + return new; +} + +char *pl_asprintf(void *parent, const char *fmt, ...) +{ + char *str; + va_list ap; + va_start(ap, fmt); + str = pl_vasprintf(parent, fmt, ap); + va_end(ap); + return str; +} + +char *pl_vasprintf(void *parent, const char *fmt, va_list ap) +{ + // First, we need to determine the size that will be required for + // printing the entire string. Do this by making a copy of the va_list + // and printing it to a null buffer. + va_list copy; + va_copy(copy, ap); + int size = vsnprintf(NULL, 0, fmt, copy); + va_end(copy); + if (size < 0) + return NULL; + + char *str = pl_alloc(parent, size + 1); + vsnprintf(str, size + 1, fmt, ap); + return str; +} diff --git a/src/pl_alloc.h b/src/pl_alloc.h new file mode 100644 index 0000000..78df08a --- /dev/null +++ b/src/pl_alloc.h @@ -0,0 +1,191 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdalign.h> +#include <stdarg.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +// Unlike standard malloc, `size` may be 0, in which case this returns an empty +// allocation which can still be used as a parent for other allocations. 
+void *pl_alloc(void *parent, size_t size); +void *pl_zalloc(void *parent, size_t size); +void *pl_realloc(void *parent, void *ptr, size_t size); + +static inline void *pl_calloc(void *parent, size_t count, size_t size) +{ + return pl_zalloc(parent, count * size); +} + +#define pl_tmp(parent) pl_alloc(parent, 0) + +// Variants of the above which resolve to sizeof(*ptr) +#define pl_alloc_ptr(parent, ptr) \ + (__typeof__(ptr)) pl_alloc(parent, sizeof(*(ptr))) +#define pl_zalloc_ptr(parent, ptr) \ + (__typeof__(ptr)) pl_zalloc(parent, sizeof(*(ptr))) +#define pl_calloc_ptr(parent, num, ptr) \ + (__typeof__(ptr)) pl_calloc(parent, num, sizeof(*(ptr))) + +// Helper function to allocate a struct and immediately assign it +#define pl_alloc_struct(parent, type, ...) \ + (type *) pl_memdup(parent, &(type) __VA_ARGS__, sizeof(type)) + +// Free an allocation and its children (recursively) +void pl_free(void *ptr); +void pl_free_children(void *ptr); + +#define pl_free_ptr(ptr) \ + do { \ + pl_free(*(ptr)); \ + *(ptr) = NULL; \ + } while (0) + +// Get the current size of an allocation. +size_t pl_get_size(const void *ptr); + +#define pl_grow(parent, ptr, size) \ + do { \ + size_t _size = (size); \ + if (_size > pl_get_size(*(ptr))) \ + *(ptr) = pl_realloc(parent, *(ptr), _size); \ + } while (0) + +// Reparent an allocation onto a new parent +void *pl_steal(void *parent, void *ptr); + +// Wrapper functions around common string utilities +void *pl_memdup(void *parent, const void *ptr, size_t size); +char *pl_str0dup0(void *parent, const char *str); +char *pl_strndup0(void *parent, const char *str, size_t size); + +#define pl_memdup_ptr(parent, ptr) \ + (__typeof__(ptr)) pl_memdup(parent, ptr, sizeof(*(ptr))) + +// Helper functions for allocating public/private pairs, done by allocating +// `priv` at the address of `pub` + sizeof(pub), rounded up to the maximum +// alignment requirements. + +#define PL_ALIGN_MEM(size) PL_ALIGN2(size, alignof(max_align_t)) + +#define PL_PRIV(pub) \ + (void *) ((uintptr_t) (pub) + PL_ALIGN_MEM(sizeof(*(pub)))) + +#define pl_alloc_obj(parent, ptr, priv) \ + (__typeof__(ptr)) pl_alloc(parent, PL_ALIGN_MEM(sizeof(*(ptr))) + sizeof(priv)) + +#define pl_zalloc_obj(parent, ptr, priv) \ + (__typeof__(ptr)) pl_zalloc(parent, PL_ALIGN_MEM(sizeof(*(ptr))) + sizeof(priv)) + +// Helper functions for dealing with arrays + +#define PL_ARRAY(type) struct { type *elem; int num; } + +#define PL_ARRAY_REALLOC(parent, arr, len) \ + do { \ + size_t _new_size = (len) * sizeof((arr).elem[0]); \ + (arr).elem = pl_realloc((void *) parent, (arr).elem, _new_size); \ + } while (0) + +#define PL_ARRAY_RESIZE(parent, arr, len) \ + do { \ + size_t _avail = pl_get_size((arr).elem) / sizeof((arr).elem[0]); \ + size_t _min_len = (len); \ + if (_avail < _min_len) \ + PL_ARRAY_REALLOC(parent, arr, _min_len); \ + } while (0) + +#define PL_ARRAY_MEMDUP(parent, arr, ptr, len) \ + do { \ + size_t _len = (len); \ + PL_ARRAY_RESIZE(parent, arr, _len); \ + memcpy((arr).elem, ptr, _len * sizeof((arr).elem[0])); \ + (arr).num = _len; \ + } while (0) + +#define PL_ARRAY_GROW(parent, arr) \ + do { \ + size_t _avail = pl_get_size((arr).elem) / sizeof((arr).elem[0]); \ + if (_avail < 10) { \ + PL_ARRAY_REALLOC(parent, arr, 10); \ + } else if ((arr).num == _avail) { \ + PL_ARRAY_REALLOC(parent, arr, (arr).num * 1.5); \ + } else { \ + assert((arr).elem); \ + } \ + } while (0) + +#define PL_ARRAY_APPEND(parent, arr, ...) 
\ + do { \ + PL_ARRAY_GROW(parent, arr); \ + (arr).elem[(arr).num++] = __VA_ARGS__; \ + } while (0) + +#define PL_ARRAY_CONCAT(parent, to, from) \ + do { \ + if ((from).num) { \ + PL_ARRAY_RESIZE(parent, to, (to).num + (from).num); \ + memmove(&(to).elem[(to).num], (from).elem, \ + (from).num * sizeof((from).elem[0])); \ + (to).num += (from).num; \ + } \ + } while (0) + +#define PL_ARRAY_REMOVE_RANGE(arr, idx, count) \ + do { \ + ptrdiff_t _idx = (idx); \ + if (_idx < 0) \ + _idx += (arr).num; \ + size_t _count = (count); \ + assert(_idx >= 0 && _idx + _count <= (arr).num); \ + memmove(&(arr).elem[_idx], &(arr).elem[_idx + _count], \ + ((arr).num - _idx - _count) * sizeof((arr).elem[0])); \ + (arr).num -= _count; \ + } while (0) + +#define PL_ARRAY_REMOVE_AT(arr, idx) PL_ARRAY_REMOVE_RANGE(arr, idx, 1) + +#define PL_ARRAY_INSERT_AT(parent, arr, idx, ...) \ + do { \ + ptrdiff_t _idx = (idx); \ + if (_idx < 0) \ + _idx += (arr).num + 1; \ + assert(_idx >= 0 && _idx <= (arr).num); \ + PL_ARRAY_GROW(parent, arr); \ + memmove(&(arr).elem[_idx + 1], &(arr).elem[_idx], \ + ((arr).num++ - _idx) * sizeof((arr).elem[0])); \ + (arr).elem[_idx] = __VA_ARGS__; \ + } while (0) + +// Returns whether or not there was any element to pop +#define PL_ARRAY_POP(arr, out) \ + ((arr).num > 0 \ + ? (*(out) = (arr).elem[--(arr).num], true) \ + : false \ + ) + +// Wrapper for dealing with non-PL_ARRAY arrays +#define PL_ARRAY_APPEND_RAW(parent, arr, idxvar, ...) \ + do { \ + PL_ARRAY(__typeof__((arr)[0])) _arr = { (arr), (idxvar) }; \ + PL_ARRAY_APPEND(parent, _arr, __VA_ARGS__); \ + (arr) = _arr.elem; \ + (idxvar) = _arr.num; \ + } while (0) diff --git a/src/pl_assert.h b/src/pl_assert.h new file mode 100644 index 0000000..b4c6656 --- /dev/null +++ b/src/pl_assert.h @@ -0,0 +1,37 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdio.h> +#include <assert.h> + +#ifndef NDEBUG +# define pl_assert assert +#else +# define pl_assert(expr) \ + do { \ + if (!(expr)) { \ + fprintf(stderr, "Assertion failed: %s in %s:%d\n", \ + #expr, __FILE__, __LINE__); \ + abort(); \ + } \ + } while (0) +#endif + +// In C11, static asserts must have a string message +#define pl_static_assert(expr) static_assert(expr, #expr) diff --git a/src/pl_clock.h b/src/pl_clock.h new file mode 100644 index 0000000..541ef0b --- /dev/null +++ b/src/pl_clock.h @@ -0,0 +1,98 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <time.h> +#include <stdint.h> + +#include "os.h" + +#ifdef PL_HAVE_WIN32 +# include <windows.h> +# define PL_CLOCK_QPC +#elif defined(PL_HAVE_APPLE) +# include <Availability.h> +# if (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 101200) || \ + (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 100000) || \ + (defined(__TV_OS_VERSION_MIN_REQUIRED) && __TV_OS_VERSION_MIN_REQUIRED < 100000) || \ + (defined(__WATCH_OS_VERSION_MIN_REQUIRED) && __WATCH_OS_VERSION_MIN_REQUIRED < 30000) || \ + !defined(CLOCK_MONOTONIC_RAW) +# include <mach/mach_time.h> +# define PL_CLOCK_MACH +# else +# define PL_CLOCK_MONOTONIC_RAW +# endif +#elif defined(CLOCK_MONOTONIC_RAW) +# define PL_CLOCK_MONOTONIC_RAW +#elif defined(TIME_UTC) +# define PL_CLOCK_TIMESPEC_GET +#else +# warning "pl_clock not implemented for this platform!" +#endif + +typedef uint64_t pl_clock_t; + +static inline pl_clock_t pl_clock_now(void) +{ +#if defined(PL_CLOCK_QPC) + + LARGE_INTEGER counter; + QueryPerformanceCounter(&counter); + return counter.QuadPart; + +#elif defined(PL_CLOCK_MACH) + + return mach_absolute_time(); + +#else + + struct timespec tp = { .tv_sec = 0, .tv_nsec = 0 }; +#if defined(PL_CLOCK_MONOTONIC_RAW) + clock_gettime(CLOCK_MONOTONIC_RAW, &tp); +#elif defined(PL_CLOCK_TIMESPEC_GET) + timespec_get(&tp, TIME_UTC); +#endif + return tp.tv_sec * UINT64_C(1000000000) + tp.tv_nsec; + +#endif +} + +static inline double pl_clock_diff(pl_clock_t a, pl_clock_t b) +{ + double frequency = 1e9; + +#if defined(PL_CLOCK_QPC) + + LARGE_INTEGER freq; + QueryPerformanceFrequency(&freq); + frequency = freq.QuadPart; + +#elif defined(PL_CLOCK_MACH) + + mach_timebase_info_data_t time_base; + if (mach_timebase_info(&time_base) != KERN_SUCCESS) + return 0; + frequency = (time_base.denom * 1e9) / time_base.numer; + +#endif + + if (b > a) + return (b - a) / -frequency; + else + return (a - b) / frequency; +} diff --git a/src/pl_string.c b/src/pl_string.c new file mode 100644 index 0000000..ba25971 --- /dev/null +++ b/src/pl_string.c @@ -0,0 +1,418 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" +#include "hash.h" + +static void grow_str(void *alloc, pl_str *str, size_t len) +{ + // Like pl_grow, but with some extra headroom + if (len > pl_get_size(str->buf)) + str->buf = pl_realloc(alloc, str->buf, len * 1.5); +} + +void pl_str_append(void *alloc, pl_str *str, pl_str append) +{ + // Also append an extra \0 for convenience, since a lot of the time + // this function will be used to generate a string buffer + grow_str(alloc, str, str->len + append.len + 1); + if (append.len) { + memcpy(str->buf + str->len, append.buf, append.len); + str->len += append.len; + } + str->buf[str->len] = '\0'; +} + +void pl_str_append_raw(void *alloc, pl_str *str, const void *ptr, size_t size) +{ + if (!size) + return; + grow_str(alloc, str, str->len + size); + memcpy(str->buf + str->len, ptr, size); + str->len += size; +} + +void pl_str_append_asprintf(void *alloc, pl_str *str, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + pl_str_append_vasprintf(alloc, str, fmt, ap); + va_end(ap); +} + +void pl_str_append_vasprintf(void *alloc, pl_str *str, const char *fmt, va_list ap) +{ + // First, we need to determine the size that will be required for + // printing the entire string. Do this by making a copy of the va_list + // and printing it to a null buffer. + va_list copy; + va_copy(copy, ap); + int size = vsnprintf(NULL, 0, fmt, copy); + va_end(copy); + if (size < 0) + return; + + // Make room in `str` and format to there directly + grow_str(alloc, str, str->len + size + 1); + str->len += vsnprintf((char *) (str->buf + str->len), size + 1, fmt, ap); +} + +int pl_str_sscanf(pl_str str, const char *fmt, ...) +{ + char *tmp = pl_strdup0(NULL, str); + va_list va; + va_start(va, fmt); + int ret = vsscanf(tmp, fmt, va); + va_end(va); + pl_free(tmp); + return ret; +} + +int pl_strchr(pl_str str, int c) +{ + if (!str.len) + return -1; + + void *pos = memchr(str.buf, c, str.len); + if (pos) + return (intptr_t) pos - (intptr_t) str.buf; + return -1; +} + +size_t pl_strspn(pl_str str, const char *accept) +{ + for (size_t i = 0; i < str.len; i++) { + if (!strchr(accept, str.buf[i])) + return i; + } + + return str.len; +} + +size_t pl_strcspn(pl_str str, const char *reject) +{ + for (size_t i = 0; i < str.len; i++) { + if (strchr(reject, str.buf[i])) + return i; + } + + return str.len; +} + +static inline bool pl_isspace(char c) +{ + switch (c) { + case ' ': + case '\n': + case '\r': + case '\t': + case '\v': + case '\f': + return true; + default: + return false; + } +} + +pl_str pl_str_strip(pl_str str) +{ + while (str.len && pl_isspace(str.buf[0])) { + str.buf++; + str.len--; + } + while (str.len && pl_isspace(str.buf[str.len - 1])) + str.len--; + return str; +} + +int pl_str_find(pl_str haystack, pl_str needle) +{ + if (!needle.len) + return 0; + + for (size_t i = 0; i + needle.len <= haystack.len; i++) { + if (memcmp(&haystack.buf[i], needle.buf, needle.len) == 0) + return i; + } + + return -1; +} + +pl_str pl_str_split_char(pl_str str, char sep, pl_str *out_rest) +{ + int pos = pl_strchr(str, sep); + if (pos < 0) { + if (out_rest) + *out_rest = (pl_str) {0}; + return str; + } else { + if (out_rest) + *out_rest = pl_str_drop(str, pos + 1); + return pl_str_take(str, pos); + } +} + +pl_str pl_str_split_chars(pl_str str, const char *seps, pl_str *out_rest) +{ + int pos = pl_strcspn(str, seps); + if (pos < 0) { + if (out_rest) + *out_rest = (pl_str) {0}; + return str; + } else { + if (out_rest) + *out_rest = pl_str_drop(str, pos + 1); + return pl_str_take(str, pos); + } +} + 
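Editor's note (illustrative only, not part of the upstream commit): a short sketch of how the splitting helpers above are typically combined to walk `key=value` lines in a pl_str, assuming the pl_str helpers declared in pl_string.h below; the function name is hypothetical and <stdio.h> is assumed for printf:

static void example_parse_kv(pl_str text)
{
    while (text.len) {
        // Take everything up to the next '\n'; `text` is advanced past it.
        pl_str line = pl_str_split_char(text, '\n', &text);
        line = pl_str_strip(line);
        if (!line.len)
            continue;
        // If the line has no '=', `key` is the whole line and `value` is empty.
        pl_str value, key = pl_str_split_char(line, '=', &value);
        printf("key='%.*s' value='%.*s'\n", PL_STR_FMT(key), PL_STR_FMT(value));
    }
}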
+pl_str pl_str_split_str(pl_str str, pl_str sep, pl_str *out_rest) +{ + int pos = pl_str_find(str, sep); + if (pos < 0) { + if (out_rest) + *out_rest = (pl_str) {0}; + return str; + } else { + if (out_rest) + *out_rest = pl_str_drop(str, pos + sep.len); + return pl_str_take(str, pos); + } +} + +static bool get_hexdigit(pl_str *str, int *digit) +{ + while (str->len && pl_isspace(str->buf[0])) { + str->buf++; + str->len--; + } + + if (!str->len) { + *digit = -1; // EOF + return true; + } + + char c = str->buf[0]; + str->buf++; + str->len--; + + if (c >= '0' && c <= '9') { + *digit = c - '0'; + } else if (c >= 'a' && c <= 'f') { + *digit = c - 'a' + 10; + } else if (c >= 'A' && c <= 'F') { + *digit = c - 'A' + 10; + } else { + return false; // invalid char + } + + return true; +} + +bool pl_str_decode_hex(void *alloc, pl_str hex, pl_str *out) +{ + if (!out) + return false; + + uint8_t *buf = pl_alloc(alloc, hex.len / 2); + int len = 0; + + while (hex.len) { + int a, b; + if (!get_hexdigit(&hex, &a) || !get_hexdigit(&hex, &b)) + goto error; // invalid char + if (a < 0) // EOF + break; + if (b < 0) // only one digit + goto error; + + buf[len++] = (a << 4) | b; + } + + *out = (pl_str) { buf, len }; + return true; + +error: + pl_free(buf); + return false; +} + +struct pl_str_builder_t { + PL_ARRAY(pl_str_template) templates; + pl_str args; + pl_str output; +}; + +pl_str_builder pl_str_builder_alloc(void *alloc) +{ + pl_str_builder b = pl_zalloc_ptr(alloc, b); + return b; +} + +void pl_str_builder_free(pl_str_builder *b) +{ + if (*b) + pl_free_ptr(b); +} + +void pl_str_builder_reset(pl_str_builder b) +{ + *b = (struct pl_str_builder_t) { + .templates.elem = b->templates.elem, + .args.buf = b->args.buf, + .output.buf = b->output.buf, + }; +} + +uint64_t pl_str_builder_hash(const pl_str_builder b) +{ + size_t size = b->templates.num * sizeof(b->templates.elem[0]); + uint64_t hash = pl_mem_hash(b->templates.elem, size); + pl_hash_merge(&hash, pl_str_hash(b->args)); + return hash; +} + +pl_str pl_str_builder_exec(pl_str_builder b) +{ + pl_str args = b->args; + + b->output.len = 0; + for (int i = 0; i < b->templates.num; i++) { + size_t consumed = b->templates.elem[i](b, &b->output, args.buf); + pl_assert(consumed <= args.len); + args = pl_str_drop(args, consumed); + } + + // Terminate with an extra \0 byte for convenience + grow_str(b, &b->output, b->output.len + 1); + b->output.buf[b->output.len] = '\0'; + return b->output; +} + +void pl_str_builder_append(pl_str_builder b, pl_str_template tmpl, + const void *args, size_t size) +{ + PL_ARRAY_APPEND(b, b->templates, tmpl); + pl_str_append_raw(b, &b->args, args, size); +} + +void pl_str_builder_concat(pl_str_builder b, const pl_str_builder append) +{ + PL_ARRAY_CONCAT(b, b->templates, append->templates); + pl_str_append_raw(b, &b->args, append->args.buf, append->args.len); +} + +static size_t template_str_ptr(void *alloc, pl_str *buf, const uint8_t *args) +{ + const char *str; + memcpy(&str, args, sizeof(str)); + pl_str_append_raw(alloc, buf, str, strlen(str)); + return sizeof(str); +} + +void pl_str_builder_const_str(pl_str_builder b, const char *str) +{ + pl_str_builder_append(b, template_str_ptr, &str, sizeof(str)); +} + +static size_t template_str(void *alloc, pl_str *buf, const uint8_t *args) +{ + pl_str str; + memcpy(&str.len, args, sizeof(str.len)); + pl_str_append_raw(alloc, buf, args + sizeof(str.len), str.len); + return sizeof(str.len) + str.len; +} + +void pl_str_builder_str(pl_str_builder b, const pl_str str) +{ + pl_str_builder_append(b, 
template_str, &str.len, sizeof(str.len)); + pl_str_append_raw(b, &b->args, str.buf, str.len); +} + +void pl_str_builder_printf_c(pl_str_builder b, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + pl_str_builder_vprintf_c(b, fmt, ap); + va_end(ap); +} + +static size_t template_printf(void *alloc, pl_str *str, const uint8_t *args) +{ + const char *fmt; + memcpy(&fmt, args, sizeof(fmt)); + args += sizeof(fmt); + + return sizeof(fmt) + pl_str_append_memprintf_c(alloc, str, fmt, args); +} + +void pl_str_builder_vprintf_c(pl_str_builder b, const char *fmt, va_list ap) +{ + pl_str_builder_append(b, template_printf, &fmt, sizeof(fmt)); + + // Push all of the variadic arguments directly onto `b->args` + for (const char *c; (c = strchr(fmt, '%')) != NULL; fmt = c + 1) { + c++; + switch (c[0]) { +#define WRITE(T, x) pl_str_append_raw(b, &b->args, &(T) {x}, sizeof(T)) + case '%': continue; + case 'c': WRITE(char, va_arg(ap, int)); break; + case 'd': WRITE(int, va_arg(ap, int)); break; + case 'u': WRITE(unsigned, va_arg(ap, unsigned)); break; + case 'f': WRITE(double, va_arg(ap, double)); break; + case 'h': + assert(c[1] == 'x'); + WRITE(unsigned short, va_arg(ap, unsigned)); + c++; + break; + case 'l': + assert(c[1] == 'l'); + switch (c[2]) { + case 'u': WRITE(long long unsigned, va_arg(ap, long long unsigned)); break; + case 'd': WRITE(long long int, va_arg(ap, long long int)); break; + default: abort(); + } + c += 2; + break; + case 'z': + assert(c[1] == 'u'); + WRITE(size_t, va_arg(ap, size_t)); + c++; + break; + case 's': { + pl_str str = pl_str0(va_arg(ap, const char *)); + pl_str_append(b, &b->args, str); + b->args.len++; // expand to include \0 byte (from pl_str_append) + break; + } + case '.': { + assert(c[1] == '*'); + assert(c[2] == 's'); + int len = va_arg(ap, int); + const char *str = va_arg(ap, const char *); + WRITE(int, len); + pl_str_append_raw(b, &b->args, str, len); + c += 2; + break; + } + default: + fprintf(stderr, "Invalid conversion character: '%c'!\n", c[0]); + abort(); + } +#undef WRITE + } +} diff --git a/src/pl_string.h b/src/pl_string.h new file mode 100644 index 0000000..7a0005c --- /dev/null +++ b/src/pl_string.h @@ -0,0 +1,318 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +PL_API_BEGIN + +typedef struct pl_str { + uint8_t *buf; + size_t len; +} pl_str; + +// For formatting with "%.*s" +#define PL_STR_FMT(str) (int)((str).len), ((str).buf ? (char *)((str).buf) : "") + +static inline pl_str pl_str0(const char *str) +{ + return (pl_str) { + .buf = (uint8_t *) str, + .len = str ? strlen(str) : 0, + }; +} + +// Macro version of pl_str0, for constants +#define PL_STR0(str) ((pl_str) { (uint8_t *) (str), (str) ? strlen(str) : 0 }) + +static inline pl_str pl_strdup(void *alloc, pl_str str) +{ + return (pl_str) { + .buf = (uint8_t *) (str.len ? 
pl_memdup(alloc, str.buf, str.len) : NULL), + .len = str.len, + }; +} + +// Always returns a valid string +static inline char *pl_strdup0(void *alloc, pl_str str) +{ + return pl_strndup0(alloc, str.len ? (char *) str.buf : "", str.len); +} + +// Adds a trailing \0 for convenience, even if `append` is an empty string +void pl_str_append(void *alloc, pl_str *str, pl_str append); + +// Like `pl_str_append` but for raw memory, omits trailing \0 +void pl_str_append_raw(void *alloc, pl_str *str, const void *ptr, size_t size); + +// Locale-sensitive string functions +char *pl_asprintf(void *parent, const char *fmt, ...) + PL_PRINTF(2, 3); +char *pl_vasprintf(void *parent, const char *fmt, va_list ap) + PL_PRINTF(2, 0); +void pl_str_append_asprintf(void *alloc, pl_str *str, const char *fmt, ...) + PL_PRINTF(3, 4); +void pl_str_append_vasprintf(void *alloc, pl_str *str, const char *fmt, va_list va) + PL_PRINTF(3, 0); +int pl_str_sscanf(pl_str str, const char *fmt, ...); + +// Locale-invariant versions of append_(v)asprintf +// +// NOTE: These only support a small handful of modifiers. Check `format.c` +// for a list. Calling them on an invalid string will abort! +void pl_str_append_asprintf_c(void *alloc, pl_str *str, const char *fmt, ...) + PL_PRINTF(3, 4); +void pl_str_append_vasprintf_c(void *alloc, pl_str *str, const char *fmt, va_list va) + PL_PRINTF(3, 0); + +// Variant of the above which takes arguments directly from a pointer in memory, +// reading them incrementally (tightly packed). Returns the amount of bytes +// read from `args`, as determined by the following table: +// +// %c: sizeof(char) +// %d, %u: sizeof(int) +// %f: sizeof(double) +// %lld, %llu: sizeof(long long int) +// %zu: sizeof(size_t) +// %s: \0 terminated string +// %.*s: sizeof(int) + that many bytes (no \0 terminator) +size_t pl_str_append_memprintf_c(void *alloc, pl_str *str, const char *fmt, + const void *args) + PL_PRINTF(3, 0); + +// Locale-invariant number printing +int pl_str_print_hex(char *buf, size_t len, unsigned short n); +int pl_str_print_int(char *buf, size_t len, int n); +int pl_str_print_uint(char *buf, size_t len, unsigned int n); +int pl_str_print_int64(char *buf, size_t len, int64_t n); +int pl_str_print_uint64(char *buf, size_t len, uint64_t n); +int pl_str_print_float(char *buf, size_t len, float n); +int pl_str_print_double(char *buf, size_t len, double n); + +// Locale-invariant number parsing +bool pl_str_parse_hex(pl_str str, unsigned short *out); +bool pl_str_parse_int(pl_str str, int *out); +bool pl_str_parse_uint(pl_str str, unsigned int *out); +bool pl_str_parse_int64(pl_str str, int64_t *out); +bool pl_str_parse_uint64(pl_str str, uint64_t *out); +bool pl_str_parse_float(pl_str str, float *out); +bool pl_str_parse_double(pl_str str, double *out); + +// Variants of string.h functions +int pl_strchr(pl_str str, int c); +size_t pl_strspn(pl_str str, const char *accept); +size_t pl_strcspn(pl_str str, const char *reject); + +// Strip leading/trailing whitespace +pl_str pl_str_strip(pl_str str); + +// Generic functions for cutting up strings +static inline pl_str pl_str_take(pl_str str, size_t len) +{ + if (len < str.len) + str.len = len; + return str; +} + +static inline pl_str pl_str_drop(pl_str str, size_t len) +{ + if (len >= str.len) + return (pl_str) { .buf = NULL, .len = 0 }; + + str.buf += len; + str.len -= len; + return str; +} + +// Find a substring in another string, and return its index (or -1) +int pl_str_find(pl_str haystack, pl_str needle); + +// String splitting functions. 
These return the part of the string before +// the separator, and optionally the rest (in `out_rest`). +// +// Note that the separator is not included as part of either string. +pl_str pl_str_split_char(pl_str str, char sep, pl_str *out_rest); +pl_str pl_str_split_str(pl_str str, pl_str sep, pl_str *out_rest); + +// Like `pl_str_split_char`, but splits on any char in `seps` +pl_str pl_str_split_chars(pl_str str, const char *seps, pl_str *out_rest); + +static inline pl_str pl_str_getline(pl_str str, pl_str *out_rest) +{ + return pl_str_split_char(str, '\n', out_rest); +} + +// Decode a string containing hexadecimal data. All whitespace will be silently +// ignored. When successful, this allocates a new array to store the output. +bool pl_str_decode_hex(void *alloc, pl_str hex, pl_str *out); + +static inline bool pl_str_equals(pl_str str1, pl_str str2) +{ + if (str1.len != str2.len) + return false; + if (str1.buf == str2.buf || !str1.len) + return true; + return memcmp(str1.buf, str2.buf, str1.len) == 0; +} + +static inline bool pl_str_startswith(pl_str str, pl_str prefix) +{ + if (!prefix.len) + return true; + if (str.len < prefix.len) + return false; + return memcmp(str.buf, prefix.buf, prefix.len) == 0; +} + +static inline bool pl_str_endswith(pl_str str, pl_str suffix) +{ + if (!suffix.len) + return true; + if (str.len < suffix.len) + return false; + return memcmp(str.buf + str.len - suffix.len, suffix.buf, suffix.len) == 0; +} + +static inline bool pl_str_eatstart(pl_str *str, pl_str prefix) +{ + if (!pl_str_startswith(*str, prefix)) + return false; + + str->buf += prefix.len; + str->len -= prefix.len; + return true; +} + +static inline bool pl_str_eatend(pl_str *str, pl_str suffix) +{ + if (!pl_str_endswith(*str, suffix)) + return false; + + str->len -= suffix.len; + return true; +} + +// Convenience wrappers for the above which save the use of a pl_str0 +static inline pl_str pl_str_split_str0(pl_str str, const char *sep, pl_str *out_rest) +{ + return pl_str_split_str(str, pl_str0(sep), out_rest); +} + +static inline bool pl_str_startswith0(pl_str str, const char *prefix) +{ + return pl_str_startswith(str, pl_str0(prefix)); +} + +static inline bool pl_str_endswith0(pl_str str, const char *suffix) +{ + return pl_str_endswith(str, pl_str0(suffix)); +} + +static inline bool pl_str_equals0(pl_str str1, const char *str2) +{ + return pl_str_equals(str1, pl_str0(str2)); +} + +static inline bool pl_str_eatstart0(pl_str *str, const char *prefix) +{ + return pl_str_eatstart(str, pl_str0(prefix)); +} + +static inline bool pl_str_eatend0(pl_str *str, const char *prefix) +{ + return pl_str_eatend(str, pl_str0(prefix)); +} + +// String building helpers, used to lazily construct a string by appending a +// series of string templates which can be executed on-demand into a final +// output buffer. +typedef struct pl_str_builder_t *pl_str_builder; + +// Returns the number of bytes consumed from `args`. Be warned that the pointer +// given will not necessarily be aligned to the type you need it as, so make +// sure to use `memcpy` or some other method of safely loading arbitrary data +// from memory. +typedef size_t (*pl_str_template)(void *alloc, pl_str *buf, const uint8_t *args); + +pl_str_builder pl_str_builder_alloc(void *alloc); +void pl_str_builder_free(pl_str_builder *builder); + +// Resets string builder without destroying buffer +void pl_str_builder_reset(pl_str_builder builder); + +// Returns a representative hash of the string builder's output, without +// actually executing it. 
Note that this is *not* the same as a pl_str_hash of +// the string builder's output. +// +// Note also that the output of this may not survive a process restart because +// of position-independent code and address randomization moving around the +// locatons of template functions, so special care must be taken not to +// compare such hashes across process invocations. +uint64_t pl_str_builder_hash(const pl_str_builder builder); + +// Executes a string builder, dispatching all templates. The resulting string +// is guaranteed to be \0-terminated, as a minor convenience. +// +// Calling any other `pl_str_builder_*` function on this builder causes the +// contents of the returned string to become undefined. +pl_str pl_str_builder_exec(pl_str_builder builder); + +// Append a template and its arguments to a string builder +void pl_str_builder_append(pl_str_builder builder, pl_str_template tmpl, + const void *args, size_t args_size); + +// Append an entire other `pl_str_builder` onto `builder` +void pl_str_builder_concat(pl_str_builder builder, const pl_str_builder append); + +// Append a constant string. This will only record &str into the buffer, which +// may have a number of unwanted consequences if the memory pointed at by +// `str` mutates at any point in time in the future, or if `str` is not +// at a stable location in memory. +// +// This is intended for strings which are compile-time constants. +void pl_str_builder_const_str(pl_str_builder builder, const char *str); + +// Append a string. This will make a full copy of `str` +void pl_str_builder_str(pl_str_builder builder, const pl_str str); +#define pl_str_builder_str0(b, str) pl_str_builder_str(b, pl_str0(str)) + +// Append a string printf-style. This will preprocess `fmt` to determine the +// number and type of arguments. Supports the same format conversion characters +// as `pl_str_append_asprintf_c`. +void pl_str_builder_printf_c(pl_str_builder builder, const char *fmt, ...) + PL_PRINTF(2, 3); + +void pl_str_builder_vprintf_c(pl_str_builder builder, const char *fmt, va_list ap) + PL_PRINTF(2, 0); + +// Helper macro to specialize `pl_str_builder_printf_c` to +// `pl_str_builder_const_str` if it contains no format characters. +#define pl_str_builder_addf(builder, ...) do \ +{ \ + if (_contains_fmt_chars(__VA_ARGS__)) { \ + pl_str_builder_printf_c(builder, __VA_ARGS__); \ + } else { \ + pl_str_builder_const_str(builder, _get_fmt(__VA_ARGS__)); \ + } \ +} while (0) + +// Helper macros to deal with the non-portability of __VA_OPT__(,) +#define _contains_fmt_chars(fmt, ...) (strchr(fmt, '%')) +#define _get_fmt(fmt, ...) fmt + +PL_API_END diff --git a/src/pl_thread.h b/src/pl_thread.h new file mode 100644 index 0000000..7a5ae47 --- /dev/null +++ b/src/pl_thread.h @@ -0,0 +1,73 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "os.h" + +enum pl_mutex_type { + PL_MUTEX_NORMAL = 0, + PL_MUTEX_RECURSIVE, +}; + +#define pl_mutex_init(mutex) \ + pl_mutex_init_type(mutex, PL_MUTEX_NORMAL) + +// Note: This is never compiled, and only documents the API. The actual +// implementations of these prototypes may be macros. +#ifdef PL_API_REFERENCE + +typedef void pl_mutex; +void pl_mutex_init_type(pl_mutex *mutex, enum pl_mutex_type mtype); +int pl_mutex_destroy(pl_mutex *mutex); +int pl_mutex_lock(pl_mutex *mutex); +int pl_mutex_unlock(pl_mutex *mutex); + +typedef void pl_cond; +int pl_cond_init(pl_cond *cond); +int pl_cond_destroy(pl_cond *cond); +int pl_cond_broadcast(pl_cond *cond); +int pl_cond_signal(pl_cond *cond); + +// `timeout` is in nanoseconds, or UINT64_MAX to block forever +int pl_cond_timedwait(pl_cond *cond, pl_mutex *mutex, uint64_t timeout); +int pl_cond_wait(pl_cond *cond, pl_mutex *mutex); + +typedef void pl_static_mutex; +#define PL_STATIC_MUTEX_INITIALIZER +int pl_static_mutex_lock(pl_static_mutex *mutex); +int pl_static_mutex_unlock(pl_static_mutex *mutex); + +typedef void pl_thread; +#define PL_THREAD_VOID void +#define PL_THREAD_RETURN() return +int pl_thread_create(pl_thread *thread, PL_THREAD_VOID (*fun)(void *), void *arg); +int pl_thread_join(pl_thread thread); + +// Returns true if slept the full time, false otherwise +bool pl_thread_sleep(double t); + +#endif + +// Actual platform-specific implementation +#ifdef PL_HAVE_WIN32 +#include "pl_thread_win32.h" +#elif defined(PL_HAVE_PTHREAD) +#include "pl_thread_pthread.h" +#else +#error No threading implementation available! +#endif diff --git a/src/pl_thread_pthread.h b/src/pl_thread_pthread.h new file mode 100644 index 0000000..5910650 --- /dev/null +++ b/src/pl_thread_pthread.h @@ -0,0 +1,137 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include <errno.h> +#include <pthread.h> +#include <sys/time.h> +#include <time.h> + +#include <pl_assert.h> + +typedef pthread_mutex_t pl_mutex; +typedef pthread_cond_t pl_cond; +typedef pthread_mutex_t pl_static_mutex; +typedef pthread_t pl_thread; +#define PL_STATIC_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER + +static inline int pl_mutex_init_type_internal(pl_mutex *mutex, enum pl_mutex_type mtype) +{ + int mutex_type; + switch (mtype) { + case PL_MUTEX_RECURSIVE: + mutex_type = PTHREAD_MUTEX_RECURSIVE; + break; + case PL_MUTEX_NORMAL: + default: + #ifndef NDEBUG + mutex_type = PTHREAD_MUTEX_ERRORCHECK; + #else + mutex_type = PTHREAD_MUTEX_DEFAULT; + #endif + break; + } + + int ret = 0; + pthread_mutexattr_t attr; + ret = pthread_mutexattr_init(&attr); + if (ret != 0) + return ret; + + pthread_mutexattr_settype(&attr, mutex_type); + ret = pthread_mutex_init(mutex, &attr); + pthread_mutexattr_destroy(&attr); + return ret; +} + +#define pl_mutex_init_type(mutex, mtype) \ + pl_assert(!pl_mutex_init_type_internal(mutex, mtype)) + +#define pl_mutex_destroy pthread_mutex_destroy +#define pl_mutex_lock pthread_mutex_lock +#define pl_mutex_unlock pthread_mutex_unlock + +static inline int pl_cond_init(pl_cond *cond) +{ + int ret = 0; + pthread_condattr_t attr; + ret = pthread_condattr_init(&attr); + if (ret != 0) + return ret; + +#ifdef PTHREAD_HAS_SETCLOCK + pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); +#endif + ret = pthread_cond_init(cond, &attr); + pthread_condattr_destroy(&attr); + return ret; +} + +#define pl_cond_destroy pthread_cond_destroy +#define pl_cond_broadcast pthread_cond_broadcast +#define pl_cond_signal pthread_cond_signal +#define pl_cond_wait pthread_cond_wait + +static inline int pl_cond_timedwait(pl_cond *cond, pl_mutex *mutex, uint64_t timeout) +{ + if (timeout == UINT64_MAX) + return pthread_cond_wait(cond, mutex); + + struct timespec ts; +#ifdef PTHREAD_HAS_SETCLOCK + if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) + return errno; +#else + struct timeval tv; + if (gettimeofday(&tv, NULL) < 0) // equivalent to CLOCK_REALTIME + return errno; + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * 1000; +#endif + + ts.tv_sec += timeout / 1000000000LLU; + ts.tv_nsec += timeout % 1000000000LLU; + + if (ts.tv_nsec > 1000000000L) { + ts.tv_nsec -= 1000000000L; + ts.tv_sec++; + } + + return pthread_cond_timedwait(cond, mutex, &ts); +} + +#define pl_static_mutex_lock pthread_mutex_lock +#define pl_static_mutex_unlock pthread_mutex_unlock + +#define PL_THREAD_VOID void * +#define PL_THREAD_RETURN() return NULL + +#define pl_thread_create(t, f, a) pthread_create(t, NULL, f, a) +#define pl_thread_join(t) pthread_join(t, NULL) + +static inline bool pl_thread_sleep(double t) +{ + if (t <= 0.0) + return true; + + struct timespec ts; + ts.tv_sec = (time_t) t; + ts.tv_nsec = (t - ts.tv_sec) * 1e9; + + return nanosleep(&ts, NULL) == 0; +} diff --git a/src/pl_thread_win32.h b/src/pl_thread_win32.h new file mode 100644 index 0000000..ef68d50 --- /dev/null +++ b/src/pl_thread_win32.h @@ -0,0 +1,182 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <windows.h> +#include <process.h> +#include <stdint.h> +#include <errno.h> + +#include <pl_assert.h> + +typedef CRITICAL_SECTION pl_mutex; +typedef CONDITION_VARIABLE pl_cond; + +static inline int pl_mutex_init_type_internal(pl_mutex *mutex, enum pl_mutex_type mtype) +{ + (void) mtype; + return !InitializeCriticalSectionEx(mutex, 0, 0); +} + +#define pl_mutex_init_type(mutex, mtype) \ + pl_assert(!pl_mutex_init_type_internal(mutex, mtype)) + +static inline int pl_mutex_destroy(pl_mutex *mutex) +{ + DeleteCriticalSection(mutex); + return 0; +} + +static inline int pl_mutex_lock(pl_mutex *mutex) +{ + EnterCriticalSection(mutex); + return 0; +} + +static inline int pl_mutex_unlock(pl_mutex *mutex) +{ + LeaveCriticalSection(mutex); + return 0; +} + +static inline int pl_cond_init(pl_cond *cond) +{ + InitializeConditionVariable(cond); + return 0; +} + +static inline int pl_cond_destroy(pl_cond *cond) +{ + // condition variables are not destroyed + (void) cond; + return 0; +} + +static inline int pl_cond_broadcast(pl_cond *cond) +{ + WakeAllConditionVariable(cond); + return 0; +} + +static inline int pl_cond_signal(pl_cond *cond) +{ + WakeConditionVariable(cond); + return 0; +} + +static inline int pl_cond_wait(pl_cond *cond, pl_mutex *mutex) +{ + return !SleepConditionVariableCS(cond, mutex, INFINITE); +} + +static inline int pl_cond_timedwait(pl_cond *cond, pl_mutex *mutex, uint64_t timeout) +{ + if (timeout == UINT64_MAX) + return pl_cond_wait(cond, mutex); + + timeout /= UINT64_C(1000000); + if (timeout > INFINITE - 1) + timeout = INFINITE - 1; + + BOOL bRet = SleepConditionVariableCS(cond, mutex, timeout); + if (bRet == FALSE) + { + if (GetLastError() == ERROR_TIMEOUT) + return ETIMEDOUT; + else + return EINVAL; + } + return 0; +} + +typedef SRWLOCK pl_static_mutex; +#define PL_STATIC_MUTEX_INITIALIZER SRWLOCK_INIT + +static inline int pl_static_mutex_lock(pl_static_mutex *mutex) +{ + AcquireSRWLockExclusive(mutex); + return 0; +} + +static inline int pl_static_mutex_unlock(pl_static_mutex *mutex) +{ + ReleaseSRWLockExclusive(mutex); + return 0; +} + +typedef HANDLE pl_thread; +#define PL_THREAD_VOID unsigned __stdcall +#define PL_THREAD_RETURN() return 0 + +static inline int pl_thread_create(pl_thread *thread, + PL_THREAD_VOID (*fun)(void *), + void *__restrict arg) +{ + *thread = (HANDLE) _beginthreadex(NULL, 0, fun, arg, 0, NULL); + return *thread ? 0 : -1; +} + +static inline int pl_thread_join(pl_thread thread) +{ + DWORD ret = WaitForSingleObject(thread, INFINITE); + if (ret != WAIT_OBJECT_0) + return ret == WAIT_ABANDONED ? EINVAL : EDEADLK; + CloseHandle(thread); + return 0; +} + +static inline bool pl_thread_sleep(double t) +{ + // Time is expected in 100 nanosecond intervals. + // Negative values indicate relative time. 
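+ // For example, t = 0.25 s becomes QuadPart = -(LONGLONG) (0.25 * 1e7)
+ // = -2500000, i.e. 2.5 million ticks of 100 ns each, negated so the
+ // waitable timer below interprets it as a relative (not absolute) due time.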
+ LARGE_INTEGER time = { .QuadPart = -(LONGLONG) (t * 1e7) }; + + if (time.QuadPart >= 0) + return true; + + bool ret = false; + +#ifndef CREATE_WAITABLE_TIMER_HIGH_RESOLUTION +# define CREATE_WAITABLE_TIMER_HIGH_RESOLUTION 0x2 +#endif + + HANDLE timer = CreateWaitableTimerEx(NULL, NULL, + CREATE_WAITABLE_TIMER_HIGH_RESOLUTION, + TIMER_ALL_ACCESS); + + // CREATE_WAITABLE_TIMER_HIGH_RESOLUTION is supported in Windows 10 1803+, + // retry without it. + if (!timer) + timer = CreateWaitableTimerEx(NULL, NULL, 0, TIMER_ALL_ACCESS); + + if (!timer) + goto end; + + if (!SetWaitableTimer(timer, &time, 0, NULL, NULL, 0)) + goto end; + + if (WaitForSingleObject(timer, INFINITE) != WAIT_OBJECT_0) + goto end; + + ret = true; + +end: + if (timer) + CloseHandle(timer); + return ret; +} diff --git a/src/renderer.c b/src/renderer.c new file mode 100644 index 0000000..cc56b6f --- /dev/null +++ b/src/renderer.c @@ -0,0 +1,3815 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" +#include "filters.h" +#include "hash.h" +#include "shaders.h" +#include "dispatch.h" + +#include <libplacebo/renderer.h> + +struct cached_frame { + uint64_t signature; + uint64_t params_hash; // for detecting `pl_render_params` changes + struct pl_color_space color; + struct pl_icc_profile profile; + pl_rect2df crop; + pl_tex tex; + int comps; + bool evict; // for garbage collection +}; + +struct sampler { + pl_shader_obj upscaler_state; + pl_shader_obj downscaler_state; +}; + +struct osd_vertex { + float pos[2]; + float coord[2]; + float color[4]; +}; + +struct icc_state { + pl_icc_object icc; + uint64_t error; // set to profile signature on failure +}; + +struct pl_renderer_t { + pl_gpu gpu; + pl_dispatch dp; + pl_log log; + + // Cached feature checks (inverted) + enum pl_render_error errors; + + // List containing signatures of disabled hooks + PL_ARRAY(uint64_t) disabled_hooks; + + // Shader resource objects and intermediate textures (FBOs) + pl_shader_obj tone_map_state; + pl_shader_obj dither_state; + pl_shader_obj grain_state[4]; + pl_shader_obj lut_state[3]; + pl_shader_obj icc_state[2]; + PL_ARRAY(pl_tex) fbos; + struct sampler sampler_main; + struct sampler sampler_contrast; + struct sampler samplers_src[4]; + struct sampler samplers_dst[4]; + + // Temporary storage for vertex/index data + PL_ARRAY(struct osd_vertex) osd_vertices; + PL_ARRAY(uint16_t) osd_indices; + struct pl_vertex_attrib osd_attribs[3]; + + // Frame cache (for frame mixing / interpolation) + PL_ARRAY(struct cached_frame) frames; + PL_ARRAY(pl_tex) frame_fbos; + + // For debugging / logging purposes + int prev_dither; + + // For backwards compatibility + struct icc_state icc_fallback[2]; +}; + +enum { + // Index into `lut_state` + LUT_IMAGE, + LUT_TARGET, + LUT_PARAMS, +}; + +enum { + // Index into `icc_state` + ICC_IMAGE, + ICC_TARGET +}; + 
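What follows are the renderer's public lifecycle functions. For orientation, here is a minimal usage sketch; it is not part of renderer.c, it assumes a pl_log, pl_gpu and two fully initialized struct pl_frame objects already exist, and it relies on pl_render_image() from <libplacebo/renderer.h> (included above) as well as malloc/free from <stdlib.h>:

// Hypothetical caller of the API below -- an illustrative sketch, not library code.
static bool example_render_once(pl_log log, pl_gpu gpu,
                                const struct pl_frame *image,
                                const struct pl_frame *target)
{
    pl_renderer rr = pl_renderer_create(log, gpu);

    // Render a single frame using the default parameter preset
    bool ok = pl_render_image(rr, image, target, &pl_render_default_params);

    // Serialize the underlying GPU/shader cache: query the required size
    // first (out == NULL), then write the blob out for reuse on a later run
    size_t size = pl_renderer_save(rr, NULL);
    uint8_t *cache = size ? malloc(size) : NULL;
    if (cache) {
        pl_renderer_save(rr, cache);
        // ... persist `cache` somewhere ...
        free(cache);
    }

    pl_renderer_destroy(&rr);
    return ok;
}

A blob saved this way is the same data pl_renderer_load() consumes, so callers can persist it between runs to avoid recompiling shaders from scratch.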
+pl_renderer pl_renderer_create(pl_log log, pl_gpu gpu) +{ + pl_renderer rr = pl_alloc_ptr(NULL, rr); + *rr = (struct pl_renderer_t) { + .gpu = gpu, + .log = log, + .dp = pl_dispatch_create(log, gpu), + .osd_attribs = { + { + .name = "pos", + .offset = offsetof(struct osd_vertex, pos), + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + }, { + .name = "coord", + .offset = offsetof(struct osd_vertex, coord), + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + }, { + .name = "osd_color", + .offset = offsetof(struct osd_vertex, color), + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 4), + } + }, + }; + + assert(rr->dp); + return rr; +} + +static void sampler_destroy(pl_renderer rr, struct sampler *sampler) +{ + pl_shader_obj_destroy(&sampler->upscaler_state); + pl_shader_obj_destroy(&sampler->downscaler_state); +} + +void pl_renderer_destroy(pl_renderer *p_rr) +{ + pl_renderer rr = *p_rr; + if (!rr) + return; + + // Free all intermediate FBOs + for (int i = 0; i < rr->fbos.num; i++) + pl_tex_destroy(rr->gpu, &rr->fbos.elem[i]); + for (int i = 0; i < rr->frames.num; i++) + pl_tex_destroy(rr->gpu, &rr->frames.elem[i].tex); + for (int i = 0; i < rr->frame_fbos.num; i++) + pl_tex_destroy(rr->gpu, &rr->frame_fbos.elem[i]); + + // Free all shader resource objects + pl_shader_obj_destroy(&rr->tone_map_state); + pl_shader_obj_destroy(&rr->dither_state); + for (int i = 0; i < PL_ARRAY_SIZE(rr->lut_state); i++) + pl_shader_obj_destroy(&rr->lut_state[i]); + for (int i = 0; i < PL_ARRAY_SIZE(rr->grain_state); i++) + pl_shader_obj_destroy(&rr->grain_state[i]); + for (int i = 0; i < PL_ARRAY_SIZE(rr->icc_state); i++) + pl_shader_obj_destroy(&rr->icc_state[i]); + + // Free all samplers + sampler_destroy(rr, &rr->sampler_main); + sampler_destroy(rr, &rr->sampler_contrast); + for (int i = 0; i < PL_ARRAY_SIZE(rr->samplers_src); i++) + sampler_destroy(rr, &rr->samplers_src[i]); + for (int i = 0; i < PL_ARRAY_SIZE(rr->samplers_dst); i++) + sampler_destroy(rr, &rr->samplers_dst[i]); + + // Free fallback ICC profiles + for (int i = 0; i < PL_ARRAY_SIZE(rr->icc_fallback); i++) + pl_icc_close(&rr->icc_fallback[i].icc); + + pl_dispatch_destroy(&rr->dp); + pl_free_ptr(p_rr); +} + +size_t pl_renderer_save(pl_renderer rr, uint8_t *out) +{ + return pl_cache_save(pl_gpu_cache(rr->gpu), out, out ? 
SIZE_MAX : 0); +} + +void pl_renderer_load(pl_renderer rr, const uint8_t *cache) +{ + pl_cache_load(pl_gpu_cache(rr->gpu), cache, SIZE_MAX); +} + +void pl_renderer_flush_cache(pl_renderer rr) +{ + for (int i = 0; i < rr->frames.num; i++) + pl_tex_destroy(rr->gpu, &rr->frames.elem[i].tex); + rr->frames.num = 0; + + pl_reset_detected_peak(rr->tone_map_state); +} + +const struct pl_render_params pl_render_fast_params = { PL_RENDER_DEFAULTS }; +const struct pl_render_params pl_render_default_params = { + PL_RENDER_DEFAULTS + .upscaler = &pl_filter_lanczos, + .downscaler = &pl_filter_hermite, + .frame_mixer = &pl_filter_oversample, + .sigmoid_params = &pl_sigmoid_default_params, + .dither_params = &pl_dither_default_params, + .peak_detect_params = &pl_peak_detect_default_params, +}; + +const struct pl_render_params pl_render_high_quality_params = { + PL_RENDER_DEFAULTS + .upscaler = &pl_filter_ewa_lanczossharp, + .downscaler = &pl_filter_hermite, + .frame_mixer = &pl_filter_oversample, + .sigmoid_params = &pl_sigmoid_default_params, + .peak_detect_params = &pl_peak_detect_high_quality_params, + .color_map_params = &pl_color_map_high_quality_params, + .dither_params = &pl_dither_default_params, + .deband_params = &pl_deband_default_params, +}; + +const struct pl_filter_preset pl_frame_mixers[] = { + { "none", NULL, "No frame mixing" }, + { "linear", &pl_filter_bilinear, "Linear frame mixing" }, + { "oversample", &pl_filter_oversample, "Oversample (AKA SmoothMotion)" }, + { "mitchell_clamp", &pl_filter_mitchell_clamp, "Clamped Mitchell spline" }, + { "hermite", &pl_filter_hermite, "Cubic spline (Hermite)" }, + {0} +}; + +const int pl_num_frame_mixers = PL_ARRAY_SIZE(pl_frame_mixers) - 1; + +const struct pl_filter_preset pl_scale_filters[] = { + {"none", NULL, "Built-in sampling"}, + {"oversample", &pl_filter_oversample, "Oversample (Aspect-preserving NN)"}, + COMMON_FILTER_PRESETS, + {0} +}; + +const int pl_num_scale_filters = PL_ARRAY_SIZE(pl_scale_filters) - 1; + +// Represents a "in-flight" image, which is either a shader that's in the +// process of producing some sort of image, or a texture that needs to be +// sampled from +struct img { + // Effective texture size, always set + int w, h; + + // Recommended format (falls back to fbofmt otherwise), only for shaders + pl_fmt fmt; + + // Exactly *one* of these two is set: + pl_shader sh; + pl_tex tex; + + // If true, created shaders will be set to unique + bool unique; + + // Information about what to log/disable/fallback to if the shader fails + const char *err_msg; + enum pl_render_error err_enum; + pl_tex err_tex; + + // Current effective source area, will be sampled by the main scaler + pl_rect2df rect; + + // The current effective colorspace + struct pl_color_repr repr; + struct pl_color_space color; + int comps; +}; + +// Plane 'type', ordered by incrementing priority +enum plane_type { + PLANE_INVALID = 0, + PLANE_ALPHA, + PLANE_CHROMA, + PLANE_LUMA, + PLANE_RGB, + PLANE_XYZ, +}; + +static inline enum plane_type detect_plane_type(const struct pl_plane *plane, + const struct pl_color_repr *repr) +{ + if (pl_color_system_is_ycbcr_like(repr->sys)) { + int t = PLANE_INVALID; + for (int c = 0; c < plane->components; c++) { + switch (plane->component_mapping[c]) { + case PL_CHANNEL_Y: t = PL_MAX(t, PLANE_LUMA); continue; + case PL_CHANNEL_A: t = PL_MAX(t, PLANE_ALPHA); continue; + + case PL_CHANNEL_CB: + case PL_CHANNEL_CR: + t = PL_MAX(t, PLANE_CHROMA); + continue; + + default: continue; + } + } + + pl_assert(t); + return t; + } + + // Extra 
test for exclusive / separated alpha plane + if (plane->components == 1 && plane->component_mapping[0] == PL_CHANNEL_A) + return PLANE_ALPHA; + + switch (repr->sys) { + case PL_COLOR_SYSTEM_UNKNOWN: // fall through to RGB + case PL_COLOR_SYSTEM_RGB: return PLANE_RGB; + case PL_COLOR_SYSTEM_XYZ: return PLANE_XYZ; + + // For the switch completeness check + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_BT_2020_C: + case PL_COLOR_SYSTEM_BT_2100_PQ: + case PL_COLOR_SYSTEM_BT_2100_HLG: + case PL_COLOR_SYSTEM_DOLBYVISION: + case PL_COLOR_SYSTEM_YCGCO: + case PL_COLOR_SYSTEM_COUNT: + break; + } + + pl_unreachable(); +} + +struct pass_state { + void *tmp; + pl_renderer rr; + const struct pl_render_params *params; + struct pl_render_info info; // for info callback + + // Represents the "current" image which we're in the process of rendering. + // This is initially set by pass_read_image, and all of the subsequent + // rendering steps will mutate this in-place. + struct img img; + + // Represents the "reference rect". Canonically, this is functionally + // equivalent to `image.crop`, but also updates as the refplane evolves + // (e.g. due to user hook prescalers) + pl_rect2df ref_rect; + + // Integer version of `target.crop`. Semantically identical. + pl_rect2d dst_rect; + + // Logical end-to-end rotation + pl_rotation rotation; + + // Cached copies of the `image` / `target` for this rendering pass, + // corrected to make sure all rects etc. are properly defaulted/inferred. + struct pl_frame image; + struct pl_frame target; + + // Cached copies of the `prev` / `next` frames, for deinterlacing. + struct pl_frame prev, next; + + // Some extra plane metadata, inferred from `planes` + enum plane_type src_type[4]; + int src_ref, dst_ref; // index into `planes` + + // Metadata for `rr->fbos` + pl_fmt fbofmt[5]; + bool *fbos_used; + bool need_peak_fbo; // need indirection for peak detection + + // Map of acquired frames + struct { + bool target, image, prev, next; + } acquired; +}; + +static void find_fbo_format(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + if (params->disable_fbos || (rr->errors & PL_RENDER_ERR_FBO) || pass->fbofmt[4]) + return; + + struct { + enum pl_fmt_type type; + int depth; + enum pl_fmt_caps caps; + } configs[] = { + // Prefer floating point formats first + {PL_FMT_FLOAT, 16, PL_FMT_CAP_LINEAR}, + {PL_FMT_FLOAT, 16, PL_FMT_CAP_SAMPLEABLE}, + + // Otherwise, fall back to unorm/snorm, preferring linearly sampleable + {PL_FMT_UNORM, 16, PL_FMT_CAP_LINEAR}, + {PL_FMT_SNORM, 16, PL_FMT_CAP_LINEAR}, + {PL_FMT_UNORM, 16, PL_FMT_CAP_SAMPLEABLE}, + {PL_FMT_SNORM, 16, PL_FMT_CAP_SAMPLEABLE}, + + // As a final fallback, allow 8-bit FBO formats (for UNORM only) + {PL_FMT_UNORM, 8, PL_FMT_CAP_LINEAR}, + {PL_FMT_UNORM, 8, PL_FMT_CAP_SAMPLEABLE}, + }; + + pl_fmt fmt = NULL; + for (int i = 0; i < PL_ARRAY_SIZE(configs); i++) { + if (params->force_low_bit_depth_fbos && configs[i].depth > 8) + continue; + + fmt = pl_find_fmt(rr->gpu, configs[i].type, 4, configs[i].depth, 0, + PL_FMT_CAP_RENDERABLE | configs[i].caps); + if (!fmt) + continue; + + pass->fbofmt[4] = fmt; + + // Probe the right variant for each number of channels, falling + // back to the next biggest format + for (int c = 1; c < 4; c++) { + pass->fbofmt[c] = pl_find_fmt(rr->gpu, configs[i].type, c, + configs[i].depth, 0, fmt->caps); + pass->fbofmt[c] = 
PL_DEF(pass->fbofmt[c], pass->fbofmt[c+1]); + } + return; + } + + PL_WARN(rr, "Found no renderable FBO format! Most features disabled"); + rr->errors |= PL_RENDER_ERR_FBO; +} + +static void info_callback(void *priv, const struct pl_dispatch_info *dinfo) +{ + struct pass_state *pass = priv; + const struct pl_render_params *params = pass->params; + if (!params->info_callback) + return; + + pass->info.pass = dinfo; + params->info_callback(params->info_priv, &pass->info); + pass->info.index++; +} + +static pl_tex get_fbo(struct pass_state *pass, int w, int h, pl_fmt fmt, + int comps, pl_debug_tag debug_tag) +{ + pl_renderer rr = pass->rr; + comps = PL_DEF(comps, 4); + fmt = PL_DEF(fmt, pass->fbofmt[comps]); + if (!fmt) + return NULL; + + struct pl_tex_params params = { + .w = w, + .h = h, + .format = fmt, + .sampleable = true, + .renderable = true, + .blit_src = fmt->caps & PL_FMT_CAP_BLITTABLE, + .storable = fmt->caps & PL_FMT_CAP_STORABLE, + .debug_tag = debug_tag, + }; + + int best_idx = -1; + int best_diff = 0; + + // Find the best-fitting texture out of rr->fbos + for (int i = 0; i < rr->fbos.num; i++) { + if (pass->fbos_used[i]) + continue; + + // Orthogonal distance, with penalty for format mismatches + int diff = abs(rr->fbos.elem[i]->params.w - w) + + abs(rr->fbos.elem[i]->params.h - h) + + ((rr->fbos.elem[i]->params.format != fmt) ? 1000 : 0); + + if (best_idx < 0 || diff < best_diff) { + best_idx = i; + best_diff = diff; + } + } + + // No texture found at all, add a new one + if (best_idx < 0) { + best_idx = rr->fbos.num; + PL_ARRAY_APPEND(rr, rr->fbos, NULL); + pl_grow(pass->tmp, &pass->fbos_used, rr->fbos.num * sizeof(bool)); + pass->fbos_used[best_idx] = false; + } + + if (!pl_tex_recreate(rr->gpu, &rr->fbos.elem[best_idx], ¶ms)) + return NULL; + + pass->fbos_used[best_idx] = true; + return rr->fbos.elem[best_idx]; +} + +// Forcibly convert an img to `tex`, dispatching where necessary +static pl_tex _img_tex(struct pass_state *pass, struct img *img, pl_debug_tag tag) +{ + if (img->tex) { + pl_assert(!img->sh); + return img->tex; + } + + pl_renderer rr = pass->rr; + pl_tex tex = get_fbo(pass, img->w, img->h, img->fmt, img->comps, tag); + img->fmt = NULL; + + if (!tex) { + PL_ERR(rr, "Failed creating FBO texture! 
Disabling advanced rendering.."); + memset(pass->fbofmt, 0, sizeof(pass->fbofmt)); + pl_dispatch_abort(rr->dp, &img->sh); + rr->errors |= PL_RENDER_ERR_FBO; + return img->err_tex; + } + + pl_assert(img->sh); + bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = &img->sh, + .target = tex, + )); + + const char *err_msg = img->err_msg; + enum pl_render_error err_enum = img->err_enum; + pl_tex err_tex = img->err_tex; + img->err_msg = NULL; + img->err_enum = PL_RENDER_ERR_NONE; + img->err_tex = NULL; + + if (!ok) { + PL_ERR(rr, "%s", PL_DEF(err_msg, "Failed dispatching intermediate pass!")); + rr->errors |= err_enum; + img->sh = pl_dispatch_begin(rr->dp); + img->tex = err_tex; + return img->tex; + } + + img->tex = tex; + return img->tex; +} + +#define img_tex(pass, img) _img_tex(pass, img, PL_DEBUG_TAG) + +// Forcibly convert an img to `sh`, sampling where necessary +static pl_shader img_sh(struct pass_state *pass, struct img *img) +{ + if (img->sh) { + pl_assert(!img->tex); + return img->sh; + } + + pl_assert(img->tex); + img->sh = pl_dispatch_begin_ex(pass->rr->dp, img->unique); + pl_shader_sample_direct(img->sh, pl_sample_src( .tex = img->tex )); + + img->tex = NULL; + return img->sh; +} + +enum sampler_type { + SAMPLER_DIRECT, // pick based on texture caps + SAMPLER_NEAREST, // direct sampling, force nearest + SAMPLER_BICUBIC, // fast bicubic scaling + SAMPLER_HERMITE, // fast hermite scaling + SAMPLER_GAUSSIAN, // fast gaussian scaling + SAMPLER_COMPLEX, // complex custom filters + SAMPLER_OVERSAMPLE, +}; + +enum sampler_dir { + SAMPLER_NOOP, // 1:1 scaling + SAMPLER_UP, // upscaling + SAMPLER_DOWN, // downscaling +}; + +enum sampler_usage { + SAMPLER_MAIN, + SAMPLER_PLANE, + SAMPLER_CONTRAST, +}; + +struct sampler_info { + const struct pl_filter_config *config; // if applicable + enum sampler_usage usage; + enum sampler_type type; + enum sampler_dir dir; + enum sampler_dir dir_sep[2]; +}; + +static struct sampler_info sample_src_info(struct pass_state *pass, + const struct pl_sample_src *src, + enum sampler_usage usage) +{ + const struct pl_render_params *params = pass->params; + struct sampler_info info = { .usage = usage }; + pl_renderer rr = pass->rr; + + float rx = src->new_w / fabsf(pl_rect_w(src->rect)); + if (rx < 1.0 - 1e-6) { + info.dir_sep[0] = SAMPLER_DOWN; + } else if (rx > 1.0 + 1e-6) { + info.dir_sep[0] = SAMPLER_UP; + } + + float ry = src->new_h / fabsf(pl_rect_h(src->rect)); + if (ry < 1.0 - 1e-6) { + info.dir_sep[1] = SAMPLER_DOWN; + } else if (ry > 1.0 + 1e-6) { + info.dir_sep[1] = SAMPLER_UP; + } + + if (params->correct_subpixel_offsets) { + if (!info.dir_sep[0] && fabsf(src->rect.x0) > 1e-6f) + info.dir_sep[0] = SAMPLER_UP; + if (!info.dir_sep[1] && fabsf(src->rect.y0) > 1e-6f) + info.dir_sep[1] = SAMPLER_UP; + } + + // We use PL_MAX so downscaling overrides upscaling when choosing scalers + info.dir = PL_MAX(info.dir_sep[0], info.dir_sep[1]); + switch (info.dir) { + case SAMPLER_DOWN: + if (usage == SAMPLER_CONTRAST) { + info.config = &pl_filter_bicubic; + } else if (usage == SAMPLER_PLANE && params->plane_downscaler) { + info.config = params->plane_downscaler; + } else { + info.config = params->downscaler; + } + break; + case SAMPLER_UP: + if (usage == SAMPLER_PLANE && params->plane_upscaler) { + info.config = params->plane_upscaler; + } else { + pl_assert(usage != SAMPLER_CONTRAST); + info.config = params->upscaler; + } + break; + case SAMPLER_NOOP: + info.type = SAMPLER_NEAREST; + return info; + } + + if ((rr->errors & PL_RENDER_ERR_SAMPLING) || 
!info.config) { + info.type = SAMPLER_DIRECT; + } else if (info.config->kernel == &pl_filter_function_oversample) { + info.type = SAMPLER_OVERSAMPLE; + } else { + info.type = SAMPLER_COMPLEX; + + // Try using faster replacements for GPU built-in scalers + pl_fmt texfmt = src->tex ? src->tex->params.format : pass->fbofmt[4]; + bool can_linear = texfmt->caps & PL_FMT_CAP_LINEAR; + bool can_fast = info.dir == SAMPLER_UP || params->skip_anti_aliasing; + + if (can_fast && !params->disable_builtin_scalers) { + if (can_linear && info.config == &pl_filter_bicubic) + info.type = SAMPLER_BICUBIC; + if (can_linear && info.config == &pl_filter_hermite) + info.type = SAMPLER_HERMITE; + if (can_linear && info.config == &pl_filter_gaussian) + info.type = SAMPLER_GAUSSIAN; + if (can_linear && info.config == &pl_filter_bilinear) + info.type = SAMPLER_DIRECT; + if (info.config == &pl_filter_nearest) + info.type = can_linear ? SAMPLER_NEAREST : SAMPLER_DIRECT; + } + } + + // Disable advanced scaling without FBOs + if (!pass->fbofmt[4] && info.type == SAMPLER_COMPLEX) + info.type = SAMPLER_DIRECT; + + return info; +} + +static void dispatch_sampler(struct pass_state *pass, pl_shader sh, + struct sampler *sampler, enum sampler_usage usage, + pl_tex target_tex, const struct pl_sample_src *src) +{ + const struct pl_render_params *params = pass->params; + if (!sampler) + goto fallback; + + pl_renderer rr = pass->rr; + struct sampler_info info = sample_src_info(pass, src, usage); + pl_shader_obj *lut = NULL; + switch (info.dir) { + case SAMPLER_NOOP: + goto fallback; + case SAMPLER_DOWN: + lut = &sampler->downscaler_state; + break; + case SAMPLER_UP: + lut = &sampler->upscaler_state; + break; + } + + switch (info.type) { + case SAMPLER_DIRECT: + goto fallback; + case SAMPLER_NEAREST: + pl_shader_sample_nearest(sh, src); + return; + case SAMPLER_OVERSAMPLE: + pl_shader_sample_oversample(sh, src, info.config->kernel->params[0]); + return; + case SAMPLER_BICUBIC: + pl_shader_sample_bicubic(sh, src); + return; + case SAMPLER_HERMITE: + pl_shader_sample_hermite(sh, src); + return; + case SAMPLER_GAUSSIAN: + pl_shader_sample_gaussian(sh, src); + return; + case SAMPLER_COMPLEX: + break; // continue below + } + + pl_assert(lut); + struct pl_sample_filter_params fparams = { + .filter = *info.config, + .antiring = params->antiringing_strength, + .no_widening = params->skip_anti_aliasing && usage != SAMPLER_CONTRAST, + .lut = lut, + }; + + if (target_tex) { + fparams.no_compute = !target_tex->params.storable; + } else { + fparams.no_compute = !(pass->fbofmt[4]->caps & PL_FMT_CAP_STORABLE); + } + + bool ok; + if (info.config->polar) { + // Polar samplers are always a single function call + ok = pl_shader_sample_polar(sh, src, &fparams); + } else if (info.dir_sep[0] && info.dir_sep[1]) { + // Scaling is needed in both directions + struct pl_sample_src src1 = *src, src2 = *src; + src1.new_w = src->tex->params.w; + src1.rect.x0 = 0; + src1.rect.x1 = src1.new_w;; + src2.rect.y0 = 0; + src2.rect.y1 = src1.new_h; + + pl_shader tsh = pl_dispatch_begin(rr->dp); + ok = pl_shader_sample_ortho2(tsh, &src1, &fparams); + if (!ok) { + pl_dispatch_abort(rr->dp, &tsh); + goto done; + } + + struct img img = { + .sh = tsh, + .w = src1.new_w, + .h = src1.new_h, + .comps = src->components, + }; + + src2.tex = img_tex(pass, &img); + src2.scale = 1.0; + ok = src2.tex && pl_shader_sample_ortho2(sh, &src2, &fparams); + } else { + // Scaling is needed only in one direction + ok = pl_shader_sample_ortho2(sh, src, &fparams); + } + +done: + if (!ok) { + 
PL_ERR(rr, "Failed dispatching scaler.. disabling"); + rr->errors |= PL_RENDER_ERR_SAMPLING; + goto fallback; + } + + return; + +fallback: + // If all else fails, fall back to auto sampling + pl_shader_sample_direct(sh, src); +} + +static void swizzle_color(pl_shader sh, int comps, const int comp_map[4], + bool force_alpha) +{ + ident_t orig = sh_fresh(sh, "orig_color"); + GLSL("vec4 "$" = color; \n" + "color = vec4(0.0, 0.0, 0.0, 1.0); \n", orig); + + static const int def_map[4] = {0, 1, 2, 3}; + comp_map = PL_DEF(comp_map, def_map); + + for (int c = 0; c < comps; c++) { + if (comp_map[c] >= 0) + GLSL("color[%d] = "$"[%d]; \n", c, orig, comp_map[c]); + } + + if (force_alpha) + GLSL("color.a = "$".a; \n", orig); +} + +// `scale` adapts from `pass->dst_rect` to the plane being rendered to +static void draw_overlays(struct pass_state *pass, pl_tex fbo, + int comps, const int comp_map[4], + const struct pl_overlay *overlays, int num, + struct pl_color_space color, struct pl_color_repr repr, + const pl_transform2x2 *output_shift) +{ + pl_renderer rr = pass->rr; + if (num <= 0 || (rr->errors & PL_RENDER_ERR_OVERLAY)) + return; + + enum pl_fmt_caps caps = fbo->params.format->caps; + if (!(rr->errors & PL_RENDER_ERR_BLENDING) && + !(caps & PL_FMT_CAP_BLENDABLE)) + { + PL_WARN(rr, "Trying to draw an overlay to a non-blendable target. " + "Alpha blending is disabled, results may be incorrect!"); + rr->errors |= PL_RENDER_ERR_BLENDING; + } + + const struct pl_frame *image = pass->src_ref >= 0 ? &pass->image : NULL; + pl_transform2x2 src_to_dst; + if (image) { + float rx = pl_rect_w(pass->dst_rect) / pl_rect_w(image->crop); + float ry = pl_rect_h(pass->dst_rect) / pl_rect_h(image->crop); + src_to_dst = (pl_transform2x2) { + .mat.m = {{ rx, 0 }, { 0, ry }}, + .c = { + pass->dst_rect.x0 - rx * image->crop.x0, + pass->dst_rect.y0 - ry * image->crop.y0, + }, + }; + + if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) { + PL_SWAP(src_to_dst.c[0], src_to_dst.c[1]); + src_to_dst.mat = (pl_matrix2x2) {{{ 0, ry }, { rx, 0 }}}; + } + } + + const struct pl_frame *target = &pass->target; + pl_rect2df dst_crop = target->crop; + pl_rect2df_rotate(&dst_crop, -pass->rotation); + pl_rect2df_normalize(&dst_crop); + + for (int n = 0; n < num; n++) { + struct pl_overlay ol = overlays[n]; + if (!ol.num_parts) + continue; + + if (!ol.coords) { + ol.coords = overlays == target->overlays + ? 
PL_OVERLAY_COORDS_DST_FRAME + : PL_OVERLAY_COORDS_SRC_FRAME; + } + + pl_transform2x2 tf = pl_transform2x2_identity; + switch (ol.coords) { + case PL_OVERLAY_COORDS_SRC_CROP: + if (!image) + continue; + tf.c[0] = image->crop.x0; + tf.c[1] = image->crop.y0; + // fall through + case PL_OVERLAY_COORDS_SRC_FRAME: + if (!image) + continue; + pl_transform2x2_rmul(&src_to_dst, &tf); + break; + case PL_OVERLAY_COORDS_DST_CROP: + tf.c[0] = dst_crop.x0; + tf.c[1] = dst_crop.y0; + break; + case PL_OVERLAY_COORDS_DST_FRAME: + break; + case PL_OVERLAY_COORDS_AUTO: + case PL_OVERLAY_COORDS_COUNT: + pl_unreachable(); + } + + if (output_shift) + pl_transform2x2_rmul(output_shift, &tf); + + // Construct vertex/index buffers + rr->osd_vertices.num = 0; + rr->osd_indices.num = 0; + for (int i = 0; i < ol.num_parts; i++) { + const struct pl_overlay_part *part = &ol.parts[i]; + +#define EMIT_VERT(x, y) \ + do { \ + float pos[2] = { part->dst.x, part->dst.y }; \ + pl_transform2x2_apply(&tf, pos); \ + PL_ARRAY_APPEND(rr, rr->osd_vertices, (struct osd_vertex) { \ + .pos = { \ + 2.0 * (pos[0] / fbo->params.w) - 1.0, \ + 2.0 * (pos[1] / fbo->params.h) - 1.0, \ + }, \ + .coord = { \ + part->src.x / ol.tex->params.w, \ + part->src.y / ol.tex->params.h, \ + }, \ + .color = { \ + part->color[0], part->color[1], \ + part->color[2], part->color[3], \ + }, \ + }); \ + } while (0) + + int idx_base = rr->osd_vertices.num; + EMIT_VERT(x0, y0); // idx 0: top left + EMIT_VERT(x1, y0); // idx 1: top right + EMIT_VERT(x0, y1); // idx 2: bottom left + EMIT_VERT(x1, y1); // idx 3: bottom right + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 0); + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 1); + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 2); + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 2); + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 1); + PL_ARRAY_APPEND(rr, rr->osd_indices, idx_base + 3); + } + + // Draw parts + pl_shader sh = pl_dispatch_begin(rr->dp); + ident_t tex = sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = "osd_tex", + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = ol.tex, + .sample_mode = (ol.tex->params.format->caps & PL_FMT_CAP_LINEAR) + ? PL_TEX_SAMPLE_LINEAR + : PL_TEX_SAMPLE_NEAREST, + }, + }); + + sh_describe(sh, "overlay"); + GLSL("// overlay \n"); + + switch (ol.mode) { + case PL_OVERLAY_NORMAL: + GLSL("vec4 color = textureLod("$", coord, 0.0); \n", tex); + break; + case PL_OVERLAY_MONOCHROME: + GLSL("vec4 color = osd_color; \n"); + break; + case PL_OVERLAY_MODE_COUNT: + pl_unreachable(); + }; + + static const struct pl_color_map_params osd_params = { + PL_COLOR_MAP_DEFAULTS + .tone_mapping_function = &pl_tone_map_linear, + .gamut_mapping = &pl_gamut_map_saturation, + }; + + sh->output = PL_SHADER_SIG_COLOR; + pl_shader_decode_color(sh, &ol.repr, NULL); + if (target->icc) + color.transfer = PL_COLOR_TRC_LINEAR; + pl_shader_color_map_ex(sh, &osd_params, pl_color_map_args(ol.color, color)); + if (target->icc) + pl_icc_encode(sh, target->icc, &rr->icc_state[ICC_TARGET]); + + bool premul = repr.alpha == PL_ALPHA_PREMULTIPLIED; + pl_shader_encode_color(sh, &repr); + if (ol.mode == PL_OVERLAY_MONOCHROME) { + GLSL("color.%s *= textureLod("$", coord, 0.0).r; \n", + premul ? "rgba" : "a", tex); + } + + swizzle_color(sh, comps, comp_map, true); + + struct pl_blend_params blend_params = { + .src_rgb = premul ? 
PL_BLEND_ONE : PL_BLEND_SRC_ALPHA, + .src_alpha = PL_BLEND_ONE, + .dst_rgb = PL_BLEND_ONE_MINUS_SRC_ALPHA, + .dst_alpha = PL_BLEND_ONE_MINUS_SRC_ALPHA, + }; + + bool ok = pl_dispatch_vertex(rr->dp, pl_dispatch_vertex_params( + .shader = &sh, + .target = fbo, + .blend_params = (rr->errors & PL_RENDER_ERR_BLENDING) + ? NULL : &blend_params, + .vertex_stride = sizeof(struct osd_vertex), + .num_vertex_attribs = ol.mode == PL_OVERLAY_NORMAL ? 2 : 3, + .vertex_attribs = rr->osd_attribs, + .vertex_position_idx = 0, + .vertex_coords = PL_COORDS_NORMALIZED, + .vertex_type = PL_PRIM_TRIANGLE_LIST, + .vertex_count = rr->osd_indices.num, + .vertex_data = rr->osd_vertices.elem, + .index_data = rr->osd_indices.elem, + )); + + if (!ok) { + PL_ERR(rr, "Failed rendering overlays!"); + rr->errors |= PL_RENDER_ERR_OVERLAY; + return; + } + } +} + +static pl_tex get_hook_tex(void *priv, int width, int height) +{ + struct pass_state *pass = priv; + + return get_fbo(pass, width, height, NULL, 4, PL_DEBUG_TAG); +} + +// Returns if any hook was applied (even if there were errors) +static bool pass_hook(struct pass_state *pass, struct img *img, + enum pl_hook_stage stage) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + if (!pass->fbofmt[4] || !stage) + return false; + + bool ret = false; + + for (int n = 0; n < params->num_hooks; n++) { + const struct pl_hook *hook = params->hooks[n]; + if (!(hook->stages & stage)) + continue; + + // Hopefully the list of disabled hooks is small, search linearly. + for (int i = 0; i < rr->disabled_hooks.num; i++) { + if (rr->disabled_hooks.elem[i] != hook->signature) + continue; + PL_TRACE(rr, "Skipping hook %d (0x%"PRIx64") stage 0x%x", + n, hook->signature, stage); + goto hook_skip; + } + + PL_TRACE(rr, "Dispatching hook %d (0x%"PRIx64") stage 0x%x", + n, hook->signature, stage); + struct pl_hook_params hparams = { + .gpu = rr->gpu, + .dispatch = rr->dp, + .get_tex = get_hook_tex, + .priv = pass, + .stage = stage, + .rect = img->rect, + .repr = img->repr, + .color = img->color, + .orig_repr = &pass->image.repr, + .orig_color = &pass->image.color, + .components = img->comps, + .src_rect = pass->ref_rect, + .dst_rect = pass->dst_rect, + }; + + // TODO: Add some sort of `test` API function to the hooks that allows + // us to skip having to touch the `img` state at all for no-ops + + switch (hook->input) { + case PL_HOOK_SIG_NONE: + break; + + case PL_HOOK_SIG_TEX: { + hparams.tex = img_tex(pass, img); + if (!hparams.tex) { + PL_ERR(rr, "Failed dispatching shader prior to hook!"); + goto hook_error; + } + break; + } + + case PL_HOOK_SIG_COLOR: + hparams.sh = img_sh(pass, img); + break; + + case PL_HOOK_SIG_COUNT: + pl_unreachable(); + } + + struct pl_hook_res res = hook->hook(hook->priv, &hparams); + if (res.failed) { + PL_ERR(rr, "Failed executing hook, disabling"); + goto hook_error; + } + + bool resizable = pl_hook_stage_resizable(stage); + switch (res.output) { + case PL_HOOK_SIG_NONE: + break; + + case PL_HOOK_SIG_TEX: + if (!resizable) { + if (res.tex->params.w != img->w || + res.tex->params.h != img->h || + !pl_rect2d_eq(res.rect, img->rect)) + { + PL_ERR(rr, "User hook tried resizing non-resizable stage!"); + goto hook_error; + } + } + + *img = (struct img) { + .tex = res.tex, + .repr = res.repr, + .color = res.color, + .comps = res.components, + .rect = res.rect, + .w = res.tex->params.w, + .h = res.tex->params.h, + .unique = img->unique, + }; + break; + + case PL_HOOK_SIG_COLOR: + if (!resizable) { + if (res.sh->output_w != img->w 
|| + res.sh->output_h != img->h || + !pl_rect2d_eq(res.rect, img->rect)) + { + PL_ERR(rr, "User hook tried resizing non-resizable stage!"); + goto hook_error; + } + } + + *img = (struct img) { + .sh = res.sh, + .repr = res.repr, + .color = res.color, + .comps = res.components, + .rect = res.rect, + .w = res.sh->output_w, + .h = res.sh->output_h, + .unique = img->unique, + .err_enum = PL_RENDER_ERR_HOOKS, + .err_msg = "Failed applying user hook", + .err_tex = hparams.tex, // if any + }; + break; + + case PL_HOOK_SIG_COUNT: + pl_unreachable(); + } + + // a hook was performed successfully + ret = true; + +hook_skip: + continue; +hook_error: + PL_ARRAY_APPEND(rr, rr->disabled_hooks, hook->signature); + rr->errors |= PL_RENDER_ERR_HOOKS; + } + + // Make sure the state remains as valid as possible, even if the resulting + // shaders might end up nonsensical, to prevent segfaults + if (!img->tex && !img->sh) + img->sh = pl_dispatch_begin(rr->dp); + return ret; +} + +static void hdr_update_peak(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + if (!params->peak_detect_params || !pl_color_space_is_hdr(&pass->img.color)) + goto cleanup; + + if (rr->errors & PL_RENDER_ERR_PEAK_DETECT) + goto cleanup; + + if (pass->fbofmt[4] && !(pass->fbofmt[4]->caps & PL_FMT_CAP_STORABLE)) + goto cleanup; + + if (!rr->gpu->limits.max_ssbo_size) + goto cleanup; + + float max_peak = pl_color_transfer_nominal_peak(pass->img.color.transfer) * + PL_COLOR_SDR_WHITE; + if (pass->img.color.transfer == PL_COLOR_TRC_HLG) + max_peak = pass->img.color.hdr.max_luma; + if (max_peak <= pass->target.color.hdr.max_luma + 1e-6) + goto cleanup; // no adaptation needed + + if (pass->img.color.hdr.avg_pq_y) + goto cleanup; // DV metadata already present + + enum pl_hdr_metadata_type metadata = PL_HDR_METADATA_ANY; + if (params->color_map_params) + metadata = params->color_map_params->metadata; + + if (metadata && metadata != PL_HDR_METADATA_CIE_Y) + goto cleanup; // metadata will be unused + + const struct pl_color_map_params *cpars = params->color_map_params; + bool uses_ootf = cpars && cpars->tone_mapping_function == &pl_tone_map_st2094_40; + if (uses_ootf && pass->img.color.hdr.ootf.num_anchors) + goto cleanup; // HDR10+ OOTF is being used + + if (params->lut && params->lut_type == PL_LUT_CONVERSION) + goto cleanup; // LUT handles tone mapping + + if (!pass->fbofmt[4] && !params->peak_detect_params->allow_delayed) { + PL_WARN(rr, "Disabling peak detection because " + "`pl_peak_detect_params.allow_delayed` is false, but lack of " + "FBOs forces the result to be delayed."); + rr->errors |= PL_RENDER_ERR_PEAK_DETECT; + goto cleanup; + } + + bool ok = pl_shader_detect_peak(img_sh(pass, &pass->img), pass->img.color, + &rr->tone_map_state, params->peak_detect_params); + if (!ok) { + PL_WARN(rr, "Failed creating HDR peak detection shader.. 
disabling"); + rr->errors |= PL_RENDER_ERR_PEAK_DETECT; + goto cleanup; + } + + pass->need_peak_fbo = !params->peak_detect_params->allow_delayed; + return; + +cleanup: + // No peak detection required or supported, so clean up the state to avoid + // confusing it with later frames where peak detection is enabled again + pl_reset_detected_peak(rr->tone_map_state); +} + +bool pl_renderer_get_hdr_metadata(pl_renderer rr, + struct pl_hdr_metadata *metadata) +{ + return pl_get_detected_hdr_metadata(rr->tone_map_state, metadata); +} + +struct plane_state { + enum plane_type type; + struct pl_plane plane; + struct img img; // for per-plane shaders + float plane_w, plane_h; // logical plane dimensions +}; + +static const char *plane_type_names[] = { + [PLANE_INVALID] = "invalid", + [PLANE_ALPHA] = "alpha", + [PLANE_CHROMA] = "chroma", + [PLANE_LUMA] = "luma", + [PLANE_RGB] = "rgb", + [PLANE_XYZ] = "xyz", +}; + +static void log_plane_info(pl_renderer rr, const struct plane_state *st) +{ + const struct pl_plane *plane = &st->plane; + PL_TRACE(rr, " Type: %s", plane_type_names[st->type]); + + switch (plane->components) { + case 0: + PL_TRACE(rr, " Components: (none)"); + break; + case 1: + PL_TRACE(rr, " Components: {%d}", + plane->component_mapping[0]); + break; + case 2: + PL_TRACE(rr, " Components: {%d %d}", + plane->component_mapping[0], + plane->component_mapping[1]); + break; + case 3: + PL_TRACE(rr, " Components: {%d %d %d}", + plane->component_mapping[0], + plane->component_mapping[1], + plane->component_mapping[2]); + break; + case 4: + PL_TRACE(rr, " Components: {%d %d %d %d}", + plane->component_mapping[0], + plane->component_mapping[1], + plane->component_mapping[2], + plane->component_mapping[3]); + break; + } + + PL_TRACE(rr, " Rect: {%f %f} -> {%f %f}", + st->img.rect.x0, st->img.rect.y0, st->img.rect.x1, st->img.rect.y1); + + PL_TRACE(rr, " Bits: %d (used) / %d (sampled), shift %d", + st->img.repr.bits.color_depth, + st->img.repr.bits.sample_depth, + st->img.repr.bits.bit_shift); +} + +// Returns true if debanding was applied +static bool plane_deband(struct pass_state *pass, struct img *img, float neutral[3]) +{ + const struct pl_render_params *params = pass->params; + const struct pl_frame *image = &pass->image; + pl_renderer rr = pass->rr; + if ((rr->errors & PL_RENDER_ERR_DEBANDING) || + !params->deband_params || !pass->fbofmt[4]) + { + return false; + } + + struct pl_color_repr repr = img->repr; + struct pl_sample_src src = { + .tex = img_tex(pass, img), + .components = img->comps, + .scale = pl_color_repr_normalize(&repr), + }; + + if (!(src.tex->params.format->caps & PL_FMT_CAP_LINEAR)) { + PL_WARN(rr, "Debanding requires uploaded textures to be linearly " + "sampleable (params.sample_mode = PL_TEX_SAMPLE_LINEAR)! " + "Disabling debanding.."); + rr->errors |= PL_RENDER_ERR_DEBANDING; + return false; + } + + // Divide the deband grain scale by the effective current colorspace nominal + // peak, to make sure the output intensity of the grain is as independent + // of the source as possible, even though it happens this early in the + // process (well before any linearization / output adaptation) + struct pl_deband_params dparams = *params->deband_params; + dparams.grain /= image->color.hdr.max_luma / PL_COLOR_SDR_WHITE; + memcpy(dparams.grain_neutral, neutral, sizeof(dparams.grain_neutral)); + + img->tex = NULL; + img->sh = pl_dispatch_begin_ex(rr->dp, true); + pl_shader_deband(img->sh, &src, &dparams); + img->err_msg = "Failed applying debanding... 
disabling!"; + img->err_enum = PL_RENDER_ERR_DEBANDING; + img->err_tex = src.tex; + img->repr = repr; + return true; +} + +// Returns true if grain was applied +static bool plane_film_grain(struct pass_state *pass, int plane_idx, + struct plane_state *st, + const struct plane_state *ref) +{ + const struct pl_frame *image = &pass->image; + pl_renderer rr = pass->rr; + if (rr->errors & PL_RENDER_ERR_FILM_GRAIN) + return false; + + struct img *img = &st->img; + struct pl_plane *plane = &st->plane; + struct pl_color_repr repr = image->repr; + bool is_orig_repr = pl_color_repr_equal(&st->img.repr, &image->repr); + if (!is_orig_repr) { + // Propagate the original color depth to the film grain algorithm, but + // update the sample depth and effective bit shift based on the state + // of the current texture, which is guaranteed to already be + // normalized. + pl_assert(st->img.repr.bits.bit_shift == 0); + repr.bits.sample_depth = st->img.repr.bits.sample_depth; + repr.bits.bit_shift = repr.bits.sample_depth - repr.bits.color_depth; + } + + struct pl_film_grain_params grain_params = { + .data = image->film_grain, + .luma_tex = ref->plane.texture, + .repr = &repr, + .components = plane->components, + }; + + switch (image->film_grain.type) { + case PL_FILM_GRAIN_NONE: return false; + case PL_FILM_GRAIN_H274: break; + case PL_FILM_GRAIN_AV1: + grain_params.luma_tex = ref->plane.texture; + for (int c = 0; c < ref->plane.components; c++) { + if (ref->plane.component_mapping[c] == PL_CHANNEL_Y) + grain_params.luma_comp = c; + } + break; + default: pl_unreachable(); + } + + for (int c = 0; c < plane->components; c++) + grain_params.component_mapping[c] = plane->component_mapping[c]; + + if (!pl_needs_film_grain(&grain_params)) + return false; + + if (!pass->fbofmt[plane->components]) { + PL_ERR(rr, "Film grain required but no renderable format available.. " + "disabling!"); + rr->errors |= PL_RENDER_ERR_FILM_GRAIN; + return false; + } + + grain_params.tex = img_tex(pass, img); + if (!grain_params.tex) + return false; + + img->sh = pl_dispatch_begin_ex(rr->dp, true); + if (!pl_shader_film_grain(img->sh, &rr->grain_state[plane_idx], &grain_params)) { + pl_dispatch_abort(rr->dp, &img->sh); + rr->errors |= PL_RENDER_ERR_FILM_GRAIN; + return false; + } + + img->tex = NULL; + img->err_msg = "Failed applying film grain.. 
disabling!"; + img->err_enum = PL_RENDER_ERR_FILM_GRAIN; + img->err_tex = grain_params.tex; + if (is_orig_repr) + img->repr = repr; + return true; +} + +static const enum pl_hook_stage plane_hook_stages[] = { + [PLANE_ALPHA] = PL_HOOK_ALPHA_INPUT, + [PLANE_CHROMA] = PL_HOOK_CHROMA_INPUT, + [PLANE_LUMA] = PL_HOOK_LUMA_INPUT, + [PLANE_RGB] = PL_HOOK_RGB_INPUT, + [PLANE_XYZ] = PL_HOOK_XYZ_INPUT, +}; + +static const enum pl_hook_stage plane_scaled_hook_stages[] = { + [PLANE_ALPHA] = PL_HOOK_ALPHA_SCALED, + [PLANE_CHROMA] = PL_HOOK_CHROMA_SCALED, + [PLANE_LUMA] = 0, // never hooked + [PLANE_RGB] = 0, + [PLANE_XYZ] = 0, +}; + +static enum pl_lut_type guess_frame_lut_type(const struct pl_frame *frame, + bool reversed) +{ + if (!frame->lut) + return PL_LUT_UNKNOWN; + if (frame->lut_type) + return frame->lut_type; + + enum pl_color_system sys_in = frame->lut->repr_in.sys; + enum pl_color_system sys_out = frame->lut->repr_out.sys; + if (reversed) + PL_SWAP(sys_in, sys_out); + + if (sys_in == PL_COLOR_SYSTEM_RGB && sys_out == sys_in) + return PL_LUT_NORMALIZED; + + if (sys_in == frame->repr.sys && sys_out == PL_COLOR_SYSTEM_RGB) + return PL_LUT_CONVERSION; + + // Unknown, just fall back to the default + return PL_LUT_NATIVE; +} + +static pl_fmt merge_fmt(struct pass_state *pass, const struct img *a, + const struct img *b) +{ + pl_renderer rr = pass->rr; + pl_fmt fmta = a->tex ? a->tex->params.format : PL_DEF(a->fmt, pass->fbofmt[a->comps]); + pl_fmt fmtb = b->tex ? b->tex->params.format : PL_DEF(b->fmt, pass->fbofmt[b->comps]); + pl_assert(fmta && fmtb); + if (fmta->type != fmtb->type) + return NULL; + + int num_comps = PL_MIN(4, a->comps + b->comps); + int min_depth = PL_MAX(a->repr.bits.sample_depth, b->repr.bits.sample_depth); + + // Only return formats that support all relevant caps of both formats + const enum pl_fmt_caps mask = PL_FMT_CAP_SAMPLEABLE | PL_FMT_CAP_LINEAR; + enum pl_fmt_caps req_caps = (fmta->caps & mask) | (fmtb->caps & mask); + + return pl_find_fmt(rr->gpu, fmta->type, num_comps, min_depth, 0, req_caps); +} + +// Applies a series of rough heuristics to figure out whether we expect any +// performance gains from plane merging. 
This is basically a series of checks +// for operations that we *know* benefit from merged planes +static bool want_merge(struct pass_state *pass, + const struct plane_state *st, + const struct plane_state *ref) +{ + const struct pl_render_params *params = pass->params; + const pl_renderer rr = pass->rr; + if (!pass->fbofmt[4]) + return false; + + // Debanding + if (!(rr->errors & PL_RENDER_ERR_DEBANDING) && params->deband_params) + return true; + + // Other plane hooks, which are generally nontrivial + enum pl_hook_stage stage = plane_hook_stages[st->type]; + for (int i = 0; i < params->num_hooks; i++) { + if (params->hooks[i]->stages & stage) + return true; + } + + // Non-trivial scaling + struct pl_sample_src src = { + .new_w = ref->img.w, + .new_h = ref->img.h, + .rect = { + .x1 = st->img.w, + .y1 = st->img.h, + }, + }; + + struct sampler_info info = sample_src_info(pass, &src, SAMPLER_PLANE); + if (info.type == SAMPLER_COMPLEX) + return true; + + // Film grain synthesis, can be merged for compatible channels, saving on + // redundant sampling of the grain/offset textures + struct pl_film_grain_params grain_params = { + .data = pass->image.film_grain, + .repr = (struct pl_color_repr *) &st->img.repr, + .components = st->plane.components, + }; + + for (int c = 0; c < st->plane.components; c++) + grain_params.component_mapping[c] = st->plane.component_mapping[c]; + + if (!(rr->errors & PL_RENDER_ERR_FILM_GRAIN) && + pl_needs_film_grain(&grain_params)) + { + return true; + } + + return false; +} + +// This scales and merges all of the source images, and initializes pass->img. +static bool pass_read_image(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + struct pl_frame *image = &pass->image; + pl_renderer rr = pass->rr; + + struct plane_state planes[4]; + struct plane_state *ref = &planes[pass->src_ref]; + pl_assert(pass->src_ref >= 0 && pass->src_ref < image->num_planes); + + for (int i = 0; i < image->num_planes; i++) { + planes[i] = (struct plane_state) { + .type = detect_plane_type(&image->planes[i], &image->repr), + .plane = image->planes[i], + .img = { + .w = image->planes[i].texture->params.w, + .h = image->planes[i].texture->params.h, + .tex = image->planes[i].texture, + .repr = image->repr, + .color = image->color, + .comps = image->planes[i].components, + }, + }; + + // Deinterlace plane if needed + if (image->field != PL_FIELD_NONE && params->deinterlace_params && + pass->fbofmt[4] && !(rr->errors & PL_RENDER_ERR_DEINTERLACING)) + { + struct img *img = &planes[i].img; + struct pl_deinterlace_source src = { + .cur.top = img->tex, + .prev.top = image->prev ? image->prev->planes[i].texture : NULL, + .next.top = image->next ? image->next->planes[i].texture : NULL, + .field = image->field, + .first_field = image->first_field, + .component_mask = (1 << img->comps) - 1, + }; + + img->tex = NULL; + img->sh = pl_dispatch_begin_ex(pass->rr->dp, true); + pl_shader_deinterlace(img->sh, &src, params->deinterlace_params); + img->err_msg = "Failed deinterlacing plane.. 
disabling!"; + img->err_enum = PL_RENDER_ERR_DEINTERLACING; + img->err_tex = planes[i].plane.texture; + } + } + + // Original ref texture, even after preprocessing + pl_tex ref_tex = ref->plane.texture; + + // Merge all compatible planes into 'combined' shaders + for (int i = 0; i < image->num_planes; i++) { + struct plane_state *sti = &planes[i]; + if (!sti->type) + continue; + if (!want_merge(pass, sti, ref)) + continue; + + bool did_merge = false; + for (int j = i+1; j < image->num_planes; j++) { + struct plane_state *stj = &planes[j]; + bool merge = sti->type == stj->type && + sti->img.w == stj->img.w && + sti->img.h == stj->img.h && + sti->plane.shift_x == stj->plane.shift_x && + sti->plane.shift_y == stj->plane.shift_y; + if (!merge) + continue; + + pl_fmt fmt = merge_fmt(pass, &sti->img, &stj->img); + if (!fmt) + continue; + + PL_TRACE(rr, "Merging plane %d into plane %d", j, i); + pl_shader sh = sti->img.sh; + if (!sh) { + sh = sti->img.sh = pl_dispatch_begin_ex(pass->rr->dp, true); + pl_shader_sample_direct(sh, pl_sample_src( .tex = sti->img.tex )); + sti->img.tex = NULL; + } + + pl_shader psh = NULL; + if (!stj->img.sh) { + psh = pl_dispatch_begin_ex(pass->rr->dp, true); + pl_shader_sample_direct(psh, pl_sample_src( .tex = stj->img.tex )); + } + + ident_t sub = sh_subpass(sh, psh ? psh : stj->img.sh); + pl_dispatch_abort(rr->dp, &psh); + if (!sub) + break; // skip merging + + sh_describe(sh, "merging planes"); + GLSL("{ \n" + "vec4 tmp = "$"(); \n", sub); + for (int jc = 0; jc < stj->img.comps; jc++) { + int map = stj->plane.component_mapping[jc]; + if (map == PL_CHANNEL_NONE) + continue; + int ic = sti->img.comps++; + pl_assert(ic < 4); + GLSL("color[%d] = tmp[%d]; \n", ic, jc); + sti->plane.components = sti->img.comps; + sti->plane.component_mapping[ic] = map; + } + GLSL("} \n"); + + sti->img.fmt = fmt; + pl_dispatch_abort(rr->dp, &stj->img.sh); + *stj = (struct plane_state) {0}; + did_merge = true; + } + + if (!did_merge) + continue; + + if (!img_tex(pass, &sti->img)) { + PL_ERR(rr, "Failed dispatching plane merging shader, disabling FBOs!"); + memset(pass->fbofmt, 0, sizeof(pass->fbofmt)); + rr->errors |= PL_RENDER_ERR_FBO; + return false; + } + } + + int bits = image->repr.bits.sample_depth; + float out_scale = bits ? (1llu << bits) / ((1llu << bits) - 1.0f) : 1.0f; + float neutral_luma = 0.0, neutral_chroma = 0.5f * out_scale; + if (pl_color_levels_guess(&image->repr) == PL_COLOR_LEVELS_LIMITED) + neutral_luma = 16 / 256.0f * out_scale; + if (!pl_color_system_is_ycbcr_like(image->repr.sys)) + neutral_chroma = neutral_luma; + + // Compute the sampling rc of each plane + for (int i = 0; i < image->num_planes; i++) { + struct plane_state *st = &planes[i]; + if (!st->type) + continue; + + float rx = (float) st->plane.texture->params.w / ref_tex->params.w, + ry = (float) st->plane.texture->params.h / ref_tex->params.h; + + // Only accept integer scaling ratios. This accounts for the fact that + // fractionally subsampled planes get rounded up to the nearest integer + // size, which we want to discard. + float rrx = rx >= 1 ? roundf(rx) : 1.0 / roundf(1.0 / rx), + rry = ry >= 1 ? 
roundf(ry) : 1.0 / roundf(1.0 / ry); + + float sx = st->plane.shift_x, + sy = st->plane.shift_y; + + st->img.rect = (pl_rect2df) { + .x0 = (image->crop.x0 - sx) * rrx, + .y0 = (image->crop.y0 - sy) * rry, + .x1 = (image->crop.x1 - sx) * rrx, + .y1 = (image->crop.y1 - sy) * rry, + }; + + st->plane_w = ref_tex->params.w * rrx; + st->plane_h = ref_tex->params.h * rry; + + PL_TRACE(rr, "Plane %d:", i); + log_plane_info(rr, st); + + float neutral[3] = {0.0}; + for (int c = 0, idx = 0; c < st->plane.components; c++) { + switch (st->plane.component_mapping[c]) { + case PL_CHANNEL_Y: neutral[idx++] = neutral_luma; break; + case PL_CHANNEL_U: // fall through + case PL_CHANNEL_V: neutral[idx++] = neutral_chroma; break; + } + } + + // The order of operations (deband -> film grain -> user hooks) is + // chosen to maximize quality. Note that film grain requires unmodified + // plane sizes, so it has to be before user hooks. As for debanding, + // it's reduced in quality after e.g. plane scalers as well. It's also + // made less effective by performing film grain synthesis first. + + if (plane_deband(pass, &st->img, neutral)) { + PL_TRACE(rr, "After debanding:"); + log_plane_info(rr, st); + } + + if (plane_film_grain(pass, i, st, ref)) { + PL_TRACE(rr, "After film grain:"); + log_plane_info(rr, st); + } + + if (pass_hook(pass, &st->img, plane_hook_stages[st->type])) { + PL_TRACE(rr, "After user hooks:"); + log_plane_info(rr, st); + } + } + + pl_shader sh = pl_dispatch_begin_ex(rr->dp, true); + sh_require(sh, PL_SHADER_SIG_NONE, 0, 0); + + // Initialize the color to black + GLSL("vec4 color = vec4("$", vec2("$"), 1.0); \n" + "// pass_read_image \n" + "{ \n" + "vec4 tmp; \n", + SH_FLOAT(neutral_luma), SH_FLOAT(neutral_chroma)); + + // For quality reasons, explicitly drop subpixel offsets from the ref rect + // and re-add them as part of `pass->img.rect`, always rounding towards 0. + // Additionally, drop anamorphic subpixel mismatches. 
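To make the rounding described above concrete, here is a small worked example with hypothetical numbers (not taken from any real frame):

    // ref->img.rect = {12.3, 8.0, 112.7, 58.0}  =>  width 100.4, height 50.0
    // ref_rounded   = {12,   8,   112,   58}     (truncate origin, round size)
    // off_x = 12.3 - 12 = 0.3,  stretch_x = 100 / 100.4 ~ 0.996
    // The dropped 0.3px offset and ~0.4% stretch are re-applied per plane via
    // each plane's src.rect below, so no subpixel information is lost; it is
    // merely moved out of the integer reference rect.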
+ pl_rect2d ref_rounded; + ref_rounded.x0 = truncf(ref->img.rect.x0); + ref_rounded.y0 = truncf(ref->img.rect.y0); + ref_rounded.x1 = ref_rounded.x0 + roundf(pl_rect_w(ref->img.rect)); + ref_rounded.y1 = ref_rounded.y0 + roundf(pl_rect_h(ref->img.rect)); + + PL_TRACE(rr, "Rounded reference rect: {%d %d %d %d}", + ref_rounded.x0, ref_rounded.y0, + ref_rounded.x1, ref_rounded.y1); + + float off_x = ref->img.rect.x0 - ref_rounded.x0, + off_y = ref->img.rect.y0 - ref_rounded.y0, + stretch_x = pl_rect_w(ref_rounded) / pl_rect_w(ref->img.rect), + stretch_y = pl_rect_h(ref_rounded) / pl_rect_h(ref->img.rect); + + for (int i = 0; i < image->num_planes; i++) { + struct plane_state *st = &planes[i]; + const struct pl_plane *plane = &st->plane; + if (!st->type) + continue; + + float scale_x = pl_rect_w(st->img.rect) / pl_rect_w(ref->img.rect), + scale_y = pl_rect_h(st->img.rect) / pl_rect_h(ref->img.rect), + base_x = st->img.rect.x0 - scale_x * off_x, + base_y = st->img.rect.y0 - scale_y * off_y; + + struct pl_sample_src src = { + .components = plane->components, + .address_mode = plane->address_mode, + .scale = pl_color_repr_normalize(&st->img.repr), + .new_w = pl_rect_w(ref_rounded), + .new_h = pl_rect_h(ref_rounded), + .rect = { + base_x, + base_y, + base_x + stretch_x * pl_rect_w(st->img.rect), + base_y + stretch_y * pl_rect_h(st->img.rect), + }, + }; + + if (plane->flipped) { + src.rect.y0 = st->plane_h - src.rect.y0; + src.rect.y1 = st->plane_h - src.rect.y1; + } + + PL_TRACE(rr, "Aligning plane %d: {%f %f %f %f} -> {%f %f %f %f}%s", + i, st->img.rect.x0, st->img.rect.y0, + st->img.rect.x1, st->img.rect.y1, + src.rect.x0, src.rect.y0, + src.rect.x1, src.rect.y1, + plane->flipped ? " (flipped) " : ""); + + st->img.unique = true; + pl_rect2d unscaled = { .x1 = src.new_w, .y1 = src.new_h }; + if (st->img.sh && st->img.w == src.new_w && st->img.h == src.new_h && + pl_rect2d_eq(src.rect, unscaled)) + { + // Image rects are already equal, no indirect scaling needed + } else { + src.tex = img_tex(pass, &st->img); + st->img.tex = NULL; + st->img.sh = pl_dispatch_begin_ex(rr->dp, true); + dispatch_sampler(pass, st->img.sh, &rr->samplers_src[i], + SAMPLER_PLANE, NULL, &src); + st->img.err_enum |= PL_RENDER_ERR_SAMPLING; + st->img.rect.x0 = st->img.rect.y0 = 0.0f; + st->img.w = st->img.rect.x1 = src.new_w; + st->img.h = st->img.rect.y1 = src.new_h; + } + + pass_hook(pass, &st->img, plane_scaled_hook_stages[st->type]); + ident_t sub = sh_subpass(sh, img_sh(pass, &st->img)); + if (!sub) { + if (!img_tex(pass, &st->img)) { + pl_dispatch_abort(rr->dp, &sh); + return false; + } + + sub = sh_subpass(sh, img_sh(pass, &st->img)); + pl_assert(sub); + } + + GLSL("tmp = "$"(); \n", sub); + for (int c = 0; c < src.components; c++) { + if (plane->component_mapping[c] < 0) + continue; + GLSL("color[%d] = tmp[%d];\n", plane->component_mapping[c], c); + } + + // we don't need it anymore + pl_dispatch_abort(rr->dp, &st->img.sh); + } + + GLSL("}\n"); + + pass->img = (struct img) { + .sh = sh, + .w = pl_rect_w(ref_rounded), + .h = pl_rect_h(ref_rounded), + .repr = ref->img.repr, + .color = image->color, + .comps = ref->img.repr.alpha ? 
4 : 3, + .rect = { + off_x, + off_y, + off_x + pl_rect_w(ref->img.rect), + off_y + pl_rect_h(ref->img.rect), + }, + }; + + // Update the reference rect to our adjusted image coordinates + pass->ref_rect = pass->img.rect; + + pass_hook(pass, &pass->img, PL_HOOK_NATIVE); + + // Apply LUT logic and colorspace conversion + enum pl_lut_type lut_type = guess_frame_lut_type(image, false); + sh = img_sh(pass, &pass->img); + bool needs_conversion = true; + + if (lut_type == PL_LUT_NATIVE || lut_type == PL_LUT_CONVERSION) { + // Fix bit depth normalization before applying LUT + float scale = pl_color_repr_normalize(&pass->img.repr); + GLSL("color *= vec4("$"); \n", SH_FLOAT(scale)); + pl_shader_set_alpha(sh, &pass->img.repr, PL_ALPHA_INDEPENDENT); + pl_shader_custom_lut(sh, image->lut, &rr->lut_state[LUT_IMAGE]); + + if (lut_type == PL_LUT_CONVERSION) { + pass->img.repr.sys = PL_COLOR_SYSTEM_RGB; + pass->img.repr.levels = PL_COLOR_LEVELS_FULL; + needs_conversion = false; + } + } + + if (needs_conversion) { + if (pass->img.repr.sys == PL_COLOR_SYSTEM_XYZ) + pass->img.color.transfer = PL_COLOR_TRC_LINEAR; + pl_shader_decode_color(sh, &pass->img.repr, params->color_adjustment); + } + + if (lut_type == PL_LUT_NORMALIZED) + pl_shader_custom_lut(sh, image->lut, &rr->lut_state[LUT_IMAGE]); + + // A main PL_LUT_CONVERSION LUT overrides ICC profiles + bool main_lut_override = params->lut && params->lut_type == PL_LUT_CONVERSION; + if (image->icc && !main_lut_override) { + pl_shader_set_alpha(sh, &pass->img.repr, PL_ALPHA_INDEPENDENT); + pl_icc_decode(sh, image->icc, &rr->icc_state[ICC_IMAGE], &pass->img.color); + } + + // Pre-multiply alpha channel before the rest of the pipeline, to avoid + // bleeding colors from transparent regions into non-transparent regions + pl_shader_set_alpha(sh, &pass->img.repr, PL_ALPHA_PREMULTIPLIED); + + pass_hook(pass, &pass->img, PL_HOOK_RGB); + sh = NULL; + return true; +} + +static bool pass_scale_main(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + + pl_fmt fbofmt = pass->fbofmt[pass->img.comps]; + if (!fbofmt) { + PL_TRACE(rr, "Skipping main scaler (no FBOs)"); + return true; + } + + const pl_rect2df new_rect = { + .x1 = abs(pl_rect_w(pass->dst_rect)), + .y1 = abs(pl_rect_h(pass->dst_rect)), + }; + + struct img *img = &pass->img; + struct pl_sample_src src = { + .components = img->comps, + .new_w = pl_rect_w(new_rect), + .new_h = pl_rect_h(new_rect), + .rect = img->rect, + }; + + const struct pl_frame *image = &pass->image; + bool need_fbo = false; + + // Force FBO indirection if this shader is non-resizable + int out_w, out_h; + if (img->sh && pl_shader_output_size(img->sh, &out_w, &out_h)) + need_fbo |= out_w != src.new_w || out_h != src.new_h; + + struct sampler_info info = sample_src_info(pass, &src, SAMPLER_MAIN); + bool use_sigmoid = info.dir == SAMPLER_UP && params->sigmoid_params; + bool use_linear = info.dir == SAMPLER_DOWN; + + // Opportunistically update peak here if it would save performance + if (info.dir == SAMPLER_UP) + hdr_update_peak(pass); + + // We need to enable the full rendering pipeline if there are any user + // shaders / hooks that might depend on it. 
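For context on what can trigger this, the following is a minimal, hypothetical sketch of a user hook; only the `stages` field is shown and the remaining pl_hook fields are omitted, so treat it as an illustration rather than code from libplacebo itself:

    // A hook that wants to run on linear-light RGB. Because PL_HOOK_LINEAR is
    // part of `linear_hooks`, the loop below sets need_fbo and forces
    // use_linear, so the main scaler runs in linear light even when the
    // scaling itself would otherwise be trivial.
    static const struct pl_hook my_linear_hook = {
        .stages = PL_HOOK_LINEAR,
        /* .hook, .priv, .reset, ... omitted in this sketch */
    };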
+ uint64_t scaling_hooks = PL_HOOK_PRE_KERNEL | PL_HOOK_POST_KERNEL; + uint64_t linear_hooks = PL_HOOK_LINEAR | PL_HOOK_SIGMOID; + + for (int i = 0; i < params->num_hooks; i++) { + if (params->hooks[i]->stages & (scaling_hooks | linear_hooks)) { + need_fbo = true; + if (params->hooks[i]->stages & linear_hooks) + use_linear = true; + if (params->hooks[i]->stages & PL_HOOK_SIGMOID) + use_sigmoid = true; + } + } + + if (info.dir == SAMPLER_NOOP && !need_fbo) { + pl_assert(src.new_w == img->w && src.new_h == img->h); + PL_TRACE(rr, "Skipping main scaler (would be no-op)"); + goto done; + } + + if (info.type == SAMPLER_DIRECT && !need_fbo) { + img->w = src.new_w; + img->h = src.new_h; + img->rect = new_rect; + PL_TRACE(rr, "Skipping main scaler (free sampling)"); + goto done; + } + + // Hard-disable both sigmoidization and linearization when required + if (params->disable_linear_scaling || fbofmt->component_depth[0] < 16) + use_sigmoid = use_linear = false; + + // Avoid sigmoidization for HDR content because it clips to [0,1], and + // linearization because it causes very nasty ringing artefacts. + if (pl_color_space_is_hdr(&img->color)) + use_sigmoid = use_linear = false; + + if (!(use_linear || use_sigmoid) && img->color.transfer == PL_COLOR_TRC_LINEAR) { + img->color.transfer = image->color.transfer; + if (image->color.transfer == PL_COLOR_TRC_LINEAR) + img->color.transfer = PL_COLOR_TRC_GAMMA22; // arbitrary fallback + pl_shader_delinearize(img_sh(pass, img), &img->color); + } + + if (use_linear || use_sigmoid) { + pl_shader_linearize(img_sh(pass, img), &img->color); + img->color.transfer = PL_COLOR_TRC_LINEAR; + pass_hook(pass, img, PL_HOOK_LINEAR); + } + + if (use_sigmoid) { + pl_shader_sigmoidize(img_sh(pass, img), params->sigmoid_params); + pass_hook(pass, img, PL_HOOK_SIGMOID); + } + + pass_hook(pass, img, PL_HOOK_PRE_KERNEL); + + src.tex = img_tex(pass, img); + if (!src.tex) + return false; + pass->need_peak_fbo = false; + + pl_shader sh = pl_dispatch_begin_ex(rr->dp, true); + dispatch_sampler(pass, sh, &rr->sampler_main, SAMPLER_MAIN, NULL, &src); + img->tex = NULL; + img->sh = sh; + img->w = src.new_w; + img->h = src.new_h; + img->rect = new_rect; + + pass_hook(pass, img, PL_HOOK_POST_KERNEL); + + if (use_sigmoid) + pl_shader_unsigmoidize(img_sh(pass, img), params->sigmoid_params); + +done: + if (info.dir != SAMPLER_UP) + hdr_update_peak(pass); + pass_hook(pass, img, PL_HOOK_SCALED); + return true; +} + +static pl_tex get_feature_map(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + const struct pl_color_map_params *cparams = params->color_map_params; + cparams = PL_DEF(cparams, &pl_color_map_default_params); + if (!cparams->contrast_recovery || cparams->contrast_smoothness <= 1) + return NULL; + if (!pass->fbofmt[4]) + return NULL; + if (!pl_color_space_is_hdr(&pass->img.color)) + return NULL; + if (rr->errors & (PL_RENDER_ERR_SAMPLING | PL_RENDER_ERR_CONTRAST_RECOVERY)) + return NULL; + if (pass->img.color.hdr.max_luma <= pass->target.color.hdr.max_luma + 1e-6) + return NULL; // no adaptation needed + if (params->lut && params->lut_type == PL_LUT_CONVERSION) + return NULL; // LUT handles tone mapping + + struct img *img = &pass->img; + if (!img_tex(pass, img)) + return NULL; + + const float ratio = cparams->contrast_smoothness; + const int cr_w = ceilf(abs(pl_rect_w(pass->dst_rect)) / ratio); + const int cr_h = ceilf(abs(pl_rect_h(pass->dst_rect)) / ratio); + pl_tex inter_tex = get_fbo(pass, img->w, img->h, NULL, 1, 
PL_DEBUG_TAG); + pl_tex out_tex = get_fbo(pass, cr_w, cr_h, NULL, 1, PL_DEBUG_TAG); + if (!inter_tex || !out_tex) + goto error; + + pl_shader sh = pl_dispatch_begin(rr->dp); + pl_shader_sample_direct(sh, pl_sample_src( .tex = img->tex )); + pl_shader_extract_features(sh, img->color); + bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = &sh, + .target = inter_tex, + )); + if (!ok) + goto error; + + const struct pl_sample_src src = { + .tex = inter_tex, + .rect = img->rect, + .address_mode = PL_TEX_ADDRESS_MIRROR, + .components = 1, + .new_w = cr_w, + .new_h = cr_h, + }; + + sh = pl_dispatch_begin(rr->dp); + dispatch_sampler(pass, sh, &rr->sampler_contrast, SAMPLER_CONTRAST, out_tex, &src); + ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = &sh, + .target = out_tex, + )); + if (!ok) + goto error; + + return out_tex; + +error: + PL_ERR(rr, "Failed extracting luma for contrast recovery, disabling"); + rr->errors |= PL_RENDER_ERR_CONTRAST_RECOVERY; + return NULL; +} + +// Transforms image into the output color space (tone-mapping, ICC 3DLUT, etc.) +static void pass_convert_colors(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + const struct pl_frame *image = &pass->image; + const struct pl_frame *target = &pass->target; + pl_renderer rr = pass->rr; + + struct img *img = &pass->img; + pl_shader sh = img_sh(pass, img); + + bool prelinearized = false; + bool need_conversion = true; + assert(image->color.primaries == img->color.primaries); + if (img->color.transfer == PL_COLOR_TRC_LINEAR) { + if (img->repr.alpha == PL_ALPHA_PREMULTIPLIED) { + // Very annoying edge case: since prelinearization happens with + // premultiplied alpha, but color mapping happens with independent + // alpha, we need to go back to non-linear representation *before* + // alpha mode conversion, to avoid distortion + img->color.transfer = image->color.transfer; + pl_shader_delinearize(sh, &img->color); + } else { + prelinearized = true; + } + } else if (img->color.transfer != image->color.transfer) { + if (image->color.transfer == PL_COLOR_TRC_LINEAR) { + // Another annoying edge case: if the input is linear light, but we + // decide to un-linearize it for scaling purposes, we need to + // re-linearize before passing it into `pl_shader_color_map` + pl_shader_linearize(sh, &img->color); + img->color.transfer = PL_COLOR_TRC_LINEAR; + } + } + + // Do all processing in independent alpha, to avoid nonlinear distortions + pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_INDEPENDENT); + + // Apply color blindness simulation if requested + if (params->cone_params) + pl_shader_cone_distort(sh, img->color, params->cone_params); + + if (params->lut) { + struct pl_color_space lut_in = params->lut->color_in; + struct pl_color_space lut_out = params->lut->color_out; + switch (params->lut_type) { + case PL_LUT_UNKNOWN: + case PL_LUT_NATIVE: + pl_color_space_merge(&lut_in, &image->color); + pl_color_space_merge(&lut_out, &image->color); + break; + case PL_LUT_CONVERSION: + pl_color_space_merge(&lut_in, &image->color); + need_conversion = false; // conversion LUT takes the highest priority + break; + case PL_LUT_NORMALIZED: + if (!prelinearized) { + // PL_LUT_NORMALIZED wants linear input data + pl_shader_linearize(sh, &img->color); + img->color.transfer = PL_COLOR_TRC_LINEAR; + prelinearized = true; + } + pl_color_space_merge(&lut_in, &img->color); + pl_color_space_merge(&lut_out, &img->color); + break; + } + + pl_shader_color_map_ex(sh, params->color_map_params, pl_color_map_args( 
.src = image->color, + .dst = lut_in, + .prelinearized = prelinearized, + )); + + if (params->lut_type == PL_LUT_NORMALIZED) { + GLSLF("color.rgb *= vec3(1.0/"$"); \n", + SH_FLOAT(pl_color_transfer_nominal_peak(lut_in.transfer))); + } + + pl_shader_custom_lut(sh, params->lut, &rr->lut_state[LUT_PARAMS]); + + if (params->lut_type == PL_LUT_NORMALIZED) { + GLSLF("color.rgb *= vec3("$"); \n", + SH_FLOAT(pl_color_transfer_nominal_peak(lut_out.transfer))); + } + + if (params->lut_type != PL_LUT_CONVERSION) { + pl_shader_color_map_ex(sh, params->color_map_params, pl_color_map_args( + .src = lut_out, + .dst = img->color, + )); + } + } + + if (need_conversion) { + struct pl_color_space target_csp = target->color; + if (target->icc) + target_csp.transfer = PL_COLOR_TRC_LINEAR; + + if (pass->need_peak_fbo && !img_tex(pass, img)) + return; + + // generate HDR feature map if required + pl_tex feature_map = get_feature_map(pass); + sh = img_sh(pass, img); // `get_feature_map` dispatches previous shader + + // current -> target + pl_shader_color_map_ex(sh, params->color_map_params, pl_color_map_args( + .src = image->color, + .dst = target_csp, + .prelinearized = prelinearized, + .state = &rr->tone_map_state, + .feature_map = feature_map, + )); + + if (target->icc) + pl_icc_encode(sh, target->icc, &rr->icc_state[ICC_TARGET]); + } + + enum pl_lut_type lut_type = guess_frame_lut_type(target, true); + if (lut_type == PL_LUT_NORMALIZED || lut_type == PL_LUT_CONVERSION) + pl_shader_custom_lut(sh, target->lut, &rr->lut_state[LUT_TARGET]); + + img->color = target->color; +} + +// Returns true if error diffusion was successfully performed +static bool pass_error_diffusion(struct pass_state *pass, pl_shader *sh, + int new_depth, int comps, int out_w, int out_h) +{ + const struct pl_render_params *params = pass->params; + pl_renderer rr = pass->rr; + if (!params->error_diffusion || (rr->errors & PL_RENDER_ERR_ERROR_DIFFUSION)) + return false; + + size_t shmem_req = pl_error_diffusion_shmem_req(params->error_diffusion, out_h); + if (shmem_req > rr->gpu->glsl.max_shmem_size) { + PL_TRACE(rr, "Disabling error diffusion due to shmem requirements (%zu) " + "exceeding capabilities (%zu)", shmem_req, rr->gpu->glsl.max_shmem_size); + return false; + } + + pl_fmt fmt = pass->fbofmt[comps]; + if (!fmt || !(fmt->caps & PL_FMT_CAP_STORABLE)) { + PL_ERR(rr, "Error diffusion requires storable FBOs but GPU does not " + "provide them... disabling!"); + goto error; + } + + struct pl_error_diffusion_params edpars = { + .new_depth = new_depth, + .kernel = params->error_diffusion, + }; + + // Create temporary framebuffers + edpars.input_tex = get_fbo(pass, out_w, out_h, fmt, comps, PL_DEBUG_TAG); + edpars.output_tex = get_fbo(pass, out_w, out_h, fmt, comps, PL_DEBUG_TAG); + if (!edpars.input_tex || !edpars.output_tex) + goto error; + + pl_shader dsh = pl_dispatch_begin(rr->dp); + if (!pl_shader_error_diffusion(dsh, &edpars)) { + pl_dispatch_abort(rr->dp, &dsh); + goto error; + } + + // Everything was okay, run the shaders + bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = sh, + .target = edpars.input_tex, + )); + + if (ok) { + ok = pl_dispatch_compute(rr->dp, pl_dispatch_compute_params( + .shader = &dsh, + .dispatch_size = {1, 1, 1}, + )); + } + + *sh = pl_dispatch_begin(rr->dp); + pl_shader_sample_direct(*sh, pl_sample_src( + .tex = ok ? 
edpars.output_tex : edpars.input_tex, + )); + return ok; + +error: + rr->errors |= PL_RENDER_ERR_ERROR_DIFFUSION; + return false; +} + +#define CLEAR_COL(params) \ + (float[4]) { \ + (params)->background_color[0], \ + (params)->background_color[1], \ + (params)->background_color[2], \ + 1.0 - (params)->background_transparency, \ + } + +static bool pass_output_target(struct pass_state *pass) +{ + const struct pl_render_params *params = pass->params; + const struct pl_frame *image = &pass->image; + const struct pl_frame *target = &pass->target; + pl_renderer rr = pass->rr; + + struct img *img = &pass->img; + pl_shader sh = img_sh(pass, img); + + if (params->corner_rounding > 0.0f) { + const float out_w2 = fabsf(pl_rect_w(target->crop)) / 2.0f; + const float out_h2 = fabsf(pl_rect_h(target->crop)) / 2.0f; + const float radius = fminf(params->corner_rounding, 1.0f) * + fminf(out_w2, out_h2); + const struct pl_rect2df relpos = { + .x0 = -out_w2, .y0 = -out_h2, + .x1 = out_w2, .y1 = out_h2, + }; + GLSL("float radius = "$"; \n" + "vec2 size2 = vec2("$", "$"); \n" + "vec2 relpos = "$"; \n" + "vec2 rd = abs(relpos) - size2 + vec2(radius); \n" + "float rdist = length(max(rd, 0.0)) - radius; \n" + "float border = smoothstep(2.0f, 0.0f, rdist); \n", + SH_FLOAT_DYN(radius), + SH_FLOAT_DYN(out_w2), SH_FLOAT_DYN(out_h2), + sh_attr_vec2(sh, "relpos", &relpos)); + + switch (img->repr.alpha) { + case PL_ALPHA_UNKNOWN: + GLSL("color.a = border; \n"); + img->repr.alpha = PL_ALPHA_INDEPENDENT; + img->comps = 4; + break; + case PL_ALPHA_INDEPENDENT: + GLSL("color.a *= border; \n"); + break; + case PL_ALPHA_PREMULTIPLIED: + GLSL("color *= border; \n"); + break; + case PL_ALPHA_MODE_COUNT: + pl_unreachable(); + } + } + + const struct pl_plane *ref = &target->planes[pass->dst_ref]; + pl_rect2d dst_rect = pass->dst_rect; + if (params->distort_params) { + struct pl_distort_params dpars = *params->distort_params; + if (dpars.alpha_mode) { + pl_shader_set_alpha(sh, &img->repr, dpars.alpha_mode); + img->repr.alpha = dpars.alpha_mode; + img->comps = 4; + } + pl_tex tex = img_tex(pass, img); + if (!tex) + return false; + // Expand canvas to fit result of distortion + const float ar = pl_rect2df_aspect(&target->crop); + const float sx = fminf(ar, 1.0f); + const float sy = fminf(1.0f / ar, 1.0f); + pl_rect2df bb = pl_transform2x2_bounds(&dpars.transform, &(pl_rect2df) { + .x0 = -sx, .x1 = sx, + .y0 = -sy, .y1 = sy, + }); + + // Clamp to output size and adjust as needed when constraining output + pl_rect2df tmp = target->crop; + pl_rect2df_stretch(&tmp, pl_rect_w(bb) / (2*sx), pl_rect_h(bb) / (2*sy)); + const float tmp_w = pl_rect_w(tmp), tmp_h = pl_rect_h(tmp); + int canvas_w = ref->texture->params.w, + canvas_h = ref->texture->params.h; + if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) + PL_SWAP(canvas_w, canvas_h); + tmp.x0 = PL_CLAMP(tmp.x0, 0.0f, canvas_w); + tmp.x1 = PL_CLAMP(tmp.x1, 0.0f, canvas_w); + tmp.y0 = PL_CLAMP(tmp.y0, 0.0f, canvas_h); + tmp.y1 = PL_CLAMP(tmp.y1, 0.0f, canvas_h); + if (dpars.constrain) { + const float rx = pl_rect_w(tmp) / tmp_w; + const float ry = pl_rect_h(tmp) / tmp_h; + pl_rect2df_stretch(&tmp, fminf(ry / rx, 1.0f), fminf(rx / ry, 1.0f)); + } + dst_rect.x0 = roundf(tmp.x0); + dst_rect.x1 = roundf(tmp.x1); + dst_rect.y0 = roundf(tmp.y0); + dst_rect.y1 = roundf(tmp.y1); + dpars.unscaled = true; + img->w = abs(pl_rect_w(dst_rect)); + img->h = abs(pl_rect_h(dst_rect)); + img->tex = NULL; + img->sh = sh = pl_dispatch_begin(rr->dp); + pl_shader_distort(sh, tex, img->w, img->h, &dpars); 
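As a rough sanity check on the canvas expansion above (hypothetical transform, illustrative numbers only):

    // For a square crop (ar = 1, hence sx = sy = 1) and a 45-degree rotation
    // in dpars.transform, pl_transform2x2_bounds maps the box [-1,1]^2 to
    // about [-1.414, 1.414]^2, so the crop is stretched by ~sqrt(2) per axis,
    // clamped to the canvas, and, if dpars.constrain is set, the less-clamped
    // axis is shrunk again so the expanded crop keeps its aspect ratio.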
+ } + + pass_hook(pass, img, PL_HOOK_PRE_OUTPUT); + + bool need_blend = params->blend_against_tiles || + (!target->repr.alpha && !params->blend_params); + if (img->comps == 4 && need_blend) { + if (params->blend_against_tiles) { + static const float zero[2][3] = {0}; + const float (*color)[3] = params->tile_colors; + if (memcmp(color, zero, sizeof(zero)) == 0) + color = pl_render_default_params.tile_colors; + int size = PL_DEF(params->tile_size, pl_render_default_params.tile_size); + GLSLH("#define bg_tile_a vec3("$", "$", "$") \n", + SH_FLOAT(color[0][0]), SH_FLOAT(color[0][1]), SH_FLOAT(color[0][2])); + GLSLH("#define bg_tile_b vec3("$", "$", "$") \n", + SH_FLOAT(color[1][0]), SH_FLOAT(color[1][1]), SH_FLOAT(color[1][2])); + GLSL("vec2 outcoord = gl_FragCoord.xy * "$"; \n" + "bvec2 tile = lessThan(fract(outcoord), vec2(0.5)); \n" + "vec3 bg_color = tile.x == tile.y ? bg_tile_a : bg_tile_b; \n", + SH_FLOAT(1.0 / size)); + } else { + GLSLH("#define bg_color vec3("$", "$", "$") \n", + SH_FLOAT(params->background_color[0]), + SH_FLOAT(params->background_color[1]), + SH_FLOAT(params->background_color[2])); + } + + pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_PREMULTIPLIED); + GLSL("color = vec4(color.rgb + bg_color * (1.0 - color.a), 1.0); \n"); + img->repr.alpha = PL_ALPHA_UNKNOWN; + img->comps = 3; + } + + // Apply the color scale separately, after encoding is done, to make sure + // that the intermediate FBO (if any) has the correct precision. + struct pl_color_repr repr = target->repr; + float scale = pl_color_repr_normalize(&repr); + enum pl_lut_type lut_type = guess_frame_lut_type(target, true); + if (lut_type != PL_LUT_CONVERSION) + pl_shader_encode_color(sh, &repr); + if (lut_type == PL_LUT_NATIVE) { + pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_INDEPENDENT); + pl_shader_custom_lut(sh, target->lut, &rr->lut_state[LUT_TARGET]); + pl_shader_set_alpha(sh, &img->repr, PL_ALPHA_PREMULTIPLIED); + } + + // Rotation handling + if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) { + PL_SWAP(dst_rect.x0, dst_rect.y0); + PL_SWAP(dst_rect.x1, dst_rect.y1); + PL_SWAP(img->w, img->h); + sh->transpose = true; + } + + pass_hook(pass, img, PL_HOOK_OUTPUT); + sh = NULL; + + bool flipped_x = dst_rect.x1 < dst_rect.x0, + flipped_y = dst_rect.y1 < dst_rect.y0; + + if (!params->skip_target_clearing && pl_frame_is_cropped(target)) + pl_frame_clear_rgba(rr->gpu, target, CLEAR_COL(params)); + + for (int p = 0; p < target->num_planes; p++) { + const struct pl_plane *plane = &target->planes[p]; + float rx = (float) plane->texture->params.w / ref->texture->params.w, + ry = (float) plane->texture->params.h / ref->texture->params.h; + + // Only accept integer scaling ratios. This accounts for the fact + // that fractionally subsampled planes get rounded up to the + // nearest integer size, which we want to over-render. + float rrx = rx >= 1 ? roundf(rx) : 1.0 / roundf(1.0 / rx), + rry = ry >= 1 ? 
roundf(ry) : 1.0 / roundf(1.0 / ry); + float sx = plane->shift_x, sy = plane->shift_y; + + pl_rect2df plane_rectf = { + .x0 = (dst_rect.x0 - sx) * rrx, + .y0 = (dst_rect.y0 - sy) * rry, + .x1 = (dst_rect.x1 - sx) * rrx, + .y1 = (dst_rect.y1 - sy) * rry, + }; + + // Normalize to make the math easier + pl_rect2df_normalize(&plane_rectf); + + // Round the output rect + int rx0 = floorf(plane_rectf.x0), ry0 = floorf(plane_rectf.y0), + rx1 = ceilf(plane_rectf.x1), ry1 = ceilf(plane_rectf.y1); + + PL_TRACE(rr, "Subsampled target %d: {%f %f %f %f} -> {%d %d %d %d}", + p, plane_rectf.x0, plane_rectf.y0, + plane_rectf.x1, plane_rectf.y1, + rx0, ry0, rx1, ry1); + + if (target->num_planes > 1) { + + // Planar output, so we need to sample from an intermediate FBO + struct pl_sample_src src = { + .tex = img_tex(pass, img), + .new_w = rx1 - rx0, + .new_h = ry1 - ry0, + .rect = { + .x0 = (rx0 - plane_rectf.x0) / rrx, + .x1 = (rx1 - plane_rectf.x0) / rrx, + .y0 = (ry0 - plane_rectf.y0) / rry, + .y1 = (ry1 - plane_rectf.y0) / rry, + }, + }; + + if (!src.tex) { + PL_ERR(rr, "Output requires multiple planes, but FBOs are " + "unavailable. This combination is unsupported."); + return false; + } + + PL_TRACE(rr, "Sampling %dx%d img aligned from {%f %f %f %f}", + pass->img.w, pass->img.h, + src.rect.x0, src.rect.y0, + src.rect.x1, src.rect.y1); + + for (int c = 0; c < plane->components; c++) { + if (plane->component_mapping[c] < 0) + continue; + src.component_mask |= 1 << plane->component_mapping[c]; + } + + sh = pl_dispatch_begin(rr->dp); + dispatch_sampler(pass, sh, &rr->samplers_dst[p], SAMPLER_PLANE, + plane->texture, &src); + + } else { + + // Single plane, so we can directly re-use the img shader unless + // it's incompatible with the FBO capabilities + bool is_comp = pl_shader_is_compute(img_sh(pass, img)); + if (is_comp && !plane->texture->params.storable) { + if (!img_tex(pass, img)) { + PL_ERR(rr, "Rendering requires compute shaders, but output " + "is not storable, and FBOs are unavailable. This " + "combination is unsupported."); + return false; + } + } + + sh = img_sh(pass, img); + img->sh = NULL; + + } + + // Ignore dithering for > 16-bit outputs by default, since it makes + // little sense to do so (and probably just adds errors) + int depth = target->repr.bits.color_depth, applied_dither = 0; + if (depth && (depth < 16 || params->force_dither)) { + if (pass_error_diffusion(pass, &sh, depth, plane->components, + rx1 - rx0, ry1 - ry0)) + { + applied_dither = depth; + } else if (params->dither_params) { + struct pl_dither_params dparams = *params->dither_params; + if (!params->disable_dither_gamma_correction) + dparams.transfer = target->color.transfer; + pl_shader_dither(sh, depth, &rr->dither_state, &dparams); + applied_dither = depth; + } + } + + if (applied_dither != rr->prev_dither) { + if (applied_dither) { + PL_INFO(rr, "Dithering to %d bit depth", applied_dither); + } else { + PL_INFO(rr, "Dithering disabled"); + } + rr->prev_dither = applied_dither; + } + + GLSL("color *= vec4(1.0 / "$"); \n", SH_FLOAT(scale)); + swizzle_color(sh, plane->components, plane->component_mapping, + params->blend_params); + + pl_rect2d plane_rect = { + .x0 = flipped_x ? rx1 : rx0, + .x1 = flipped_x ? rx0 : rx1, + .y0 = flipped_y ? ry1 : ry0, + .y1 = flipped_y ? 
ry0 : ry1, + }; + + pl_transform2x2 tscale = { + .mat = {{{ rrx, 0.0 }, { 0.0, rry }}}, + .c = { -sx, -sy }, + }; + + if (plane->flipped) { + int plane_h = rry * ref->texture->params.h; + plane_rect.y0 = plane_h - plane_rect.y0; + plane_rect.y1 = plane_h - plane_rect.y1; + tscale.mat.m[1][1] = -tscale.mat.m[1][1]; + tscale.c[1] += plane->texture->params.h; + } + + bool ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = &sh, + .target = plane->texture, + .blend_params = params->blend_params, + .rect = plane_rect, + )); + + if (!ok) + return false; + + if (pass->info.stage != PL_RENDER_STAGE_BLEND) { + draw_overlays(pass, plane->texture, plane->components, + plane->component_mapping, image->overlays, + image->num_overlays, target->color, target->repr, + &tscale); + } + + draw_overlays(pass, plane->texture, plane->components, + plane->component_mapping, target->overlays, + target->num_overlays, target->color, target->repr, + &tscale); + } + + *img = (struct img) {0}; + return true; +} + +#define require(expr) pl_require(rr, expr) +#define validate_plane(plane, param) \ + do { \ + require((plane).texture); \ + require((plane).texture->params.param); \ + require((plane).components > 0 && (plane).components <= 4); \ + for (int c = 0; c < (plane).components; c++) { \ + require((plane).component_mapping[c] >= PL_CHANNEL_NONE && \ + (plane).component_mapping[c] <= PL_CHANNEL_A); \ + } \ + } while (0) + +#define validate_overlay(overlay) \ + do { \ + require((overlay).tex); \ + require((overlay).tex->params.sampleable); \ + require((overlay).num_parts >= 0); \ + for (int n = 0; n < (overlay).num_parts; n++) { \ + const struct pl_overlay_part *p = &(overlay).parts[n]; \ + require(pl_rect_w(p->dst) && pl_rect_h(p->dst)); \ + } \ + } while (0) + +#define validate_deinterlace_ref(image, ref) \ + do { \ + require((image)->num_planes == (ref)->num_planes); \ + const struct pl_tex_params *imgp, *refp; \ + for (int p = 0; p < (image)->num_planes; p++) { \ + validate_plane((ref)->planes[p], sampleable); \ + imgp = &(image)->planes[p].texture->params; \ + refp = &(ref)->planes[p].texture->params; \ + require(imgp->w == refp->w); \ + require(imgp->h == refp->h); \ + require(imgp->format->num_components == refp->format->num_components);\ + } \ + } while (0) + +// Perform some basic validity checks on incoming structs to help catch invalid +// API usage. This is not an exhaustive check. In particular, enums are not +// bounds checked. This is because most functions accepting enums already +// abort() in the default case, and because it's not the intent of this check +// to catch all instances of memory corruption - just common logic bugs. +static bool validate_structs(pl_renderer rr, + const struct pl_frame *image, + const struct pl_frame *target) +{ + // Rendering to/from a frame with no planes is technically allowed, but so + // pointless that it's more likely to be a user error worth catching. 
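To make the failure mode concrete, a hypothetical example of the kind of misuse these checks are meant to reject:

    // A target plane with .components = 5, or a component_mapping entry
    // greater than PL_CHANNEL_A, trips the corresponding require() below;
    // validate_structs() then returns false and the render call is rejected
    // up front instead of reading a bogus mapping later in the pipeline.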
+ require(target->num_planes > 0 && target->num_planes <= PL_MAX_PLANES); + for (int i = 0; i < target->num_planes; i++) + validate_plane(target->planes[i], renderable); + require(!pl_rect_w(target->crop) == !pl_rect_h(target->crop)); + require(target->num_overlays >= 0); + for (int i = 0; i < target->num_overlays; i++) + validate_overlay(target->overlays[i]); + + if (!image) + return true; + + require(image->num_planes > 0 && image->num_planes <= PL_MAX_PLANES); + for (int i = 0; i < image->num_planes; i++) + validate_plane(image->planes[i], sampleable); + require(!pl_rect_w(image->crop) == !pl_rect_h(image->crop)); + require(image->num_overlays >= 0); + for (int i = 0; i < image->num_overlays; i++) + validate_overlay(image->overlays[i]); + + if (image->field != PL_FIELD_NONE) { + require(image->first_field != PL_FIELD_NONE); + if (image->prev) + validate_deinterlace_ref(image, image->prev); + if (image->next) + validate_deinterlace_ref(image, image->next); + } + + return true; + +error: + return false; +} + +// returns index +static int frame_ref(const struct pl_frame *frame) +{ + pl_assert(frame->num_planes); + for (int i = 0; i < frame->num_planes; i++) { + switch (detect_plane_type(&frame->planes[i], &frame->repr)) { + case PLANE_RGB: + case PLANE_LUMA: + case PLANE_XYZ: + return i; + case PLANE_CHROMA: + case PLANE_ALPHA: + continue; + case PLANE_INVALID: + pl_unreachable(); + } + } + + return 0; +} + +static void fix_refs_and_rects(struct pass_state *pass) +{ + struct pl_frame *target = &pass->target; + pl_rect2df *dst = &target->crop; + pass->dst_ref = frame_ref(target); + pl_tex dst_ref = target->planes[pass->dst_ref].texture; + int dst_w = dst_ref->params.w, dst_h = dst_ref->params.h; + + if ((!dst->x0 && !dst->x1) || (!dst->y0 && !dst->y1)) { + dst->x1 = dst_w; + dst->y1 = dst_h; + } + + if (pass->src_ref < 0) { + // Simplified version of the below code which only rounds the target + // rect but doesn't retroactively apply the crop to the image + pass->rotation = pl_rotation_normalize(-target->rotation); + pl_rect2df_rotate(dst, -pass->rotation); + if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) + PL_SWAP(dst_w, dst_h); + + *dst = (pl_rect2df) { + .x0 = roundf(PL_CLAMP(dst->x0, 0.0, dst_w)), + .y0 = roundf(PL_CLAMP(dst->y0, 0.0, dst_h)), + .x1 = roundf(PL_CLAMP(dst->x1, 0.0, dst_w)), + .y1 = roundf(PL_CLAMP(dst->y1, 0.0, dst_h)), + }; + + pass->dst_rect = (pl_rect2d) { + dst->x0, dst->y0, dst->x1, dst->y1, + }; + + return; + } + + struct pl_frame *image = &pass->image; + pl_rect2df *src = &image->crop; + pass->src_ref = frame_ref(image); + pl_tex src_ref = image->planes[pass->src_ref].texture; + + if ((!src->x0 && !src->x1) || (!src->y0 && !src->y1)) { + src->x1 = src_ref->params.w; + src->y1 = src_ref->params.h; + }; + + // Compute end-to-end rotation + pass->rotation = pl_rotation_normalize(image->rotation - target->rotation); + pl_rect2df_rotate(dst, -pass->rotation); // normalize by counter-rotating + if (pass->rotation % PL_ROTATION_180 == PL_ROTATION_90) + PL_SWAP(dst_w, dst_h); + + // Keep track of whether the end-to-end rendering is flipped + bool flipped_x = (src->x0 > src->x1) != (dst->x0 > dst->x1), + flipped_y = (src->y0 > src->y1) != (dst->y0 > dst->y1); + + // Normalize both rects to make the math easier + pl_rect2df_normalize(src); + pl_rect2df_normalize(dst); + + // Round the output rect and clip it to the framebuffer dimensions + float rx0 = roundf(PL_CLAMP(dst->x0, 0.0, dst_w)), + ry0 = roundf(PL_CLAMP(dst->y0, 0.0, dst_h)), + rx1 = 
roundf(PL_CLAMP(dst->x1, 0.0, dst_w)), + ry1 = roundf(PL_CLAMP(dst->y1, 0.0, dst_h)); + + // Adjust the src rect corresponding to the rounded crop + float scale_x = pl_rect_w(*src) / pl_rect_w(*dst), + scale_y = pl_rect_h(*src) / pl_rect_h(*dst), + base_x = src->x0, + base_y = src->y0; + + src->x0 = base_x + (rx0 - dst->x0) * scale_x; + src->x1 = base_x + (rx1 - dst->x0) * scale_x; + src->y0 = base_y + (ry0 - dst->y0) * scale_y; + src->y1 = base_y + (ry1 - dst->y0) * scale_y; + + // Update dst_rect to the rounded values and re-apply flip if needed. We + // always do this in the `dst` rather than the `src` because this allows + // e.g. polar sampling compute shaders to work. + *dst = (pl_rect2df) { + .x0 = flipped_x ? rx1 : rx0, + .y0 = flipped_y ? ry1 : ry0, + .x1 = flipped_x ? rx0 : rx1, + .y1 = flipped_y ? ry0 : ry1, + }; + + // Copies of the above, for convenience + pass->ref_rect = *src; + pass->dst_rect = (pl_rect2d) { + dst->x0, dst->y0, dst->x1, dst->y1, + }; +} + +static void fix_frame(struct pl_frame *frame) +{ + pl_tex tex = frame->planes[frame_ref(frame)].texture; + + if (frame->repr.sys == PL_COLOR_SYSTEM_XYZ) { + // XYZ is implicitly converted to linear DCI-P3 in pl_color_repr_decode + frame->color.primaries = PL_COLOR_PRIM_DCI_P3; + frame->color.transfer = PL_COLOR_TRC_ST428; + } + + // If the primaries are not known, guess them based on the resolution + if (tex && !frame->color.primaries) + frame->color.primaries = pl_color_primaries_guess(tex->params.w, tex->params.h); + + // For UNORM formats, we can infer the sampled bit depth from the texture + // itself. This is ignored for other format types, because the logic + // doesn't really work out for them anyway, and it's best not to do + // anything too crazy unless the user provides explicit details. + struct pl_bit_encoding *bits = &frame->repr.bits; + if (!bits->sample_depth && tex && tex->params.format->type == PL_FMT_UNORM) { + // Just assume the first component's depth is canonical. This works in + // practice, since for cases like rgb565 we want to use the lower depth + // anyway. Plus, every format has at least one component. + bits->sample_depth = tex->params.format->component_depth[0]; + + // If we don't know the color depth, assume it spans the full range of + // the texture. Otherwise, clamp it to the texture depth. + bits->color_depth = PL_DEF(bits->color_depth, bits->sample_depth); + bits->color_depth = PL_MIN(bits->color_depth, bits->sample_depth); + + // If the texture depth is higher than the known color depth, assume + // the colors were left-shifted. 
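A common concrete instance of the left-shift assumption described above:

    // e.g. a P010-style plane: the texture is 16-bit UNORM but carries 10-bit
    // video in the top bits, so sample_depth = 16, color_depth = 10 and
    // bit_shift becomes 6, letting the later color decoding undo the shift.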
+ bits->bit_shift += bits->sample_depth - bits->color_depth; + } +} + +static bool acquire_frame(struct pass_state *pass, struct pl_frame *frame, + bool *acquired) +{ + if (!frame || !frame->acquire || *acquired) + return true; + + *acquired = true; + return frame->acquire(pass->rr->gpu, frame); +} + +static void release_frame(struct pass_state *pass, struct pl_frame *frame, + bool *acquired) +{ + if (frame && frame->release && *acquired) + frame->release(pass->rr->gpu, frame); + *acquired = false; +} + +static void pass_uninit(struct pass_state *pass) +{ + pl_renderer rr = pass->rr; + pl_dispatch_abort(rr->dp, &pass->img.sh); + release_frame(pass, &pass->next, &pass->acquired.next); + release_frame(pass, &pass->prev, &pass->acquired.prev); + release_frame(pass, &pass->image, &pass->acquired.image); + release_frame(pass, &pass->target, &pass->acquired.target); + pl_free_ptr(&pass->tmp); +} + +static void icc_fallback(struct pass_state *pass, struct pl_frame *frame, + struct icc_state *fallback) +{ + if (!frame || frame->icc || !frame->profile.data) + return; + + // Don't re-attempt opening already failed profiles + if (fallback->error && fallback->error == frame->profile.signature) + return; + +#ifdef PL_HAVE_LCMS + pl_renderer rr = pass->rr; + if (pl_icc_update(rr->log, &fallback->icc, &frame->profile, NULL)) { + frame->icc = fallback->icc; + } else { + PL_WARN(rr, "Failed opening ICC profile... ignoring"); + fallback->error = frame->profile.signature; + } +#endif +} + +static void pass_fix_frames(struct pass_state *pass) +{ + pl_renderer rr = pass->rr; + struct pl_frame *image = pass->src_ref < 0 ? NULL : &pass->image; + struct pl_frame *target = &pass->target; + + fix_refs_and_rects(pass); + + // Fallback for older ICC profile API + icc_fallback(pass, image, &rr->icc_fallback[ICC_IMAGE]); + icc_fallback(pass, target, &rr->icc_fallback[ICC_TARGET]); + + // Force colorspace metadata to ICC profile values, if present + if (image && image->icc) { + image->color.primaries = image->icc->containing_primaries; + image->color.hdr = image->icc->csp.hdr; + } + + if (target->icc) { + target->color.primaries = target->icc->containing_primaries; + target->color.hdr = target->icc->csp.hdr; + } + + // Infer the target color space info based on the image's + if (image) { + fix_frame(image); + pl_color_space_infer_map(&image->color, &target->color); + fix_frame(target); // do this only after infer_map + } else { + fix_frame(target); + pl_color_space_infer(&target->color); + } + + // Detect the presence of an alpha channel in the frames and explicitly + // default the alpha mode in this case, so we can use it to detect whether + // or not to strip the alpha channel during rendering. + // + // Note the different defaults for the image and target, because files + // are usually independent but windowing systems usually expect + // premultiplied. (We also premultiply for internal rendering, so this + // way of doing it avoids a possible division-by-zero path!) 
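For intuition on the two alpha modes referenced above (illustrative values):

    // A 50% transparent pure red is (1.0, 0.0, 0.0, 0.5) with independent
    // alpha but (0.5, 0.0, 0.0, 0.5) premultiplied. Converting premultiplied
    // back to independent divides by alpha, which is undefined at alpha == 0;
    // rendering internally in premultiplied form sidesteps that division.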
+ if (image && !image->repr.alpha) { + for (int i = 0; i < image->num_planes; i++) { + const struct pl_plane *plane = &image->planes[i]; + for (int c = 0; c < plane->components; c++) { + if (plane->component_mapping[c] == PL_CHANNEL_A) + image->repr.alpha = PL_ALPHA_INDEPENDENT; + } + } + } + + if (!target->repr.alpha) { + for (int i = 0; i < target->num_planes; i++) { + const struct pl_plane *plane = &target->planes[i]; + for (int c = 0; c < plane->components; c++) { + if (plane->component_mapping[c] == PL_CHANNEL_A) + target->repr.alpha = PL_ALPHA_PREMULTIPLIED; + } + } + } +} + +void pl_frames_infer(pl_renderer rr, struct pl_frame *image, + struct pl_frame *target) +{ + struct pass_state pass = { + .rr = rr, + .image = *image, + .target = *target, + }; + + pass_fix_frames(&pass); + *image = pass.image; + *target = pass.target; +} + +static bool pass_init(struct pass_state *pass, bool acquire_image) +{ + struct pl_frame *image = pass->src_ref < 0 ? NULL : &pass->image; + struct pl_frame *target = &pass->target; + + if (!acquire_frame(pass, target, &pass->acquired.target)) + goto error; + if (acquire_image && image) { + if (!acquire_frame(pass, image, &pass->acquired.image)) + goto error; + + const struct pl_render_params *params = pass->params; + const struct pl_deinterlace_params *deint = params->deinterlace_params; + bool needs_refs = image->field != PL_FIELD_NONE && deint && + pl_deinterlace_needs_refs(deint->algo); + + if (image->prev && needs_refs) { + // Move into local copy so we can acquire/release it + pass->prev = *image->prev; + image->prev = &pass->prev; + if (!acquire_frame(pass, &pass->prev, &pass->acquired.prev)) + goto error; + } + if (image->next && needs_refs) { + pass->next = *image->next; + image->next = &pass->next; + if (!acquire_frame(pass, &pass->next, &pass->acquired.next)) + goto error; + } + } + + if (!validate_structs(pass->rr, acquire_image ? image : NULL, target)) + goto error; + + find_fbo_format(pass); + pass_fix_frames(pass); + + pass->tmp = pl_tmp(NULL); + return true; + +error: + pass_uninit(pass); + return false; +} + +static void pass_begin_frame(struct pass_state *pass) +{ + pl_renderer rr = pass->rr; + const struct pl_render_params *params = pass->params; + + pl_dispatch_callback(rr->dp, pass, info_callback); + pl_dispatch_reset_frame(rr->dp); + + for (int i = 0; i < params->num_hooks; i++) { + if (params->hooks[i]->reset) + params->hooks[i]->reset(params->hooks[i]->priv); + } + + size_t size = rr->fbos.num * sizeof(bool); + pass->fbos_used = pl_realloc(pass->tmp, pass->fbos_used, size); + memset(pass->fbos_used, 0, size); +} + +static bool draw_empty_overlays(pl_renderer rr, + const struct pl_frame *ptarget, + const struct pl_render_params *params) +{ + if (!params->skip_target_clearing) + pl_frame_clear_rgba(rr->gpu, ptarget, CLEAR_COL(params)); + + if (!ptarget->num_overlays) + return true; + + struct pass_state pass = { + .rr = rr, + .params = params, + .src_ref = -1, + .target = *ptarget, + .info.stage = PL_RENDER_STAGE_BLEND, + .info.count = 0, + }; + + if (!pass_init(&pass, false)) + return false; + + pass_begin_frame(&pass); + struct pl_frame *target = &pass.target; + pl_tex ref = target->planes[pass.dst_ref].texture; + for (int p = 0; p < target->num_planes; p++) { + const struct pl_plane *plane = &target->planes[p]; + // Math replicated from `pass_output_target` + float rx = (float) plane->texture->params.w / ref->params.w, + ry = (float) plane->texture->params.h / ref->params.h; + float rrx = rx >= 1 ? 
roundf(rx) : 1.0 / roundf(1.0 / rx), + rry = ry >= 1 ? roundf(ry) : 1.0 / roundf(1.0 / ry); + float sx = plane->shift_x, sy = plane->shift_y; + + pl_transform2x2 tscale = { + .mat = {{{ rrx, 0.0 }, { 0.0, rry }}}, + .c = { -sx, -sy }, + }; + + if (plane->flipped) { + tscale.mat.m[1][1] = -tscale.mat.m[1][1]; + tscale.c[1] += plane->texture->params.h; + } + + draw_overlays(&pass, plane->texture, plane->components, + plane->component_mapping, target->overlays, + target->num_overlays, target->color, target->repr, + &tscale); + } + + pass_uninit(&pass); + return true; +} + +bool pl_render_image(pl_renderer rr, const struct pl_frame *pimage, + const struct pl_frame *ptarget, + const struct pl_render_params *params) +{ + params = PL_DEF(params, &pl_render_default_params); + pl_dispatch_mark_dynamic(rr->dp, params->dynamic_constants); + if (!pimage) + return draw_empty_overlays(rr, ptarget, params); + + struct pass_state pass = { + .rr = rr, + .params = params, + .image = *pimage, + .target = *ptarget, + .info.stage = PL_RENDER_STAGE_FRAME, + }; + + if (!pass_init(&pass, true)) + return false; + + // No-op (empty crop) + if (!pl_rect_w(pass.dst_rect) || !pl_rect_h(pass.dst_rect)) { + pass_uninit(&pass); + return draw_empty_overlays(rr, ptarget, params); + } + + pass_begin_frame(&pass); + if (!pass_read_image(&pass)) + goto error; + if (!pass_scale_main(&pass)) + goto error; + pass_convert_colors(&pass); + if (!pass_output_target(&pass)) + goto error; + + pass_uninit(&pass); + return true; + +error: + PL_ERR(rr, "Failed rendering image!"); + pass_uninit(&pass); + return false; +} + +const struct pl_frame *pl_frame_mix_current(const struct pl_frame_mix *mix) +{ + const struct pl_frame *cur = NULL; + for (int i = 0; i < mix->num_frames; i++) { + if (mix->timestamps[i] > 0.0f) + break; + cur = mix->frames[i]; + } + + return cur; +} + +const struct pl_frame *pl_frame_mix_nearest(const struct pl_frame_mix *mix) +{ + if (!mix->num_frames) + return NULL; + + const struct pl_frame *best = mix->frames[0]; + float best_dist = fabsf(mix->timestamps[0]); + for (int i = 1; i < mix->num_frames; i++) { + float dist = fabsf(mix->timestamps[i]); + if (dist < best_dist) { + best = mix->frames[i]; + best_dist = dist; + continue; + } else { + break; + } + } + + return best; +} + +struct params_info { + uint64_t hash; + bool trivial; +}; + +static struct params_info render_params_info(const struct pl_render_params *params_orig) +{ + struct pl_render_params params = *params_orig; + struct params_info info = { + .trivial = true, + .hash = 0, + }; + +#define HASH_PTR(ptr, def, ptr_trivial) \ + do { \ + if (ptr) { \ + pl_hash_merge(&info.hash, pl_mem_hash(ptr, sizeof(*ptr))); \ + info.trivial &= (ptr_trivial); \ + ptr = NULL; \ + } else if ((def) != NULL) { \ + pl_hash_merge(&info.hash, pl_mem_hash(def, sizeof(*ptr))); \ + } \ + } while (0) + +#define HASH_FILTER(scaler) \ + do { \ + if ((scaler == &pl_filter_bilinear || scaler == &pl_filter_nearest) && \ + params.skip_anti_aliasing) \ + { \ + /* treat as NULL */ \ + } else if (scaler) { \ + struct pl_filter_config filter = *scaler; \ + HASH_PTR(filter.kernel, NULL, false); \ + HASH_PTR(filter.window, NULL, false); \ + pl_hash_merge(&info.hash, pl_var_hash(filter)); \ + scaler = NULL; \ + } \ + } while (0) + + HASH_FILTER(params.upscaler); + HASH_FILTER(params.downscaler); + + HASH_PTR(params.deband_params, NULL, false); + HASH_PTR(params.sigmoid_params, NULL, false); + HASH_PTR(params.deinterlace_params, NULL, false); + HASH_PTR(params.cone_params, NULL, true); + 
HASH_PTR(params.icc_params, &pl_icc_default_params, true); + HASH_PTR(params.color_adjustment, &pl_color_adjustment_neutral, true); + HASH_PTR(params.color_map_params, &pl_color_map_default_params, true); + HASH_PTR(params.peak_detect_params, NULL, false); + + // Hash all hooks + for (int i = 0; i < params.num_hooks; i++) { + const struct pl_hook *hook = params.hooks[i]; + if (hook->stages == PL_HOOK_OUTPUT) + continue; // ignore hooks only relevant to pass_output_target + pl_hash_merge(&info.hash, pl_var_hash(*hook)); + info.trivial = false; + } + params.hooks = NULL; + + // Hash the LUT by only looking at the signature + if (params.lut) { + pl_hash_merge(&info.hash, params.lut->signature); + info.trivial = false; + params.lut = NULL; + } + +#define CLEAR(field) field = (__typeof__(field)) {0} + + // Clear out fields only relevant to pl_render_image_mix + CLEAR(params.frame_mixer); + CLEAR(params.preserve_mixing_cache); + CLEAR(params.skip_caching_single_frame); + memset(params.background_color, 0, sizeof(params.background_color)); + CLEAR(params.background_transparency); + CLEAR(params.skip_target_clearing); + CLEAR(params.blend_against_tiles); + memset(params.tile_colors, 0, sizeof(params.tile_colors)); + CLEAR(params.tile_size); + + // Clear out fields only relevant to pass_output_target + CLEAR(params.blend_params); + CLEAR(params.distort_params); + CLEAR(params.dither_params); + CLEAR(params.error_diffusion); + CLEAR(params.force_dither); + CLEAR(params.corner_rounding); + + // Clear out other irrelevant fields + CLEAR(params.dynamic_constants); + CLEAR(params.info_callback); + CLEAR(params.info_priv); + + pl_hash_merge(&info.hash, pl_var_hash(params)); + return info; +} + +#define MAX_MIX_FRAMES 16 + +bool pl_render_image_mix(pl_renderer rr, const struct pl_frame_mix *images, + const struct pl_frame *ptarget, + const struct pl_render_params *params) +{ + if (!images->num_frames) + return pl_render_image(rr, NULL, ptarget, params); + + params = PL_DEF(params, &pl_render_default_params); + struct params_info par_info = render_params_info(params); + pl_dispatch_mark_dynamic(rr->dp, params->dynamic_constants); + + require(images->num_frames >= 1); + require(images->vsync_duration > 0.0); + for (int i = 0; i < images->num_frames - 1; i++) + require(images->timestamps[i] <= images->timestamps[i+1]); + + const struct pl_frame *refimg = pl_frame_mix_nearest(images); + struct pass_state pass = { + .rr = rr, + .params = params, + .image = *refimg, + .target = *ptarget, + .info.stage = PL_RENDER_STAGE_BLEND, + }; + + if (rr->errors & PL_RENDER_ERR_FRAME_MIXING) + goto fallback; + if (!pass_init(&pass, false)) + return false; + if (!pass.fbofmt[4]) + goto fallback; + + const struct pl_frame *target = &pass.target; + int out_w = abs(pl_rect_w(pass.dst_rect)), + out_h = abs(pl_rect_h(pass.dst_rect)); + if (!out_w || !out_h) + goto fallback; + + int fidx = 0; + struct cached_frame frames[MAX_MIX_FRAMES]; + float weights[MAX_MIX_FRAMES]; + float wsum = 0.0; + + // Garbage collect the cache by evicting all frames from the cache that are + // not determined to still be required + for (int i = 0; i < rr->frames.num; i++) + rr->frames.elem[i].evict = true; + + // Blur frame mixer according to vsync ratio (source / display) + struct pl_filter_config mixer; + if (params->frame_mixer) { + mixer = *params->frame_mixer; + mixer.blur = PL_DEF(mixer.blur, 1.0); + for (int i = 1; i < images->num_frames; i++) { + if (images->timestamps[i] >= 0.0 && images->timestamps[i - 1] < 0) { + float frame_dur = 
images->timestamps[i] - images->timestamps[i - 1]; + if (images->vsync_duration > frame_dur && !params->skip_anti_aliasing) + mixer.blur *= images->vsync_duration / frame_dur; + break; + } + } + } + + // Traverse the input frames and determine/prepare the ones we need + bool single_frame = !params->frame_mixer || images->num_frames == 1; +retry: + for (int i = 0; i < images->num_frames; i++) { + uint64_t sig = images->signatures[i]; + float rts = images->timestamps[i]; + const struct pl_frame *img = images->frames[i]; + PL_TRACE(rr, "Considering image with signature 0x%llx, rts %f", + (unsigned long long) sig, rts); + + // Combining images with different rotations is basically unfeasible + if (pl_rotation_normalize(img->rotation - refimg->rotation)) { + PL_TRACE(rr, " -> Skipping: incompatible rotation"); + continue; + } + + float weight; + if (single_frame) { + + // Only render the refimg, ignore others + if (img == refimg) { + weight = 1.0; + } else { + PL_TRACE(rr, " -> Skipping: no frame mixer"); + continue; + } + + // For backwards compatibility, treat !kernel as oversample + } else if (!mixer.kernel || mixer.kernel == &pl_filter_function_oversample) { + + // Compute the visible interval [rts, end] of this frame + float end = i+1 < images->num_frames ? images->timestamps[i+1] : INFINITY; + if (rts > images->vsync_duration || end < 0.0) { + PL_TRACE(rr, " -> Skipping: no intersection with vsync"); + continue; + } else { + rts = PL_MAX(rts, 0.0); + end = PL_MIN(end, images->vsync_duration); + pl_assert(end >= rts); + } + + // Weight is the fraction of vsync interval that frame is visible + weight = (end - rts) / images->vsync_duration; + PL_TRACE(rr, " -> Frame [%f, %f] intersects [%f, %f] = weight %f", + rts, end, 0.0, images->vsync_duration, weight); + + if (weight < mixer.kernel->params[0]) { + PL_TRACE(rr, " (culling due to threshold)"); + weight = 0.0; + } + + } else { + + const float radius = pl_filter_radius_bound(&mixer); + if (fabsf(rts) >= radius) { + PL_TRACE(rr, " -> Skipping: outside filter radius (%f)", radius); + continue; + } + + // Weight is directly sampled from the filter + weight = pl_filter_sample(&mixer, rts); + PL_TRACE(rr, " -> Filter offset %f = weight %f", rts, weight); + + } + + struct cached_frame *f = NULL; + for (int j = 0; j < rr->frames.num; j++) { + if (rr->frames.elem[j].signature == sig) { + f = &rr->frames.elem[j]; + f->evict = false; + break; + } + } + + // Skip frames with negligible contributions. Do this after the loop + // above to make sure these frames don't get evicted just yet, and + // also exclude the reference image from this optimization to ensure + // that we always have at least one frame. + const float cutoff = 1e-3; + if (fabsf(weight) <= cutoff && img != refimg) { + PL_TRACE(rr, " -> Skipping: weight (%f) below threshold (%f)", + weight, cutoff); + continue; + } + + bool skip_cache = single_frame && (params->skip_caching_single_frame || par_info.trivial); + if (!f && skip_cache) { + PL_TRACE(rr, "Single frame not found in cache, bypassing"); + goto fallback; + } + + if (!f) { + // Signature does not exist in the cache at all yet, + // so grow the cache by this entry. + PL_ARRAY_GROW(rr, rr->frames); + f = &rr->frames.elem[rr->frames.num++]; + *f = (struct cached_frame) { + .signature = sig, + }; + } + + // Check to see if we can blindly reuse this cache entry. 
This is the + // case if either the params are compatible, or the user doesn't care + bool can_reuse = f->tex; + bool strict_reuse = skip_cache || single_frame || + !params->preserve_mixing_cache; + if (can_reuse && strict_reuse) { + can_reuse = f->tex->params.w == out_w && + f->tex->params.h == out_h && + pl_rect2d_eq(f->crop, img->crop) && + f->params_hash == par_info.hash && + pl_color_space_equal(&f->color, &target->color) && + pl_icc_profile_equal(&f->profile, &target->profile); + } + + if (!can_reuse && skip_cache) { + PL_TRACE(rr, "Single frame cache entry invalid, bypassing"); + goto fallback; + } + + if (!can_reuse) { + // If we can't reuse the entry, we need to re-render this frame + PL_TRACE(rr, " -> Cached texture missing or invalid.. (re)creating"); + if (!f->tex) { + if (PL_ARRAY_POP(rr->frame_fbos, &f->tex)) + pl_tex_invalidate(rr->gpu, f->tex); + } + + bool ok = pl_tex_recreate(rr->gpu, &f->tex, pl_tex_params( + .w = out_w, + .h = out_h, + .format = pass.fbofmt[4], + .sampleable = true, + .renderable = true, + .blit_dst = pass.fbofmt[4]->caps & PL_FMT_CAP_BLITTABLE, + .storable = pass.fbofmt[4]->caps & PL_FMT_CAP_STORABLE, + )); + + if (!ok) { + PL_ERR(rr, "Could not create intermediate texture for " + "frame mixing.. disabling!"); + rr->errors |= PL_RENDER_ERR_FRAME_MIXING; + goto fallback; + } + + struct pass_state inter_pass = { + .rr = rr, + .params = pass.params, + .image = *img, + .target = *ptarget, + .info.stage = PL_RENDER_STAGE_FRAME, + .acquired = pass.acquired, + }; + + // Render a single frame up to `pass_output_target` + memcpy(inter_pass.fbofmt, pass.fbofmt, sizeof(pass.fbofmt)); + if (!pass_init(&inter_pass, true)) + goto fail; + + pass_begin_frame(&inter_pass); + if (!(ok = pass_read_image(&inter_pass))) + goto inter_pass_error; + if (!(ok = pass_scale_main(&inter_pass))) + goto inter_pass_error; + pass_convert_colors(&inter_pass); + + pl_assert(inter_pass.img.sh); // guaranteed by `pass_convert_colors` + pl_shader_set_alpha(inter_pass.img.sh, &inter_pass.img.repr, + PL_ALPHA_PREMULTIPLIED); // for frame mixing + + pl_assert(inter_pass.img.w == out_w && + inter_pass.img.h == out_h); + + ok = pl_dispatch_finish(rr->dp, pl_dispatch_params( + .shader = &inter_pass.img.sh, + .target = f->tex, + )); + if (!ok) + goto inter_pass_error; + + float sx = out_w / pl_rect_w(inter_pass.dst_rect), + sy = out_h / pl_rect_h(inter_pass.dst_rect); + + pl_transform2x2 shift = { + .mat.m = {{ sx, 0, }, { 0, sy, }}, + .c = { + -sx * inter_pass.dst_rect.x0, + -sy * inter_pass.dst_rect.y0 + }, + }; + + if (inter_pass.rotation % PL_ROTATION_180 == PL_ROTATION_90) { + PL_SWAP(shift.mat.m[0][0], shift.mat.m[0][1]); + PL_SWAP(shift.mat.m[1][0], shift.mat.m[1][1]); + } + + draw_overlays(&inter_pass, f->tex, inter_pass.img.comps, NULL, + inter_pass.image.overlays, + inter_pass.image.num_overlays, + inter_pass.img.color, + inter_pass.img.repr, + &shift); + + f->params_hash = par_info.hash; + f->crop = img->crop; + f->color = inter_pass.img.color; + f->comps = inter_pass.img.comps; + f->profile = target->profile; + // fall through + +inter_pass_error: + inter_pass.acquired.target = false; // don't release target + pass_uninit(&inter_pass); + if (!ok) + goto fail; + } + + pl_assert(fidx < MAX_MIX_FRAMES); + frames[fidx] = *f; + weights[fidx] = weight; + wsum += weight; + fidx++; + } + + // Evict the frames we *don't* need + for (int i = 0; i < rr->frames.num; ) { + if (rr->frames.elem[i].evict) { + PL_TRACE(rr, "Evicting frame with signature %llx from cache", + (unsigned long long) 
rr->frames.elem[i].signature); + PL_ARRAY_APPEND(rr, rr->frame_fbos, rr->frames.elem[i].tex); + PL_ARRAY_REMOVE_AT(rr->frames, i); + continue; + } else { + i++; + } + } + + // If we got back no frames, retry with ZOH semantics + if (!fidx) { + pl_assert(!single_frame); + single_frame = true; + goto retry; + } + + // Sample and mix the output color + pass_begin_frame(&pass); + pass.info.count = fidx; + pl_assert(fidx > 0); + + pl_shader sh = pl_dispatch_begin(rr->dp); + sh_describef(sh, "frame mixing (%d frame%s)", fidx, fidx > 1 ? "s" : ""); + sh->output = PL_SHADER_SIG_COLOR; + sh->output_w = out_w; + sh->output_h = out_h; + + GLSL("vec4 color; \n" + "// pl_render_image_mix \n" + "{ \n" + "vec4 mix_color = vec4(0.0); \n"); + + int comps = 0; + for (int i = 0; i < fidx; i++) { + const struct pl_tex_params *tpars = &frames[i].tex->params; + + // Use linear sampling if desired and possible + enum pl_tex_sample_mode sample_mode = PL_TEX_SAMPLE_NEAREST; + if ((tpars->w != out_w || tpars->h != out_h) && + (tpars->format->caps & PL_FMT_CAP_LINEAR)) + { + sample_mode = PL_TEX_SAMPLE_LINEAR; + } + + ident_t pos, tex = sh_bind(sh, frames[i].tex, PL_TEX_ADDRESS_CLAMP, + sample_mode, "frame", NULL, &pos, NULL); + + GLSL("color = textureLod("$", "$", 0.0); \n", tex, pos); + + // Note: This ignores differences in ICC profile, which we decide to + // just simply not care about. Doing that properly would require + // converting between different image profiles, and the headache of + // finagling that state is just not worth it because this is an + // exceptionally unlikely hypothetical. + // + // This also ignores differences in HDR metadata, which we deliberately + // ignore because it causes aggressive shader recompilation. + struct pl_color_space frame_csp = frames[i].color; + struct pl_color_space mix_csp = target->color; + frame_csp.hdr = mix_csp.hdr = (struct pl_hdr_metadata) {0}; + pl_shader_color_map_ex(sh, NULL, pl_color_map_args(frame_csp, mix_csp)); + + float weight = weights[i] / wsum; + GLSL("mix_color += vec4("$") * color; \n", SH_FLOAT_DYN(weight)); + comps = PL_MAX(comps, frames[i].comps); + } + + GLSL("color = mix_color; \n" + "} \n"); + + // Dispatch this to the destination + pass.img = (struct img) { + .sh = sh, + .w = out_w, + .h = out_h, + .comps = comps, + .color = target->color, + .repr = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_PC, + .alpha = comps >= 4 ? PL_ALPHA_PREMULTIPLIED : PL_ALPHA_UNKNOWN, + }, + }; + + if (!pass_output_target(&pass)) + goto fallback; + + pass_uninit(&pass); + return true; + +fail: + PL_ERR(rr, "Could not render image for frame mixing.. 
disabling!"); + rr->errors |= PL_RENDER_ERR_FRAME_MIXING; + // fall through + +fallback: + pass_uninit(&pass); + return pl_render_image(rr, refimg, ptarget, params); + +error: // for parameter validation failures + return false; +} + +void pl_frames_infer_mix(pl_renderer rr, const struct pl_frame_mix *mix, + struct pl_frame *target, struct pl_frame *out_ref) +{ + struct pass_state pass = { + .rr = rr, + .target = *target, + }; + + const struct pl_frame *refimg = pl_frame_mix_nearest(mix); + if (refimg) { + pass.image = *refimg; + } else { + pass.src_ref = -1; + } + + pass_fix_frames(&pass); + *target = pass.target; + if (out_ref) + *out_ref = pass.image; +} + +void pl_frame_set_chroma_location(struct pl_frame *frame, + enum pl_chroma_location chroma_loc) +{ + pl_tex ref = frame->planes[frame_ref(frame)].texture; + + if (ref) { + // Texture dimensions are already known, so apply the chroma location + // only to subsampled planes + int ref_w = ref->params.w, ref_h = ref->params.h; + + for (int i = 0; i < frame->num_planes; i++) { + struct pl_plane *plane = &frame->planes[i]; + pl_tex tex = plane->texture; + bool subsampled = tex->params.w < ref_w || tex->params.h < ref_h; + if (subsampled) + pl_chroma_location_offset(chroma_loc, &plane->shift_x, &plane->shift_y); + } + } else { + // Texture dimensions are not yet known, so apply the chroma location + // to all chroma planes, regardless of subsampling + for (int i = 0; i < frame->num_planes; i++) { + struct pl_plane *plane = &frame->planes[i]; + if (detect_plane_type(plane, &frame->repr) == PLANE_CHROMA) + pl_chroma_location_offset(chroma_loc, &plane->shift_x, &plane->shift_y); + } + } +} + +void pl_frame_from_swapchain(struct pl_frame *out_frame, + const struct pl_swapchain_frame *frame) +{ + pl_tex fbo = frame->fbo; + int num_comps = fbo->params.format->num_components; + if (!frame->color_repr.alpha) + num_comps = PL_MIN(num_comps, 3); + + *out_frame = (struct pl_frame) { + .num_planes = 1, + .planes = {{ + .texture = fbo, + .flipped = frame->flipped, + .components = num_comps, + .component_mapping = {0, 1, 2, 3}, + }}, + .crop = { 0, 0, fbo->params.w, fbo->params.h }, + .repr = frame->color_repr, + .color = frame->color_space, + }; +} + +bool pl_frame_is_cropped(const struct pl_frame *frame) +{ + int x0 = roundf(PL_MIN(frame->crop.x0, frame->crop.x1)), + y0 = roundf(PL_MIN(frame->crop.y0, frame->crop.y1)), + x1 = roundf(PL_MAX(frame->crop.x0, frame->crop.x1)), + y1 = roundf(PL_MAX(frame->crop.y0, frame->crop.y1)); + + pl_tex ref = frame->planes[frame_ref(frame)].texture; + pl_assert(ref); + + if (!x0 && !x1) + x1 = ref->params.w; + if (!y0 && !y1) + y1 = ref->params.h; + + return x0 > 0 || y0 > 0 || x1 < ref->params.w || y1 < ref->params.h; +} + +void pl_frame_clear_rgba(pl_gpu gpu, const struct pl_frame *frame, + const float rgba[4]) +{ + struct pl_color_repr repr = frame->repr; + pl_transform3x3 tr = pl_color_repr_decode(&repr, NULL); + pl_transform3x3_invert(&tr); + + float encoded[3] = { rgba[0], rgba[1], rgba[2] }; + pl_transform3x3_apply(&tr, encoded); + + float mult = frame->repr.alpha == PL_ALPHA_PREMULTIPLIED ? 
rgba[3] : 1.0; + for (int p = 0; p < frame->num_planes; p++) { + const struct pl_plane *plane = &frame->planes[p]; + float clear[4] = { 0.0, 0.0, 0.0, rgba[3] }; + for (int c = 0; c < plane->components; c++) { + int ch = plane->component_mapping[c]; + if (ch >= 0 && ch < 3) + clear[c] = mult * encoded[plane->component_mapping[c]]; + } + + pl_tex_clear(gpu, plane->texture, clear); + } +} + +struct pl_render_errors pl_renderer_get_errors(pl_renderer rr) +{ + return (struct pl_render_errors) { + .errors = rr->errors, + .disabled_hooks = rr->disabled_hooks.elem, + .num_disabled_hooks = rr->disabled_hooks.num, + }; +} + +void pl_renderer_reset_errors(pl_renderer rr, + const struct pl_render_errors *errors) +{ + if (!errors) { + // Reset everything + rr->errors = PL_RENDER_ERR_NONE; + rr->disabled_hooks.num = 0; + return; + } + + // Reset only requested errors + rr->errors &= ~errors->errors; + + // Not clearing hooks + if (!(errors->errors & PL_RENDER_ERR_HOOKS)) + goto done; + + // Remove all hook signatures + if (!errors->num_disabled_hooks) { + rr->disabled_hooks.num = 0; + goto done; + } + + // At this point we require valid array of hooks + if (!errors->disabled_hooks) { + assert(errors->disabled_hooks); + goto done; + } + + for (int i = 0; i < errors->num_disabled_hooks; i++) { + for (int j = 0; j < rr->disabled_hooks.num; j++) { + // Remove only requested hook signatures + if (rr->disabled_hooks.elem[j] == errors->disabled_hooks[i]) { + PL_ARRAY_REMOVE_AT(rr->disabled_hooks, j); + break; + } + } + } + + done: + if (rr->disabled_hooks.num) + rr->errors |= PL_RENDER_ERR_HOOKS; + return; +} diff --git a/src/shaders.c b/src/shaders.c new file mode 100644 index 0000000..503ea78 --- /dev/null +++ b/src/shaders.c @@ -0,0 +1,992 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <stdio.h> +#include <math.h> + +#include "common.h" +#include "log.h" +#include "shaders.h" + +pl_shader_info pl_shader_info_ref(pl_shader_info pinfo) +{ + struct sh_info *info = (struct sh_info *) pinfo; + if (!info) + return NULL; + + pl_rc_ref(&info->rc); + return &info->info; +} + +void pl_shader_info_deref(pl_shader_info *pinfo) +{ + struct sh_info *info = (struct sh_info *) *pinfo; + if (!info) + return; + + if (pl_rc_deref(&info->rc)) + pl_free(info); + *pinfo = NULL; +} + +static struct sh_info *sh_info_alloc(void *alloc) +{ + struct sh_info *info = pl_zalloc_ptr(alloc, info); + info->tmp = pl_tmp(info); + pl_rc_init(&info->rc); + return info; +} + +// Re-use `sh_info` allocation if possible, allocate new otherwise +static struct sh_info *sh_info_recycle(struct sh_info *info) +{ + if (!pl_rc_deref(&info->rc)) + return sh_info_alloc(NULL); + + memset(&info->info, 0, sizeof(info->info)); // reset public fields + pl_free_children(info->tmp); + pl_rc_ref(&info->rc); + info->desc.len = 0; + info->steps.num = 0; + return info; +} + +static uint8_t reverse_bits(uint8_t x) +{ + static const uint8_t reverse_nibble[16] = { + 0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe, + 0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf, + }; + + return reverse_nibble[x & 0xF] << 4 | reverse_nibble[x >> 4]; +} + +static void init_shader(pl_shader sh, const struct pl_shader_params *params) +{ + if (params) { + sh->info->info.params = *params; + + // To avoid collisions for shaders with very high number of + // identifiers, pack the shader ID into the highest bits (MSB -> LSB) + pl_static_assert(sizeof(sh->prefix) > sizeof(params->id)); + const int shift = 8 * (sizeof(sh->prefix) - sizeof(params->id)); + sh->prefix = reverse_bits(params->id) << shift; + } + + sh->name = sh_fresh(sh, "main"); +} + +pl_shader pl_shader_alloc(pl_log log, const struct pl_shader_params *params) +{ + static const int glsl_ver_req = 130; + if (params && params->glsl.version && params->glsl.version < 130) { + pl_err(log, "Requested GLSL version %d too low (required: %d)", + params->glsl.version, glsl_ver_req); + return NULL; + } + + pl_shader sh = pl_alloc_ptr(NULL, sh); + *sh = (struct pl_shader_t) { + .log = log, + .tmp = pl_tmp(sh), + .info = sh_info_alloc(NULL), + .mutable = true, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(sh->buffers); i++) + sh->buffers[i] = pl_str_builder_alloc(sh); + + init_shader(sh, params); + return sh; +} + +static void sh_obj_deref(pl_shader_obj obj); + +void sh_deref(pl_shader sh) +{ + pl_free_children(sh->tmp); + + for (int i = 0; i < sh->obj.num; i++) + sh_obj_deref(sh->obj.elem[i]); + sh->obj.num = 0; +} + +void pl_shader_free(pl_shader *psh) +{ + pl_shader sh = *psh; + if (!sh) + return; + + sh_deref(sh); + pl_shader_info_deref((pl_shader_info *) &sh->info); + pl_free_ptr(psh); +} + +void pl_shader_reset(pl_shader sh, const struct pl_shader_params *params) +{ + sh_deref(sh); + + struct pl_shader_t new = { + .log = sh->log, + .tmp = sh->tmp, + .info = sh_info_recycle(sh->info), + .data.buf = sh->data.buf, + .mutable = true, + + // Preserve array allocations + .obj.elem = sh->obj.elem, + .vas.elem = sh->vas.elem, + .vars.elem = sh->vars.elem, + .descs.elem = sh->descs.elem, + .consts.elem = sh->consts.elem, + }; + + // Preserve buffer allocations + memcpy(new.buffers, sh->buffers, sizeof(new.buffers)); + for (int i = 0; i < PL_ARRAY_SIZE(new.buffers); i++) + pl_str_builder_reset(new.buffers[i]); + + *sh = new; + init_shader(sh, params); +} + +static void *sh_alloc(pl_shader sh, size_t size, size_t 
align) +{ + const size_t offset = PL_ALIGN2(sh->data.len, align); + const size_t req_size = offset + size; + if (req_size <= pl_get_size(sh->data.buf)) { + sh->data.len = offset + size; + return sh->data.buf + offset; + } + + // We can't realloc this buffer because various pointers will be left + // dangling, so just reparent it onto `sh->tmp` (so it will be cleaned + // up when the shader is next reset) and allocate a new, larger buffer + // in its place + const size_t new_size = PL_MAX(req_size << 1, 256); + pl_steal(sh->tmp, sh->data.buf); + sh->data.buf = pl_alloc(sh, new_size); + sh->data.len = size; + return sh->data.buf; +} + +static void *sh_memdup(pl_shader sh, const void *data, size_t size, size_t align) +{ + if (!size) + return NULL; + + void *dst = sh_alloc(sh, size, align); + assert(data); + memcpy(dst, data, size); + return dst; +} + +bool pl_shader_is_failed(const pl_shader sh) +{ + return sh->failed; +} + +struct pl_glsl_version sh_glsl(const pl_shader sh) +{ + if (SH_PARAMS(sh).glsl.version) + return SH_PARAMS(sh).glsl; + + if (SH_GPU(sh)) + return SH_GPU(sh)->glsl; + + return (struct pl_glsl_version) { .version = 130 }; +} + +bool sh_try_compute(pl_shader sh, int bw, int bh, bool flex, size_t mem) +{ + pl_assert(bw && bh); + int *sh_bw = &sh->group_size[0]; + int *sh_bh = &sh->group_size[1]; + + struct pl_glsl_version glsl = sh_glsl(sh); + if (!glsl.compute) { + PL_TRACE(sh, "Disabling compute shader due to missing `compute` support"); + return false; + } + + if (sh->shmem + mem > glsl.max_shmem_size) { + PL_TRACE(sh, "Disabling compute shader due to insufficient shmem"); + return false; + } + + if (sh->type == SH_FRAGMENT) { + PL_TRACE(sh, "Disabling compute shader because shader is already marked " + "as fragment shader"); + return false; + } + + if (bw > glsl.max_group_size[0] || + bh > glsl.max_group_size[1] || + (bw * bh) > glsl.max_group_threads) + { + if (!flex) { + PL_TRACE(sh, "Disabling compute shader due to exceeded group " + "thread count."); + return false; + } else { + // Pick better group sizes + bw = PL_MIN(bw, glsl.max_group_size[0]); + bh = glsl.max_group_threads / bw; + } + } + + sh->shmem += mem; + + // If the current shader is either not a compute shader, or we have no + // choice but to override the metadata, always do so + if (sh->type != SH_COMPUTE || (sh->flexible_work_groups && !flex)) { + *sh_bw = bw; + *sh_bh = bh; + sh->type = SH_COMPUTE; + sh->flexible_work_groups = flex; + return true; + } + + // If both shaders are flexible, pick the larger of the two + if (sh->flexible_work_groups && flex) { + *sh_bw = PL_MAX(*sh_bw, bw); + *sh_bh = PL_MAX(*sh_bh, bh); + pl_assert(*sh_bw * *sh_bh <= glsl.max_group_threads); + return true; + } + + // At this point we're looking only at a non-flexible compute shader + pl_assert(sh->type == SH_COMPUTE && !sh->flexible_work_groups); + if (!flex) { + // Ensure parameters match + if (bw != *sh_bw || bh != *sh_bh) { + PL_TRACE(sh, "Disabling compute shader due to incompatible group " + "sizes %dx%d and %dx%d", *sh_bw, *sh_bh, bw, bh); + sh->shmem -= mem; + return false; + } + } + + return true; +} + +bool pl_shader_is_compute(const pl_shader sh) +{ + return sh->type == SH_COMPUTE; +} + +bool pl_shader_output_size(const pl_shader sh, int *w, int *h) +{ + if (!sh->output_w || !sh->output_h) + return false; + + *w = sh->transpose ? sh->output_h : sh->output_w; + *h = sh->transpose ? 
sh->output_w : sh->output_h; + return true; +} + +ident_t sh_fresh(pl_shader sh, const char *name) +{ + unsigned short id = ++sh->fresh; + assert(!(sh->prefix & id)); + id |= sh->prefix; + + assert(name); + return sh_mkident(id, name); +} + +static inline ident_t sh_fresh_name(pl_shader sh, const char **pname) +{ + ident_t id = sh_fresh(sh, *pname); + *pname = sh_ident_pack(id); + return id; +} + +ident_t sh_var(pl_shader sh, struct pl_shader_var sv) +{ + ident_t id = sh_fresh_name(sh, &sv.var.name); + struct pl_var_layout layout = pl_var_host_layout(0, &sv.var); + sv.data = sh_memdup(sh, sv.data, layout.size, layout.stride); + PL_ARRAY_APPEND(sh, sh->vars, sv); + return id; +} + +ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_int(name), + .data = &val, + .dynamic = dynamic, + }); +} + +ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_uint(name), + .data = &val, + .dynamic = dynamic, + }); +} + +ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float(name), + .data = &val, + .dynamic = dynamic, + }); +} + +ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val) +{ + return sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3(name), + .data = PL_TRANSPOSE_3X3(val.m), + }); +} + +ident_t sh_desc(pl_shader sh, struct pl_shader_desc sd) +{ + switch (sd.desc.type) { + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: + for (int i = 0; i < sh->descs.num; i++) // ensure uniqueness + pl_assert(sh->descs.elem[i].binding.object != sd.binding.object); + size_t bsize = sizeof(sd.buffer_vars[0]) * sd.num_buffer_vars; + sd.buffer_vars = sh_memdup(sh, sd.buffer_vars, bsize, + alignof(struct pl_buffer_var)); + for (int i = 0; i < sd.num_buffer_vars; i++) { + struct pl_var *bv = &sd.buffer_vars[i].var; + const char *name = bv->name; + GLSLP("#define %s "$"\n", name, sh_fresh_name(sh, &bv->name)); + } + break; + + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: + pl_assert(!sd.num_buffer_vars); + break; + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + + ident_t id = sh_fresh_name(sh, &sd.desc.name); + PL_ARRAY_APPEND(sh, sh->descs, sd); + return id; +} + +ident_t sh_const(pl_shader sh, struct pl_shader_const sc) +{ + if (SH_PARAMS(sh).dynamic_constants && !sc.compile_time) { + return sh_var(sh, (struct pl_shader_var) { + .var = { + .name = sc.name, + .type = sc.type, + .dim_v = 1, + .dim_m = 1, + .dim_a = 1, + }, + .data = sc.data, + }); + } + + ident_t id = sh_fresh_name(sh, &sc.name); + + pl_gpu gpu = SH_GPU(sh); + if (gpu && gpu->limits.max_constants) { + if (!sc.compile_time || gpu->limits.array_size_constants) { + size_t size = pl_var_type_size(sc.type); + sc.data = sh_memdup(sh, sc.data, size, size); + PL_ARRAY_APPEND(sh, sh->consts, sc); + return id; + } + } + + // Fallback for GPUs without specialization constants + switch (sc.type) { + case PL_VAR_SINT: + GLSLH("const int "$" = %d; \n", id, *(int *) sc.data); + return id; + case PL_VAR_UINT: + GLSLH("const uint "$" = uint(%u); \n", id, *(unsigned int *) sc.data); + return id; + case PL_VAR_FLOAT: + GLSLH("const float "$" = float(%f); \n", id, *(float *) sc.data); + return id; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + break; + } + + 
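+    // Note: every valid constant type (PL_VAR_SINT/UINT/FLOAT) returns from
+    // the switch above; only PL_VAR_INVALID and PL_VAR_TYPE_COUNT can fall
+    // through to this point, so this really is unreachable.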
pl_unreachable(); +} + +ident_t sh_const_int(pl_shader sh, const char *name, int val) +{ + return sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_SINT, + .name = name, + .data = &val, + }); +} + +ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val) +{ + return sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_UINT, + .name = name, + .data = &val, + }); +} + +ident_t sh_const_float(pl_shader sh, const char *name, float val) +{ + return sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_FLOAT, + .name = name, + .data = &val, + }); +} + +ident_t sh_attr(pl_shader sh, struct pl_shader_va sva) +{ + const size_t vsize = sva.attr.fmt->texel_size; + uint8_t *data = sh_alloc(sh, vsize * 4, vsize); + for (int i = 0; i < 4; i++) { + memcpy(data, sva.data[i], vsize); + sva.data[i] = data; + data += vsize; + } + + ident_t id = sh_fresh_name(sh, &sva.attr.name); + PL_ARRAY_APPEND(sh, sh->vas, sva); + return id; +} + +ident_t sh_attr_vec2(pl_shader sh, const char *name, const pl_rect2df *rc) +{ + pl_gpu gpu = SH_GPU(sh); + if (!gpu) { + SH_FAIL(sh, "Failed adding vertex attr '%s': No GPU available!", name); + return NULL_IDENT; + } + + pl_fmt fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2); + if (!fmt) { + SH_FAIL(sh, "Failed adding vertex attr '%s': no vertex fmt!", name); + return NULL_IDENT; + } + + float verts[4][2] = { + { rc->x0, rc->y0 }, + { rc->x1, rc->y0 }, + { rc->x0, rc->y1 }, + { rc->x1, rc->y1 }, + }; + + return sh_attr(sh, (struct pl_shader_va) { + .attr = { + .name = name, + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + }, + .data = { verts[0], verts[1], verts[2], verts[3] }, + }); +} + +ident_t sh_bind(pl_shader sh, pl_tex tex, + enum pl_tex_address_mode address_mode, + enum pl_tex_sample_mode sample_mode, + const char *name, const pl_rect2df *rect, + ident_t *out_pos, ident_t *out_pt) +{ + if (pl_tex_params_dimension(tex->params) != 2) { + SH_FAIL(sh, "Failed binding texture '%s': not a 2D texture!", name); + return NULL_IDENT; + } + + if (!tex->params.sampleable) { + SH_FAIL(sh, "Failed binding texture '%s': texture not sampleable!", name); + return NULL_IDENT; + } + + ident_t itex = sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = name, + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = tex, + .address_mode = address_mode, + .sample_mode = sample_mode, + }, + }); + + float sx, sy; + if (tex->sampler_type == PL_SAMPLER_RECT) { + sx = 1.0; + sy = 1.0; + } else { + sx = 1.0 / tex->params.w; + sy = 1.0 / tex->params.h; + } + + if (out_pos) { + pl_rect2df full = { + .x1 = tex->params.w, + .y1 = tex->params.h, + }; + + rect = PL_DEF(rect, &full); + *out_pos = sh_attr_vec2(sh, "tex_coord", &(pl_rect2df) { + .x0 = sx * rect->x0, .y0 = sy * rect->y0, + .x1 = sx * rect->x1, .y1 = sy * rect->y1, + }); + } + + if (out_pt) { + *out_pt = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tex_pt"), + .data = &(float[2]) {sx, sy}, + }); + } + + return itex; +} + +bool sh_buf_desc_append(void *alloc, pl_gpu gpu, + struct pl_shader_desc *buf_desc, + struct pl_var_layout *out_layout, + const struct pl_var new_var) +{ + struct pl_buffer_var bv = { .var = new_var }; + size_t cur_size = sh_buf_desc_size(buf_desc); + + switch (buf_desc->desc.type) { + case PL_DESC_BUF_UNIFORM: + bv.layout = pl_std140_layout(cur_size, &new_var); + if (bv.layout.offset + bv.layout.size > gpu->limits.max_ubo_size) + return false; + break; + case PL_DESC_BUF_STORAGE: + bv.layout = pl_std430_layout(cur_size, &new_var); + if (bv.layout.offset + 
bv.layout.size > gpu->limits.max_ssbo_size) + return false; + break; + case PL_DESC_INVALID: + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + + if (out_layout) + *out_layout = bv.layout; + PL_ARRAY_APPEND_RAW(alloc, buf_desc->buffer_vars, buf_desc->num_buffer_vars, bv); + return true; +} + +size_t sh_buf_desc_size(const struct pl_shader_desc *buf_desc) +{ + if (!buf_desc->num_buffer_vars) + return 0; + + const struct pl_buffer_var *last; + last = &buf_desc->buffer_vars[buf_desc->num_buffer_vars - 1]; + return last->layout.offset + last->layout.size; +} + +void sh_describef(pl_shader sh, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + sh_describe(sh, pl_vasprintf(sh->info->tmp, fmt, ap)); + va_end(ap); +} + +static const char *insigs[] = { + [PL_SHADER_SIG_NONE] = "", + [PL_SHADER_SIG_COLOR] = "vec4 color", +}; + +static const char *outsigs[] = { + [PL_SHADER_SIG_NONE] = "void", + [PL_SHADER_SIG_COLOR] = "vec4", +}; + +static const char *retvals[] = { + [PL_SHADER_SIG_NONE] = "", + [PL_SHADER_SIG_COLOR] = "return color;", +}; + +// libplacebo currently only allows 2D samplers for shader signatures +static const char *samplers2D[] = { + [PL_SAMPLER_NORMAL] = "sampler2D", + [PL_SAMPLER_RECT] = "sampler2DRect", + [PL_SAMPLER_EXTERNAL] = "samplerExternalOES", +}; + +ident_t sh_subpass(pl_shader sh, pl_shader sub) +{ + pl_assert(sh->mutable); + + if (sh->prefix == sub->prefix) { + PL_TRACE(sh, "Can't merge shaders: conflicting identifiers!"); + return NULL_IDENT; + } + + // Check for shader compatibility + int res_w = PL_DEF(sh->output_w, sub->output_w), + res_h = PL_DEF(sh->output_h, sub->output_h); + + if ((sub->output_w && res_w != sub->output_w) || + (sub->output_h && res_h != sub->output_h)) + { + PL_TRACE(sh, "Can't merge shaders: incompatible sizes: %dx%d and %dx%d", + sh->output_w, sh->output_h, sub->output_w, sub->output_h); + return NULL_IDENT; + } + + if (sub->type == SH_COMPUTE) { + int subw = sub->group_size[0], + subh = sub->group_size[1]; + bool flex = sub->flexible_work_groups; + + if (!sh_try_compute(sh, subw, subh, flex, sub->shmem)) { + PL_TRACE(sh, "Can't merge shaders: incompatible block sizes or " + "exceeded shared memory resource capabilities"); + return NULL_IDENT; + } + } + + sh->output_w = res_w; + sh->output_h = res_h; + + // Append the prelude and header + pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sub->buffers[SH_BUF_PRELUDE]); + pl_str_builder_concat(sh->buffers[SH_BUF_HEADER], sub->buffers[SH_BUF_HEADER]); + + // Append the body as a new header function + if (sub->input == PL_SHADER_SIG_SAMPLER) { + pl_assert(sub->sampler_prefix); + GLSLH("%s "$"(%c%s src_tex, vec2 tex_coord) {\n", + outsigs[sub->output], sub->name, + sub->sampler_prefix, samplers2D[sub->sampler_type]); + } else { + GLSLH("%s "$"(%s) {\n", + outsigs[sub->output], sub->name, insigs[sub->input]); + } + pl_str_builder_concat(sh->buffers[SH_BUF_HEADER], sub->buffers[SH_BUF_BODY]); + GLSLH("%s\n}\n\n", retvals[sub->output]); + + // Steal all inputs and objects from the subpass +#define ARRAY_STEAL(arr) do \ +{ \ + PL_ARRAY_CONCAT(sh, sh->arr, sub->arr); \ + sub->arr.num = 0; \ +} while (0) + + ARRAY_STEAL(obj); + ARRAY_STEAL(vas); + ARRAY_STEAL(vars); + ARRAY_STEAL(descs); + ARRAY_STEAL(consts); +#undef ARRAY_STEAL + + // Steal the scratch buffer (if it holds data) + if (sub->data.len) { + pl_steal(sh->tmp, sub->data.buf); + sub->data = (pl_str) {0}; + } 
+ + // Steal all temporary allocations and mark the child as unusable + pl_steal(sh->tmp, sub->tmp); + sub->tmp = pl_tmp(sub); + sub->failed = true; + + // Steal the shader steps array (and allocations) + pl_assert(pl_rc_count(&sub->info->rc) == 1); + PL_ARRAY_CONCAT(sh->info, sh->info->steps, sub->info->steps); + pl_steal(sh->info->tmp, sub->info->tmp); + sub->info->tmp = pl_tmp(sub->info); + sub->info->steps.num = 0; // sanity + + return sub->name; +} + +pl_str_builder sh_finalize_internal(pl_shader sh) +{ + pl_assert(sh->mutable); // this function should only ever be called once + if (sh->failed) + return NULL; + + // Padding for readability + GLSLP("\n"); + + // Concatenate everything onto the prelude to form the final output + pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_HEADER]); + + if (sh->input == PL_SHADER_SIG_SAMPLER) { + pl_assert(sh->sampler_prefix); + GLSLP("%s "$"(%c%s src_tex, vec2 tex_coord) {\n", + outsigs[sh->output], sh->name, + sh->sampler_prefix, + samplers2D[sh->sampler_type]); + } else { + GLSLP("%s "$"(%s) {\n", outsigs[sh->output], sh->name, insigs[sh->input]); + } + + pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_BODY]); + pl_str_builder_concat(sh->buffers[SH_BUF_PRELUDE], sh->buffers[SH_BUF_FOOTER]); + GLSLP("%s\n}\n\n", retvals[sh->output]); + + // Generate the shader info + struct sh_info *info = sh->info; + info->info.steps = info->steps.elem; + info->info.num_steps = info->steps.num; + info->info.description = "(unknown shader)"; + + // Generate pretty description + for (int i = 0; i < info->steps.num; i++) { + const char *step = info->steps.elem[i]; + + // Prevent duplicates. We're okay using a weak equality check here + // because most pass descriptions are static strings. + for (int j = 0; j < i; j++) { + if (info->steps.elem[j] == step) + goto next_step; + } + + int count = 1; + for (int j = i+1; j < info->steps.num; j++) { + if (info->steps.elem[j] == step) + count++; + } + + const char *prefix = i > 0 ? 
", " : ""; + if (count > 1) { + pl_str_append_asprintf(info, &info->desc, "%s%s x%d", + prefix, step, count); + } else { + pl_str_append_asprintf(info, &info->desc, "%s%s", prefix, step); + } + +next_step: ; + } + + if (info->desc.len) + info->info.description = (char *) info->desc.buf; + + sh->mutable = false; + return sh->buffers[SH_BUF_PRELUDE]; +} + +const struct pl_shader_res *pl_shader_finalize(pl_shader sh) +{ + if (sh->failed) { + return NULL; + } else if (!sh->mutable) { + return &sh->result; + } + + pl_shader_info info = &sh->info->info; + pl_str_builder glsl = sh_finalize_internal(sh); + + // Turn ident_t into friendly strings before passing it to users +#define FIX_IDENT(name) \ + name = sh_ident_tostr(sh_ident_unpack(name)) + for (int i = 0; i < sh->vas.num; i++) + FIX_IDENT(sh->vas.elem[i].attr.name); + for (int i = 0; i < sh->vars.num; i++) + FIX_IDENT(sh->vars.elem[i].var.name); + for (int i = 0; i < sh->consts.num; i++) + FIX_IDENT(sh->consts.elem[i].name); + for (int i = 0; i < sh->descs.num; i++) { + struct pl_shader_desc *sd = &sh->descs.elem[i]; + FIX_IDENT(sd->desc.name); + for (int j = 0; j < sd->num_buffer_vars; sd++) + FIX_IDENT(sd->buffer_vars[j].var.name); + } +#undef FIX_IDENT + + sh->result = (struct pl_shader_res) { + .info = info, + .glsl = (char *) pl_str_builder_exec(glsl).buf, + .name = sh_ident_tostr(sh->name), + .input = sh->input, + .output = sh->output, + .compute_group_size = { sh->group_size[0], sh->group_size[1] }, + .compute_shmem = sh->shmem, + .vertex_attribs = sh->vas.elem, + .num_vertex_attribs = sh->vas.num, + .variables = sh->vars.elem, + .num_variables = sh->vars.num, + .descriptors = sh->descs.elem, + .num_descriptors = sh->descs.num, + .constants = sh->consts.elem, + .num_constants = sh->consts.num, + // deprecated fields + .params = info->params, + .steps = info->steps, + .num_steps = info->num_steps, + .description = info->description, + }; + + return &sh->result; +} + +bool sh_require(pl_shader sh, enum pl_shader_sig insig, int w, int h) +{ + if (sh->failed) { + SH_FAIL(sh, "Attempting to modify a failed shader!"); + return false; + } + + if (!sh->mutable) { + SH_FAIL(sh, "Attempted to modify an immutable shader!"); + return false; + } + + if ((w && sh->output_w && sh->output_w != w) || + (h && sh->output_h && sh->output_h != h)) + { + SH_FAIL(sh, "Illegal sequence of shader operations: Incompatible " + "output size requirements %dx%d and %dx%d", + sh->output_w, sh->output_h, w, h); + return false; + } + + static const char *names[] = { + [PL_SHADER_SIG_NONE] = "PL_SHADER_SIG_NONE", + [PL_SHADER_SIG_COLOR] = "PL_SHADER_SIG_COLOR", + }; + + // If we require an input, but there is none available - just get it from + // the user by turning it into an explicit input signature. + if (!sh->output && insig) { + pl_assert(!sh->input); + sh->input = insig; + } else if (sh->output != insig) { + SH_FAIL(sh, "Illegal sequence of shader operations! 
Current output " + "signature is '%s', but called operation expects '%s'!", + names[sh->output], names[insig]); + return false; + } + + // All of our shaders end up returning a vec4 color + sh->output = PL_SHADER_SIG_COLOR; + sh->output_w = PL_DEF(sh->output_w, w); + sh->output_h = PL_DEF(sh->output_h, h); + return true; +} + +static void sh_obj_deref(pl_shader_obj obj) +{ + if (!pl_rc_deref(&obj->rc)) + return; + + if (obj->uninit) + obj->uninit(obj->gpu, obj->priv); + + pl_free(obj); +} + +void pl_shader_obj_destroy(pl_shader_obj *ptr) +{ + pl_shader_obj obj = *ptr; + if (!obj) + return; + + sh_obj_deref(obj); + *ptr = NULL; +} + +void *sh_require_obj(pl_shader sh, pl_shader_obj *ptr, + enum pl_shader_obj_type type, size_t priv_size, + void (*uninit)(pl_gpu gpu, void *priv)) +{ + if (!ptr) + return NULL; + + pl_shader_obj obj = *ptr; + if (obj && obj->gpu != SH_GPU(sh)) { + SH_FAIL(sh, "Passed pl_shader_obj belongs to different GPU!"); + return NULL; + } + + if (obj && obj->type != type) { + SH_FAIL(sh, "Passed pl_shader_obj of wrong type! Shader objects must " + "always be used with the same type of shader."); + return NULL; + } + + if (!obj) { + obj = pl_zalloc_ptr(NULL, obj); + pl_rc_init(&obj->rc); + obj->gpu = SH_GPU(sh); + obj->type = type; + obj->priv = pl_zalloc(obj, priv_size); + obj->uninit = uninit; + } + + PL_ARRAY_APPEND(sh, sh->obj, obj); + pl_rc_ref(&obj->rc); + + *ptr = obj; + return obj->priv; +} + +ident_t sh_prng(pl_shader sh, bool temporal, ident_t *p_state) +{ + ident_t randfun = sh_fresh(sh, "rand"), + state = sh_fresh(sh, "state"); + + // Based on pcg3d (http://jcgt.org/published/0009/03/02/) + GLSLP("#define prng_t uvec3\n"); + GLSLH("vec3 "$"(inout uvec3 s) { \n" + " s = 1664525u * s + uvec3(1013904223u); \n" + " s.x += s.y * s.z; \n" + " s.y += s.z * s.x; \n" + " s.z += s.x * s.y; \n" + " s ^= s >> 16u; \n" + " s.x += s.y * s.z; \n" + " s.y += s.z * s.x; \n" + " s.z += s.x * s.y; \n" + " return vec3(s) * 1.0/float(0xFFFFFFFFu); \n" + "} \n", + randfun); + + if (temporal) { + GLSL("uvec3 "$" = uvec3(gl_FragCoord.xy, "$"); \n", + state, SH_UINT_DYN(SH_PARAMS(sh).index)); + } else { + GLSL("uvec3 "$" = uvec3(gl_FragCoord.xy, 0.0); \n", state); + } + + if (p_state) + *p_state = state; + + ident_t res = sh_fresh(sh, "RAND"); + GLSLH("#define "$" ("$"("$"))\n", res, randfun, state); + return res; +} diff --git a/src/shaders.h b/src/shaders.h new file mode 100644 index 0000000..7656a35 --- /dev/null +++ b/src/shaders.h @@ -0,0 +1,387 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdio.h> +#include <limits.h> + +#include "common.h" +#include "cache.h" +#include "log.h" +#include "gpu.h" + +#include <libplacebo/shaders.h> + +// This represents an identifier (e.g. name of function, uniform etc.) for +// a shader resource. Not human-readable. 
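+//
+// Illustrative sketch only (hypothetical values, not part of the upstream
+// header): an ident_t such as 0x801f is rendered into the generated GLSL as
+// the token "_801f" via the `$` format macro below, and sh_ident_pack() /
+// sh_ident_unpack() round-trip the same value through a `const char *` field
+// by tagging it with IDENT_SENTINEL:
+//
+//     ident_t id = 0x801f;
+//     const char *packed = sh_ident_pack(id);    // tagged value, not a real C string
+//     assert(sh_ident_unpack(packed) == 0x801f);
+//     printf($ "\n", id);                        // prints "_801f"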
+ +typedef unsigned short ident_t; +#define $ "_%hx" +#define NULL_IDENT 0u + +#define sh_mkident(id, name) ((ident_t) id) +#define sh_ident_tostr(id) pl_asprintf(sh->tmp, $, id) + +enum { + IDENT_BITS = 8 * sizeof(ident_t), + IDENT_MASK = (uintptr_t) USHRT_MAX, + IDENT_SENTINEL = (uintptr_t) 0x20230319 << IDENT_BITS, +}; + +// Functions to pack/unpack an identifier into a `const char *` name field. +// Used to defer string templating of friendly names until actually necessary +static inline const char *sh_ident_pack(ident_t id) +{ + return (const char *)(uintptr_t) (IDENT_SENTINEL | id); +} + +static inline ident_t sh_ident_unpack(const char *name) +{ + uintptr_t uname = (uintptr_t) name; + assert((uname & ~IDENT_MASK) == IDENT_SENTINEL); + return uname & IDENT_MASK; +} + +enum pl_shader_buf { + SH_BUF_PRELUDE, // extra #defines etc. + SH_BUF_HEADER, // previous passes, helper function definitions, etc. + SH_BUF_BODY, // partial contents of the "current" function + SH_BUF_FOOTER, // will be appended to the end of the current function + SH_BUF_COUNT, +}; + +enum pl_shader_type { + SH_AUTO, + SH_COMPUTE, + SH_FRAGMENT +}; + +struct sh_info { + // public-facing struct + struct pl_shader_info_t info; + + // internal fields + void *tmp; + pl_rc_t rc; + pl_str desc; + PL_ARRAY(const char *) steps; +}; + +struct pl_shader_t { + pl_log log; + void *tmp; // temporary allocations (freed on pl_shader_reset) + struct sh_info *info; + pl_str data; // pooled/recycled scratch buffer for small allocations + PL_ARRAY(pl_shader_obj) obj; + bool failed; + bool mutable; + ident_t name; + enum pl_shader_sig input, output; + int output_w; + int output_h; + bool transpose; + pl_str_builder buffers[SH_BUF_COUNT]; + enum pl_shader_type type; + bool flexible_work_groups; + int group_size[2]; + size_t shmem; + enum pl_sampler_type sampler_type; + char sampler_prefix; + unsigned short prefix; // pre-processed version of res.params.id + unsigned short fresh; + + // Note: internally, these `pl_shader_va` etc. use raw ident_t fields + // instead of `const char *` wherever a name is required! These are + // translated to legal strings either in `pl_shader_finalize`, or inside + // the `pl_dispatch` shader compilation step. + PL_ARRAY(struct pl_shader_va) vas; + PL_ARRAY(struct pl_shader_var) vars; + PL_ARRAY(struct pl_shader_desc) descs; + PL_ARRAY(struct pl_shader_const) consts; + + // cached result of `pl_shader_finalize` + struct pl_shader_res result; +}; + +// Free temporary resources associated with a shader. Normally called by +// pl_shader_reset(), but used internally to reduce memory waste. +void sh_deref(pl_shader sh); + +// Same as `pl_shader_finalize` but doesn't generate `sh->res`, instead returns +// the string builder to be used to finalize the shader. Assumes the caller +// will access the shader's internal fields directly. +pl_str_builder sh_finalize_internal(pl_shader sh); + +// Helper functions for convenience +#define SH_PARAMS(sh) ((sh)->info->info.params) +#define SH_GPU(sh) (SH_PARAMS(sh).gpu) +#define SH_CACHE(sh) pl_gpu_cache(SH_GPU(sh)) + +// Returns the GLSL version, defaulting to desktop 130. +struct pl_glsl_version sh_glsl(const pl_shader sh); + +#define SH_FAIL(sh, ...) do { \ + sh->failed = true; \ + PL_ERR(sh, __VA_ARGS__); \ + } while (0) + +// Attempt enabling compute shaders for this pass, if possible +bool sh_try_compute(pl_shader sh, int bw, int bh, bool flex, size_t mem); + +// Attempt merging a secondary shader into the current shader. Returns NULL if +// merging fails (e.g. 
incompatible signatures); otherwise returns an identifier +// corresponding to the generated subpass function. +// +// If successful, the subpass shader is set to an undefined failure state and +// must be explicitly reset/aborted before being re-used. +ident_t sh_subpass(pl_shader sh, pl_shader sub); + +// Helpers for adding new variables/descriptors/etc. with fresh, unique +// identifier names. These will never conflict with other identifiers, even +// if the shaders are merged together. +ident_t sh_fresh(pl_shader sh, const char *name); + +// Add a new shader var and return its identifier +ident_t sh_var(pl_shader sh, struct pl_shader_var sv); + +// Helper functions for `sh_var` +ident_t sh_var_int(pl_shader sh, const char *name, int val, bool dynamic); +ident_t sh_var_uint(pl_shader sh, const char *name, unsigned int val, bool dynamic); +ident_t sh_var_float(pl_shader sh, const char *name, float val, bool dynamic); +ident_t sh_var_mat3(pl_shader sh, const char *name, pl_matrix3x3 val); +#define SH_INT_DYN(val) sh_var_int(sh, "const", val, true) +#define SH_UINT_DYN(val) sh_var_uint(sh, "const", val, true) +#define SH_FLOAT_DYN(val) sh_var_float(sh, "const", val, true) +#define SH_MAT3(val) sh_var_mat3(sh, "mat", val) + +// Add a new shader desc and return its identifier. +ident_t sh_desc(pl_shader sh, struct pl_shader_desc sd); + +// Add a new shader constant and return its identifier. +ident_t sh_const(pl_shader sh, struct pl_shader_const sc); + +// Helper functions for `sh_const` +ident_t sh_const_int(pl_shader sh, const char *name, int val); +ident_t sh_const_uint(pl_shader sh, const char *name, unsigned int val); +ident_t sh_const_float(pl_shader sh, const char *name, float val); +#define SH_INT(val) sh_const_int(sh, "const", val) +#define SH_UINT(val) sh_const_uint(sh, "const", val) +#define SH_FLOAT(val) sh_const_float(sh, "const", val) + +// Add a new shader va and return its identifier +ident_t sh_attr(pl_shader sh, struct pl_shader_va sva); + +// Helper to add a a vec2 VA from a pl_rect2df. Returns NULL_IDENT on failure. +ident_t sh_attr_vec2(pl_shader sh, const char *name, const pl_rect2df *rc); + +// Bind a texture under a given transformation and make its attributes +// available as well. If an output pointer for one of the attributes is left +// as NULL, that attribute will not be added. Returns NULL on failure. `rect` +// is optional, and defaults to the full texture if left as NULL. +// +// Note that for e.g. compute shaders, the vec2 out_pos might be a macro that +// expands to an expensive computation, and should be cached by the user. +ident_t sh_bind(pl_shader sh, pl_tex tex, + enum pl_tex_address_mode address_mode, + enum pl_tex_sample_mode sample_mode, + const char *name, const pl_rect2df *rect, + ident_t *out_pos, ident_t *out_pt); + +// Incrementally build up a buffer by adding new variable elements to the +// buffer, resizing buf.buffer_vars if necessary. Returns whether or not the +// variable could be successfully added (which may fail if you try exceeding +// the size limits of the buffer type). If successful, the layout is stored +// in *out_layout (may be NULL). +bool sh_buf_desc_append(void *alloc, pl_gpu gpu, + struct pl_shader_desc *buf_desc, + struct pl_var_layout *out_layout, + const struct pl_var new_var); + +size_t sh_buf_desc_size(const struct pl_shader_desc *buf_desc); + + +// Underlying function for appending text to a shader +#define sh_append(sh, buf, ...) 
\ + pl_str_builder_addf((sh)->buffers[buf], __VA_ARGS__) + +#define sh_append_str(sh, buf, str) \ + pl_str_builder_str((sh)->buffers[buf], str) + +#define GLSLP(...) sh_append(sh, SH_BUF_PRELUDE, __VA_ARGS__) +#define GLSLH(...) sh_append(sh, SH_BUF_HEADER, __VA_ARGS__) +#define GLSL(...) sh_append(sh, SH_BUF_BODY, __VA_ARGS__) +#define GLSLF(...) sh_append(sh, SH_BUF_FOOTER, __VA_ARGS__) + +// Attach a description to a shader +void sh_describef(pl_shader sh, const char *fmt, ...) + PL_PRINTF(2, 3); + +static inline void sh_describe(pl_shader sh, const char *desc) +{ + PL_ARRAY_APPEND(sh->info, sh->info->steps, desc); +}; + +// Requires that the share is mutable, has an output signature compatible +// with the given input signature, as well as an output size compatible with +// the given size requirements. Errors and returns false otherwise. +bool sh_require(pl_shader sh, enum pl_shader_sig insig, int w, int h); + +// Shader resources + +enum pl_shader_obj_type { + PL_SHADER_OBJ_INVALID = 0, + PL_SHADER_OBJ_COLOR_MAP, + PL_SHADER_OBJ_SAMPLER, + PL_SHADER_OBJ_DITHER, + PL_SHADER_OBJ_LUT, + PL_SHADER_OBJ_AV1_GRAIN, + PL_SHADER_OBJ_FILM_GRAIN, + PL_SHADER_OBJ_RESHAPE, +}; + +struct pl_shader_obj_t { + enum pl_shader_obj_type type; + pl_rc_t rc; + pl_gpu gpu; + void (*uninit)(pl_gpu gpu, void *priv); + void *priv; +}; + +// Returns (*ptr)->priv, or NULL on failure +void *sh_require_obj(pl_shader sh, pl_shader_obj *ptr, + enum pl_shader_obj_type type, size_t priv_size, + void (*uninit)(pl_gpu gpu, void *priv)); + +#define SH_OBJ(sh, ptr, type, t, uninit) \ + ((t*) sh_require_obj(sh, ptr, type, sizeof(t), uninit)) + +// Initializes a PRNG. The resulting string will directly evaluate to a +// pseudorandom, uniformly distributed vec3 from [0.0,1.0]. Since this +// algorithm works by mutating a state variable, if the user wants to use the +// resulting PRNG inside a subfunction, they must add an extra `inout prng_t %s` +// with the contents of `state` to the signature. (Optional) +// +// If `temporal` is set, the PRNG will vary across frames. +ident_t sh_prng(pl_shader sh, bool temporal, ident_t *state); + +// Backing memory type +enum sh_lut_type { + SH_LUT_AUTO = 0, // pick whatever makes the most sense + SH_LUT_TEXTURE, // upload as texture + SH_LUT_UNIFORM, // uniform array + SH_LUT_LITERAL, // constant / literal array in shader source (fallback) +}; + +// Interpolation method +enum sh_lut_method { + SH_LUT_NONE = 0, // no interpolation, integer indices + SH_LUT_LINEAR, // linear interpolation, vecN indices in range [0,1] + SH_LUT_CUBIC, // (bi/tri)cubic interpolation + SH_LUT_TETRAHEDRAL, // tetrahedral interpolation for vec3, equivalent to + // SH_LUT_LINEAR for lower dimensions +}; + +struct sh_lut_params { + pl_shader_obj *object; + + // Type of the LUT we intend to generate. + // + // Note: If `var_type` is PL_VAR_*INT, `method` must be SH_LUT_NONE. + enum pl_var_type var_type; + enum sh_lut_type lut_type; + enum sh_lut_method method; + + // For SH_LUT_TEXTURE, this can be used to override the texture's internal + // format, in which case it takes precedence over the default for `type`. + pl_fmt fmt; + + // LUT dimensions. Unused dimensions may be left as 0. + int width; + int height; + int depth; + int comps; + + // If true, the LUT will always be regenerated, even if the dimensions have + // not changed. + bool update; + + // Alternate way of triggering shader invalidations. If the signature + // does not match the LUT's signature, it will be regenerated. 
+ uint64_t signature; + + // If set to true, shader objects will be preserved and updated in-place + // rather than being treated as read-only. + bool dynamic; + + // If set , generated shader objects are automatically cached in this + // cache. Requires `signature` to be set (and uniquely identify the LUT). + pl_cache cache; + + // Will be called with a zero-initialized buffer whenever the data needs to + // be computed, which happens whenever the size is changed, the shader + // object is invalidated, or `update` is set to true. + // + // Note: Interpretation of `data` is according to `type` and `fmt`. + void (*fill)(void *data, const struct sh_lut_params *params); + void *priv; + + // Debug tag to track LUT source + pl_debug_tag debug_tag; +}; + +#define sh_lut_params(...) (&(struct sh_lut_params) { \ + .debug_tag = PL_DEBUG_TAG, \ + __VA_ARGS__ \ + }) + +// Makes a table of values available as a shader variable, using an a given +// method (falling back if needed). The resulting identifier can be sampled +// directly as %s(pos), where pos is a vector with the right number of +// dimensions. `pos` must be an integer vector within the bounds of the array, +// unless the method is `SH_LUT_LINEAR`, in which case it's a float vector that +// gets interpolated and clamped as needed. Returns NULL on error. +ident_t sh_lut(pl_shader sh, const struct sh_lut_params *params); + +static inline uint8_t sh_num_comps(uint8_t mask) +{ + pl_assert((mask & 0xF) == mask); + return __builtin_popcount(mask); +} + +static inline const char *sh_float_type(uint8_t mask) +{ + switch (sh_num_comps(mask)) { + case 1: return "float"; + case 2: return "vec2"; + case 3: return "vec3"; + case 4: return "vec4"; + } + + pl_unreachable(); +} + +static inline const char *sh_swizzle(uint8_t mask) +{ + static const char * const swizzles[0x10] = { + NULL, "r", "g", "rg", "b", "rb", "gb", "rgb", + "a", "ra", "ga", "rga", "ba", "rba", "gba", "rgba", + }; + + pl_assert(mask <= PL_ARRAY_SIZE(swizzles)); + return swizzles[mask]; +} diff --git a/src/shaders/colorspace.c b/src/shaders/colorspace.c new file mode 100644 index 0000000..c7b3b5a --- /dev/null +++ b/src/shaders/colorspace.c @@ -0,0 +1,2120 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> + +#include "cache.h" +#include "shaders.h" + +#include <libplacebo/shaders/colorspace.h> + +// Common constants for SMPTE ST.2084 (PQ) +static const float PQ_M1 = 2610./4096 * 1./4, + PQ_M2 = 2523./4096 * 128, + PQ_C1 = 3424./4096, + PQ_C2 = 2413./4096 * 32, + PQ_C3 = 2392./4096 * 32; + +// Common constants for ARIB STD-B67 (HLG) +static const float HLG_A = 0.17883277, + HLG_B = 0.28466892, + HLG_C = 0.55991073, + HLG_REF = 1000.0 / PL_COLOR_SDR_WHITE; + +// Common constants for Panasonic V-Log +static const float VLOG_B = 0.00873, + VLOG_C = 0.241514, + VLOG_D = 0.598206; + +// Common constants for Sony S-Log +static const float SLOG_A = 0.432699, + SLOG_B = 0.037584, + SLOG_C = 0.616596 + 0.03, + SLOG_P = 3.538813, + SLOG_Q = 0.030001, + SLOG_K2 = 155.0 / 219.0; + +void pl_shader_set_alpha(pl_shader sh, struct pl_color_repr *repr, + enum pl_alpha_mode mode) +{ + if (repr->alpha == PL_ALPHA_PREMULTIPLIED && mode == PL_ALPHA_INDEPENDENT) { + GLSL("if (color.a > 1e-6) \n" + " color.rgb /= vec3(color.a); \n"); + repr->alpha = PL_ALPHA_INDEPENDENT; + } + + if (repr->alpha == PL_ALPHA_INDEPENDENT && mode == PL_ALPHA_PREMULTIPLIED) { + GLSL("color.rgb *= vec3(color.a); \n"); + repr->alpha = PL_ALPHA_PREMULTIPLIED; + } +} + +#ifdef PL_HAVE_DOVI +static inline void reshape_mmr(pl_shader sh, ident_t mmr, bool single, + int min_order, int max_order) +{ + if (single) { + GLSL("const uint mmr_idx = 0u; \n"); + } else { + GLSL("uint mmr_idx = uint(coeffs.y); \n"); + } + + assert(min_order <= max_order); + if (min_order < max_order) + GLSL("uint order = uint(coeffs.w); \n"); + + GLSL("vec4 sigX; \n" + "s = coeffs.x; \n" + "sigX.xyz = sig.xxy * sig.yzz; \n" + "sigX.w = sigX.x * sig.z; \n" + "s += dot("$"[mmr_idx + 0].xyz, sig); \n" + "s += dot("$"[mmr_idx + 1], sigX); \n", + mmr, mmr); + + if (max_order >= 2) { + if (min_order < 2) + GLSL("if (order >= 2) { \n"); + + GLSL("vec3 sig2 = sig * sig; \n" + "vec4 sigX2 = sigX * sigX; \n" + "s += dot("$"[mmr_idx + 2].xyz, sig2); \n" + "s += dot("$"[mmr_idx + 3], sigX2); \n", + mmr, mmr); + + if (max_order == 3) { + if (min_order < 3) + GLSL("if (order >= 3 { \n"); + + GLSL("s += dot("$"[mmr_idx + 4].xyz, sig2 * sig); \n" + "s += dot("$"[mmr_idx + 5], sigX2 * sigX); \n", + mmr, mmr); + + if (min_order < 3) + GLSL("} \n"); + } + + if (min_order < 2) + GLSL("} \n"); + } +} + +static inline void reshape_poly(pl_shader sh) +{ + GLSL("s = (coeffs.z * s + coeffs.y) * s + coeffs.x; \n"); +} +#endif + +void pl_shader_dovi_reshape(pl_shader sh, const struct pl_dovi_metadata *data) +{ +#ifdef PL_HAVE_DOVI + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0) || !data) + return; + + sh_describe(sh, "reshaping"); + GLSL("// pl_shader_reshape \n" + "{ \n" + "vec3 sig; \n" + "vec4 coeffs; \n" + "float s; \n" + "sig = clamp(color.rgb, 0.0, 1.0); \n"); + + float coeffs_data[8][4]; + float mmr_packed_data[8*6][4]; + + for (int c = 0; c < 3; c++) { + const struct pl_reshape_data *comp = &data->comp[c]; + if (!comp->num_pivots) + continue; + + pl_assert(comp->num_pivots >= 2 && comp->num_pivots <= 9); + GLSL("s = sig[%d]; \n", c); + + // Prepare coefficients for GPU + bool has_poly = false, has_mmr = false, mmr_single = true; + int mmr_idx = 0, min_order = 3, max_order = 1; + memset(coeffs_data, 0, sizeof(coeffs_data)); + for (int i = 0; i < comp->num_pivots - 1; i++) { + switch (comp->method[i]) { + case 0: // polynomial + has_poly = true; + coeffs_data[i][3] = 0.0; // order=0 signals polynomial + for (int k = 0; k < 3; k++) + coeffs_data[i][k] = 
comp->poly_coeffs[i][k]; + break; + + case 1: + min_order = PL_MIN(min_order, comp->mmr_order[i]); + max_order = PL_MAX(max_order, comp->mmr_order[i]); + mmr_single = !has_mmr; + has_mmr = true; + coeffs_data[i][3] = (float) comp->mmr_order[i]; + coeffs_data[i][0] = comp->mmr_constant[i]; + coeffs_data[i][1] = (float) mmr_idx; + for (int j = 0; j < comp->mmr_order[i]; j++) { + // store weights per order as two packed vec4s + float *mmr = &mmr_packed_data[mmr_idx][0]; + mmr[0] = comp->mmr_coeffs[i][j][0]; + mmr[1] = comp->mmr_coeffs[i][j][1]; + mmr[2] = comp->mmr_coeffs[i][j][2]; + mmr[3] = 0.0; // unused + mmr[4] = comp->mmr_coeffs[i][j][3]; + mmr[5] = comp->mmr_coeffs[i][j][4]; + mmr[6] = comp->mmr_coeffs[i][j][5]; + mmr[7] = comp->mmr_coeffs[i][j][6]; + mmr_idx += 2; + } + break; + + default: + pl_unreachable(); + } + } + + if (comp->num_pivots > 2) { + + // Skip the (irrelevant) lower and upper bounds + float pivots_data[7]; + memcpy(pivots_data, comp->pivots + 1, + (comp->num_pivots - 2) * sizeof(pivots_data[0])); + + // Fill the remainder with a quasi-infinite sentinel pivot + for (int i = comp->num_pivots - 2; i < PL_ARRAY_SIZE(pivots_data); i++) + pivots_data[i] = 1e9f; + + ident_t pivots = sh_var(sh, (struct pl_shader_var) { + .data = pivots_data, + .var = { + .name = "pivots", + .type = PL_VAR_FLOAT, + .dim_v = 1, + .dim_m = 1, + .dim_a = PL_ARRAY_SIZE(pivots_data), + }, + }); + + ident_t coeffs = sh_var(sh, (struct pl_shader_var) { + .data = coeffs_data, + .var = { + .name = "coeffs", + .type = PL_VAR_FLOAT, + .dim_v = 4, + .dim_m = 1, + .dim_a = PL_ARRAY_SIZE(coeffs_data), + }, + }); + + // Efficiently branch into the correct set of coefficients + GLSL("#define test(i) bvec4(s >= "$"[i]) \n" + "#define coef(i) "$"[i] \n" + "coeffs = mix(mix(mix(coef(0), coef(1), test(0)), \n" + " mix(coef(2), coef(3), test(2)), \n" + " test(1)), \n" + " mix(mix(coef(4), coef(5), test(4)), \n" + " mix(coef(6), coef(7), test(6)), \n" + " test(5)), \n" + " test(3)); \n" + "#undef test \n" + "#undef coef \n", + pivots, coeffs); + + } else { + + // No need for a single pivot, just set the coeffs directly + GLSL("coeffs = "$"; \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec4("coeffs"), + .data = coeffs_data, + })); + + } + + ident_t mmr = NULL_IDENT; + if (has_mmr) { + mmr = sh_var(sh, (struct pl_shader_var) { + .data = mmr_packed_data, + .var = { + .name = "mmr", + .type = PL_VAR_FLOAT, + .dim_v = 4, + .dim_m = 1, + .dim_a = mmr_idx, + }, + }); + } + + if (has_mmr && has_poly) { + GLSL("if (coeffs.w == 0.0) { \n"); + reshape_poly(sh); + GLSL("} else { \n"); + reshape_mmr(sh, mmr, mmr_single, min_order, max_order); + GLSL("} \n"); + } else if (has_poly) { + reshape_poly(sh); + } else { + assert(has_mmr); + GLSL("{ \n"); + reshape_mmr(sh, mmr, mmr_single, min_order, max_order); + GLSL("} \n"); + } + + ident_t lo = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("lo"), + .data = &comp->pivots[0], + }); + ident_t hi = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("hi"), + .data = &comp->pivots[comp->num_pivots - 1], + }); + GLSL("color[%d] = clamp(s, "$", "$"); \n", c, lo, hi); + } + + GLSL("} \n"); +#else + SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping"); +#endif +} + +void pl_shader_decode_color(pl_shader sh, struct pl_color_repr *repr, + const struct pl_color_adjustment *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + sh_describe(sh, "color decoding"); + GLSL("// pl_shader_decode_color \n" + "{ \n"); + + // 
Do this first because the following operations are potentially nonlinear + pl_shader_set_alpha(sh, repr, PL_ALPHA_INDEPENDENT); + + if (repr->sys == PL_COLOR_SYSTEM_XYZ || + repr->sys == PL_COLOR_SYSTEM_DOLBYVISION) + { + ident_t scale = SH_FLOAT(pl_color_repr_normalize(repr)); + GLSL("color.rgb *= vec3("$"); \n", scale); + } + + if (repr->sys == PL_COLOR_SYSTEM_XYZ) { + pl_shader_linearize(sh, &(struct pl_color_space) { + .transfer = PL_COLOR_TRC_ST428, + }); + } + + if (repr->sys == PL_COLOR_SYSTEM_DOLBYVISION) + pl_shader_dovi_reshape(sh, repr->dovi); + + enum pl_color_system orig_sys = repr->sys; + pl_transform3x3 tr = pl_color_repr_decode(repr, params); + + if (memcmp(&tr, &pl_transform3x3_identity, sizeof(tr))) { + ident_t cmat = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("cmat"), + .data = PL_TRANSPOSE_3X3(tr.mat.m), + }); + + ident_t cmat_c = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec3("cmat_c"), + .data = tr.c, + }); + + GLSL("color.rgb = "$" * color.rgb + "$"; \n", cmat, cmat_c); + } + + switch (orig_sys) { + case PL_COLOR_SYSTEM_BT_2020_C: + // Conversion for C'rcY'cC'bc via the BT.2020 CL system: + // C'bc = (B'-Y'c) / 1.9404 | C'bc <= 0 + // = (B'-Y'c) / 1.5816 | C'bc > 0 + // + // C'rc = (R'-Y'c) / 1.7184 | C'rc <= 0 + // = (R'-Y'c) / 0.9936 | C'rc > 0 + // + // as per the BT.2020 specification, table 4. This is a non-linear + // transformation because (constant) luminance receives non-equal + // contributions from the three different channels. + GLSL("// constant luminance conversion \n" + "color.br = color.br * mix(vec2(1.5816, 0.9936), \n" + " vec2(1.9404, 1.7184), \n" + " lessThanEqual(color.br, vec2(0.0))) \n" + " + color.gg; \n"); + // Expand channels to camera-linear light. This shader currently just + // assumes everything uses the BT.2020 12-bit gamma function, since the + // difference between 10 and 12-bit is negligible for anything other + // than 12-bit content. + GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5), \n" + " pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n" + " vec3(1.0/0.45)), \n" + " lessThanEqual(vec3(0.08145), color.rgb)); \n"); + // Calculate the green channel from the expanded RYcB, and recompress to G' + // The BT.2020 specification says Yc = 0.2627*R + 0.6780*G + 0.0593*B + GLSL("color.g = (lin.g - 0.2627*lin.r - 0.0593*lin.b)*1.0/0.6780; \n" + "color.g = mix(color.g * 4.5, \n" + " 1.0993 * pow(color.g, 0.45) - 0.0993, \n" + " 0.0181 <= color.g); \n"); + break; + + case PL_COLOR_SYSTEM_BT_2100_PQ:; + // Conversion process from the spec: + // + // 1. L'M'S' = cmat * ICtCp + // 2. LMS = linearize(L'M'S') (EOTF for PQ, inverse OETF for HLG) + // 3. RGB = lms2rgb * LMS + // + // After this we need to invert step 2 to arrive at non-linear RGB. 
+ // (It's important we keep the transfer function conversion separate + // from the color system decoding, so we have to partially undo our + // work here even though we will end up linearizing later on anyway) + + GLSL(// PQ EOTF + "color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n" + "color.rgb = max(color.rgb - vec3(%f), 0.0) \n" + " / (vec3(%f) - vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n" + // LMS matrix + "color.rgb = mat3( 3.43661, -0.79133, -0.0259499, \n" + " -2.50645, 1.98360, -0.0989137, \n" + " 0.06984, -0.192271, 1.12486) * color.rgb; \n" + // PQ OETF + "color.rgb = pow(max(color.rgb, 0.0), vec3(%f)); \n" + "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n", + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2); + break; + + case PL_COLOR_SYSTEM_BT_2100_HLG: + GLSL(// HLG OETF^-1 + "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n" + " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " + vec3(%f), \n" + " lessThan(vec3(0.5), color.rgb)); \n" + // LMS matrix + "color.rgb = mat3( 3.43661, -0.79133, -0.0259499, \n" + " -2.50645, 1.98360, -0.0989137, \n" + " 0.06984, -0.192271, 1.12486) * color.rgb; \n" + // HLG OETF + "color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n" + " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n" + " lessThan(vec3(1.0), color.rgb)); \n", + HLG_C, HLG_A, HLG_B, + HLG_A, HLG_B, HLG_C); + break; + + case PL_COLOR_SYSTEM_DOLBYVISION:; +#ifdef PL_HAVE_DOVI + // Dolby Vision always outputs BT.2020-referred HPE LMS, so hard-code + // the inverse LMS->RGB matrix corresponding to this color space. + pl_matrix3x3 dovi_lms2rgb = {{ + { 3.06441879, -2.16597676, 0.10155818}, + {-0.65612108, 1.78554118, -0.12943749}, + { 0.01736321, -0.04725154, 1.03004253}, + }}; + + pl_matrix3x3_mul(&dovi_lms2rgb, &repr->dovi->linear); + ident_t mat = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("lms2rgb"), + .data = PL_TRANSPOSE_3X3(dovi_lms2rgb.m), + }); + + // PQ EOTF + GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n" + "color.rgb = max(color.rgb - vec3(%f), 0.0) \n" + " / (vec3(%f) - vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n", + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1); + // LMS matrix + GLSL("color.rgb = "$" * color.rgb; \n", mat); + // PQ OETF + GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(%f)); \n" + "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n", + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2); + break; +#else + SH_FAIL(sh, "libplacebo was compiled without support for dolbyvision reshaping"); + return; +#endif + + case PL_COLOR_SYSTEM_UNKNOWN: + case PL_COLOR_SYSTEM_RGB: + case PL_COLOR_SYSTEM_XYZ: + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_YCGCO: + break; // no special post-processing needed + + case PL_COLOR_SYSTEM_COUNT: + pl_unreachable(); + } + + // Gamma adjustment. Doing this here (in non-linear light) is technically + // somewhat wrong, but this is just an aesthetic parameter and not really + // meant for colorimetric precision, so we don't care too much. 
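+    // (Editor's illustrative note, not part of the upstream source: with
+    // params->gamma = 1.2 the exponent applied below is 1/1.2 ~= 0.833, so a
+    // mid-grey value of 0.5 becomes roughly pow(0.5, 0.833) ~= 0.56, a slight
+    // brightening; a gamma below 1 darkens the image correspondingly.)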
+ if (params && params->gamma == 0) { + // Avoid division by zero + GLSL("color.rgb = vec3(0.0); \n"); + } else if (params && params->gamma != 1) { + ident_t gamma = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("gamma"), + .data = &(float){ 1 / params->gamma }, + }); + GLSL("color.rgb = pow(max(color.rgb, vec3(0.0)), vec3("$")); \n", gamma); + } + + GLSL("}\n"); +} + +void pl_shader_encode_color(pl_shader sh, const struct pl_color_repr *repr) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + sh_describe(sh, "color encoding"); + GLSL("// pl_shader_encode_color \n" + "{ \n"); + + switch (repr->sys) { + case PL_COLOR_SYSTEM_BT_2020_C: + // Expand R'G'B' to RGB + GLSL("vec3 lin = mix(color.rgb * vec3(1.0/4.5), \n" + " pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), \n" + " vec3(1.0/0.45)), \n" + " lessThanEqual(vec3(0.08145), color.rgb)); \n"); + + // Compute Yc from RGB and compress to R'Y'cB' + GLSL("color.g = dot(vec3(0.2627, 0.6780, 0.0593), lin); \n" + "color.g = mix(color.g * 4.5, \n" + " 1.0993 * pow(color.g, 0.45) - 0.0993, \n" + " 0.0181 <= color.g); \n"); + + // Compute C'bc and C'rc into color.br + GLSL("color.br = color.br - color.gg; \n" + "color.br *= mix(vec2(1.0/1.5816, 1.0/0.9936), \n" + " vec2(1.0/1.9404, 1.0/1.7184), \n" + " lessThanEqual(color.br, vec2(0.0))); \n"); + break; + + case PL_COLOR_SYSTEM_BT_2100_PQ:; + GLSL("color.rgb = pow(max(color.rgb, 0.0), vec3(1.0/%f)); \n" + "color.rgb = max(color.rgb - vec3(%f), 0.0) \n" + " / (vec3(%f) - vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n" + "color.rgb = mat3(0.412109, 0.166748, 0.024170, \n" + " 0.523925, 0.720459, 0.075440, \n" + " 0.063965, 0.112793, 0.900394) * color.rgb; \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n" + "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n", + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2); + break; + + case PL_COLOR_SYSTEM_BT_2100_HLG: + GLSL("color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n" + " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " + vec3(%f), \n" + " lessThan(vec3(0.5), color.rgb)); \n" + "color.rgb = mat3(0.412109, 0.166748, 0.024170, \n" + " 0.523925, 0.720459, 0.075440, \n" + " 0.063965, 0.112793, 0.900394) * color.rgb; \n" + "color.rgb = mix(vec3(0.5) * sqrt(color.rgb), \n" + " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n" + " lessThan(vec3(1.0), color.rgb)); \n", + HLG_C, HLG_A, HLG_B, + HLG_A, HLG_B, HLG_C); + break; + + case PL_COLOR_SYSTEM_DOLBYVISION: + SH_FAIL(sh, "Cannot un-apply dolbyvision yet (no inverse reshaping)!"); + return; + + case PL_COLOR_SYSTEM_UNKNOWN: + case PL_COLOR_SYSTEM_RGB: + case PL_COLOR_SYSTEM_XYZ: + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_YCGCO: + break; // no special pre-processing needed + + case PL_COLOR_SYSTEM_COUNT: + pl_unreachable(); + } + + // Since this is a relatively rare operation, bypass it as much as possible + bool skip = true; + skip &= PL_DEF(repr->sys, PL_COLOR_SYSTEM_RGB) == PL_COLOR_SYSTEM_RGB; + skip &= PL_DEF(repr->levels, PL_COLOR_LEVELS_FULL) == PL_COLOR_LEVELS_FULL; + skip &= !repr->bits.sample_depth || !repr->bits.color_depth || + repr->bits.sample_depth == repr->bits.color_depth; + skip &= !repr->bits.bit_shift; + + if (!skip) { + struct pl_color_repr copy = *repr; + ident_t xyzscale = NULL_IDENT; + if (repr->sys == 
PL_COLOR_SYSTEM_XYZ) + xyzscale = SH_FLOAT(1.0 / pl_color_repr_normalize(&copy)); + + pl_transform3x3 tr = pl_color_repr_decode(&copy, NULL); + pl_transform3x3_invert(&tr); + + ident_t cmat = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("cmat"), + .data = PL_TRANSPOSE_3X3(tr.mat.m), + }); + + ident_t cmat_c = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec3("cmat_c"), + .data = tr.c, + }); + + GLSL("color.rgb = "$" * color.rgb + "$"; \n", cmat, cmat_c); + + if (repr->sys == PL_COLOR_SYSTEM_XYZ) { + pl_shader_delinearize(sh, &(struct pl_color_space) { + .transfer = PL_COLOR_TRC_ST428, + }); + GLSL("color.rgb *= vec3("$"); \n", xyzscale); + } + } + + if (repr->alpha == PL_ALPHA_PREMULTIPLIED) + GLSL("color.rgb *= vec3(color.a); \n"); + + GLSL("}\n"); +} + +static ident_t sh_luma_coeffs(pl_shader sh, const struct pl_color_space *csp) +{ + pl_matrix3x3 rgb2xyz; + rgb2xyz = pl_get_rgb2xyz_matrix(pl_raw_primaries_get(csp->primaries)); + + // FIXME: Cannot use `const vec3` due to glslang bug #2025 + ident_t coeffs = sh_fresh(sh, "luma_coeffs"); + GLSLH("#define "$" vec3("$", "$", "$") \n", coeffs, + SH_FLOAT(rgb2xyz.m[1][0]), // RGB->Y vector + SH_FLOAT(rgb2xyz.m[1][1]), + SH_FLOAT(rgb2xyz.m[1][2])); + return coeffs; +} + +void pl_shader_linearize(pl_shader sh, const struct pl_color_space *csp) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + if (csp->transfer == PL_COLOR_TRC_LINEAR) + return; + + float csp_min, csp_max; + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = csp, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_NORM, + .out_min = &csp_min, + .out_max = &csp_max, + )); + + // Note that this clamp may technically violate the definition of + // ITU-R BT.2100, which allows for sub-blacks and super-whites to be + // displayed on the display where such would be possible. That said, the + // problem is that not all gamma curves are well-defined on the values + // outside this range, so we ignore it and just clamp anyway for sanity. 
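+    // (Editor's note: GLSL leaves pow(x, y) undefined for x < 0, so without
+    // the clamp below a sub-black input such as -0.01 passed to pow(x, 2.4)
+    // in one of the gamma branches could yield NaN on real drivers.)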
+ GLSL("// pl_shader_linearize \n" + "color.rgb = max(color.rgb, 0.0); \n"); + + switch (csp->transfer) { + case PL_COLOR_TRC_SRGB: + GLSL("color.rgb = mix(color.rgb * vec3(1.0/12.92), \n" + " pow((color.rgb + vec3(0.055))/vec3(1.055), \n" + " vec3(2.4)), \n" + " lessThan(vec3(0.04045), color.rgb)); \n"); + goto scale_out; + case PL_COLOR_TRC_BT_1886: { + const float lb = powf(csp_min, 1/2.4f); + const float lw = powf(csp_max, 1/2.4f); + const float a = powf(lw - lb, 2.4f); + const float b = lb / (lw - lb); + GLSL("color.rgb = "$" * pow(color.rgb + vec3("$"), vec3(2.4)); \n", + SH_FLOAT(a), SH_FLOAT(b)); + return; + } + case PL_COLOR_TRC_GAMMA18: + GLSL("color.rgb = pow(color.rgb, vec3(1.8));\n"); + goto scale_out; + case PL_COLOR_TRC_GAMMA20: + GLSL("color.rgb = pow(color.rgb, vec3(2.0));\n"); + goto scale_out; + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_GAMMA22: + GLSL("color.rgb = pow(color.rgb, vec3(2.2));\n"); + goto scale_out; + case PL_COLOR_TRC_GAMMA24: + GLSL("color.rgb = pow(color.rgb, vec3(2.4));\n"); + goto scale_out; + case PL_COLOR_TRC_GAMMA26: + GLSL("color.rgb = pow(color.rgb, vec3(2.6));\n"); + goto scale_out; + case PL_COLOR_TRC_GAMMA28: + GLSL("color.rgb = pow(color.rgb, vec3(2.8));\n"); + goto scale_out; + case PL_COLOR_TRC_PRO_PHOTO: + GLSL("color.rgb = mix(color.rgb * vec3(1.0/16.0), \n" + " pow(color.rgb, vec3(1.8)), \n" + " lessThan(vec3(0.03125), color.rgb)); \n"); + goto scale_out; + case PL_COLOR_TRC_ST428: + GLSL("color.rgb = vec3(52.37/48.0) * pow(color.rgb, vec3(2.6));\n"); + goto scale_out; + case PL_COLOR_TRC_PQ: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/%f)); \n" + "color.rgb = max(color.rgb - vec3(%f), 0.0) \n" + " / (vec3(%f) - vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(1.0/%f)); \n" + // PQ's output range is 0-10000, but we need it to be relative to + // to PL_COLOR_SDR_WHITE instead, so rescale + "color.rgb *= vec3(%f); \n", + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, 10000.0 / PL_COLOR_SDR_WHITE); + return; + case PL_COLOR_TRC_HLG: { + const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1); + const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y)); + // OETF^-1 + GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n" + "color.rgb = mix(vec3(4.0) * color.rgb * color.rgb, \n" + " exp((color.rgb - vec3(%f)) * vec3(1.0/%f))\n" + " + vec3(%f), \n" + " lessThan(vec3(0.5), color.rgb)); \n", + SH_FLOAT(1 - b), SH_FLOAT(b), + HLG_C, HLG_A, HLG_B); + // OOTF + GLSL("color.rgb *= 1.0 / 12.0; \n" + "color.rgb *= "$" * pow(max(dot("$", color.rgb), 0.0), "$"); \n", + SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT(y - 1)); + return; + } + case PL_COLOR_TRC_V_LOG: + GLSL("color.rgb = mix((color.rgb - vec3(0.125)) * vec3(1.0/5.6), \n" + " pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " - vec3(%f), \n" + " lessThanEqual(vec3(0.181), color.rgb)); \n", + VLOG_D, VLOG_C, VLOG_B); + return; + case PL_COLOR_TRC_S_LOG1: + GLSL("color.rgb = pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " - vec3(%f); \n", + SLOG_C, SLOG_A, SLOG_B); + return; + case PL_COLOR_TRC_S_LOG2: + GLSL("color.rgb = mix((color.rgb - vec3(%f)) * vec3(1.0/%f), \n" + " (pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " - vec3(%f)) * vec3(1.0/%f), \n" + " lessThanEqual(vec3(%f), color.rgb)); \n", + SLOG_Q, SLOG_P, SLOG_C, SLOG_A, SLOG_B, SLOG_K2, SLOG_Q); + return; + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_COUNT: + break; + } + + pl_unreachable(); + +scale_out: + if (csp_max != 1 || csp_min != 0) { + GLSL("color.rgb = "$" * 
color.rgb + vec3("$"); \n", + SH_FLOAT(csp_max - csp_min), SH_FLOAT(csp_min)); + } +} + +void pl_shader_delinearize(pl_shader sh, const struct pl_color_space *csp) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + if (csp->transfer == PL_COLOR_TRC_LINEAR) + return; + + float csp_min, csp_max; + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = csp, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_NORM, + .out_min = &csp_min, + .out_max = &csp_max, + )); + + GLSL("// pl_shader_delinearize \n"); + switch (csp->transfer) { + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_GAMMA18: + case PL_COLOR_TRC_GAMMA20: + case PL_COLOR_TRC_GAMMA22: + case PL_COLOR_TRC_GAMMA24: + case PL_COLOR_TRC_GAMMA26: + case PL_COLOR_TRC_GAMMA28: + case PL_COLOR_TRC_PRO_PHOTO: + case PL_COLOR_TRC_ST428: ; + if (csp_max != 1 || csp_min != 0) { + GLSL("color.rgb = "$" * color.rgb + vec3("$"); \n", + SH_FLOAT(1 / (csp_max - csp_min)), + SH_FLOAT(-csp_min / (csp_max - csp_min))); + } + break; + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_PQ: + case PL_COLOR_TRC_HLG: + case PL_COLOR_TRC_V_LOG: + case PL_COLOR_TRC_S_LOG1: + case PL_COLOR_TRC_S_LOG2: + break; // scene-referred or absolute scale + case PL_COLOR_TRC_COUNT: + pl_unreachable(); + } + + GLSL("color.rgb = max(color.rgb, 0.0); \n"); + + switch (csp->transfer) { + case PL_COLOR_TRC_SRGB: + GLSL("color.rgb = mix(color.rgb * vec3(12.92), \n" + " vec3(1.055) * pow(color.rgb, vec3(1.0/2.4)) \n" + " - vec3(0.055), \n" + " lessThanEqual(vec3(0.0031308), color.rgb)); \n"); + return; + case PL_COLOR_TRC_BT_1886: { + const float lb = powf(csp_min, 1/2.4f); + const float lw = powf(csp_max, 1/2.4f); + const float a = powf(lw - lb, 2.4f); + const float b = lb / (lw - lb); + GLSL("color.rgb = pow("$" * color.rgb, vec3(1.0/2.4)) - vec3("$"); \n", + SH_FLOAT(1.0 / a), SH_FLOAT(b)); + return; + } + case PL_COLOR_TRC_GAMMA18: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/1.8));\n"); + return; + case PL_COLOR_TRC_GAMMA20: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.0));\n"); + return; + case PL_COLOR_TRC_UNKNOWN: + case PL_COLOR_TRC_GAMMA22: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.2));\n"); + return; + case PL_COLOR_TRC_GAMMA24: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.4));\n"); + return; + case PL_COLOR_TRC_GAMMA26: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.6));\n"); + return; + case PL_COLOR_TRC_GAMMA28: + GLSL("color.rgb = pow(color.rgb, vec3(1.0/2.8));\n"); + return; + case PL_COLOR_TRC_ST428: + GLSL("color.rgb = pow(color.rgb * vec3(48.0/52.37), vec3(1.0/2.6));\n"); + return; + case PL_COLOR_TRC_PRO_PHOTO: + GLSL("color.rgb = mix(color.rgb * vec3(16.0), \n" + " pow(color.rgb, vec3(1.0/1.8)), \n" + " lessThanEqual(vec3(0.001953), color.rgb)); \n"); + return; + case PL_COLOR_TRC_PQ: + GLSL("color.rgb *= vec3(1.0/%f); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n" + "color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb); \n" + "color.rgb = pow(color.rgb, vec3(%f)); \n", + 10000 / PL_COLOR_SDR_WHITE, PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2); + return; + case PL_COLOR_TRC_HLG: { + const float y = fmaxf(1.2f + 0.42f * log10f(csp_max / HLG_REF), 1); + const float b = sqrtf(3 * powf(csp_min / csp_max, 1 / y)); + // OOTF^-1 + GLSL("color.rgb *= 1.0 / "$"; \n" + "color.rgb *= 12.0 * max(1e-6, pow(dot("$", color.rgb), "$")); \n", + SH_FLOAT(csp_max), sh_luma_coeffs(sh, csp), SH_FLOAT((1 - y) / y)); + // OETF + GLSL("color.rgb = mix(vec3(0.5) * 
sqrt(color.rgb), \n" + " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f), \n" + " lessThan(vec3(1.0), color.rgb)); \n" + "color.rgb = "$" * color.rgb + vec3("$"); \n", + HLG_A, HLG_B, HLG_C, + SH_FLOAT(1 / (1 - b)), SH_FLOAT(-b / (1 - b))); + return; + } + case PL_COLOR_TRC_V_LOG: + GLSL("color.rgb = mix(vec3(5.6) * color.rgb + vec3(0.125), \n" + " vec3(%f) * log(color.rgb + vec3(%f)) \n" + " + vec3(%f), \n" + " lessThanEqual(vec3(0.01), color.rgb)); \n", + VLOG_C / M_LN10, VLOG_B, VLOG_D); + return; + case PL_COLOR_TRC_S_LOG1: + GLSL("color.rgb = vec3(%f) * log(color.rgb + vec3(%f)) + vec3(%f);\n", + SLOG_A / M_LN10, SLOG_B, SLOG_C); + return; + case PL_COLOR_TRC_S_LOG2: + GLSL("color.rgb = mix(vec3(%f) * color.rgb + vec3(%f), \n" + " vec3(%f) * log(vec3(%f) * color.rgb + vec3(%f)) \n" + " + vec3(%f), \n" + " lessThanEqual(vec3(0.0), color.rgb)); \n", + SLOG_P, SLOG_Q, SLOG_A / M_LN10, SLOG_K2, SLOG_B, SLOG_C); + return; + case PL_COLOR_TRC_LINEAR: + case PL_COLOR_TRC_COUNT: + break; + } + + pl_unreachable(); +} + +const struct pl_sigmoid_params pl_sigmoid_default_params = { PL_SIGMOID_DEFAULTS }; + +void pl_shader_sigmoidize(pl_shader sh, const struct pl_sigmoid_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + params = PL_DEF(params, &pl_sigmoid_default_params); + float center = PL_DEF(params->center, pl_sigmoid_default_params.center); + float slope = PL_DEF(params->slope, pl_sigmoid_default_params.slope); + + // This function needs to go through (0,0) and (1,1), so we compute the + // values at 1 and 0, and then scale/shift them, respectively. + float offset = 1.0 / (1 + expf(slope * center)); + float scale = 1.0 / (1 + expf(slope * (center - 1))) - offset; + + GLSL("// pl_shader_sigmoidize \n" + "color = clamp(color, 0.0, 1.0); \n" + "color = vec4("$") - vec4("$") * \n" + " log(vec4(1.0) / (color * vec4("$") + vec4("$")) \n" + " - vec4(1.0)); \n", + SH_FLOAT(center), SH_FLOAT(1.0 / slope), + SH_FLOAT(scale), SH_FLOAT(offset)); +} + +void pl_shader_unsigmoidize(pl_shader sh, const struct pl_sigmoid_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + // See: pl_shader_sigmoidize + params = PL_DEF(params, &pl_sigmoid_default_params); + float center = PL_DEF(params->center, pl_sigmoid_default_params.center); + float slope = PL_DEF(params->slope, pl_sigmoid_default_params.slope); + float offset = 1.0 / (1 + expf(slope * center)); + float scale = 1.0 / (1 + expf(slope * (center - 1))) - offset; + + GLSL("// pl_shader_unsigmoidize \n" + "color = clamp(color, 0.0, 1.0); \n" + "color = vec4("$") / \n" + " (vec4(1.0) + exp(vec4("$") * (vec4("$") - color))) \n" + " - vec4("$"); \n", + SH_FLOAT(1.0 / scale), + SH_FLOAT(slope), SH_FLOAT(center), + SH_FLOAT(offset / scale)); +} + +const struct pl_peak_detect_params pl_peak_detect_default_params = { PL_PEAK_DETECT_DEFAULTS }; +const struct pl_peak_detect_params pl_peak_detect_high_quality_params = { PL_PEAK_DETECT_HQ_DEFAULTS }; + +static bool peak_detect_params_eq(const struct pl_peak_detect_params *a, + const struct pl_peak_detect_params *b) +{ + return a->smoothing_period == b->smoothing_period && + a->scene_threshold_low == b->scene_threshold_low && + a->scene_threshold_high == b->scene_threshold_high && + a->percentile == b->percentile; + // don't compare `allow_delayed` because it doesn't change measurement +} + +enum { + // Split the peak buffer into several independent slices to reduce pressure + // on global atomics + SLICES = 12, + + // How many bits to use for storing PQ 
data. Be careful when setting this + // too high, as it may overflow `unsigned int` on large video sources. + // + // The value chosen is enough to guarantee no overflow for an 8K x 4K frame + // consisting entirely of 100% 10k nits PQ values, with 16x16 workgroups. + PQ_BITS = 14, + PQ_MAX = (1 << PQ_BITS) - 1, + + // How many bits to use for the histogram. We bias the histogram down + // by half the PQ range (~90 nits), effectively clumping the SDR part + // of the image into a single histogram bin. + HIST_BITS = 7, + HIST_BIAS = 1 << (HIST_BITS - 1), + HIST_BINS = (1 << HIST_BITS) - HIST_BIAS, + + // Convert from histogram bin to (starting) PQ value +#define HIST_PQ(bin) (((bin) + HIST_BIAS) << (PQ_BITS - HIST_BITS)) +}; + + +pl_static_assert(PQ_BITS >= HIST_BITS); + +struct peak_buf_data { + unsigned frame_wg_count[SLICES]; // number of work groups processed + unsigned frame_wg_active[SLICES];// number of active (nonzero) work groups + unsigned frame_sum_pq[SLICES]; // sum of PQ Y values over all WGs (PQ_BITS) + unsigned frame_max_pq[SLICES]; // maximum PQ Y value among these WGs (PQ_BITS) + unsigned frame_hist[SLICES][HIST_BINS]; // always allocated, conditionally used +}; + +static const struct pl_buffer_var peak_buf_vars[] = { +#define VAR(field) { \ + .var = { \ + .name = #field, \ + .type = PL_VAR_UINT, \ + .dim_v = 1, \ + .dim_m = 1, \ + .dim_a = sizeof(((struct peak_buf_data *) NULL)->field) / \ + sizeof(unsigned), \ + }, \ + .layout = { \ + .offset = offsetof(struct peak_buf_data, field), \ + .size = sizeof(((struct peak_buf_data *) NULL)->field), \ + .stride = sizeof(unsigned), \ + }, \ +} + VAR(frame_wg_count), + VAR(frame_wg_active), + VAR(frame_sum_pq), + VAR(frame_max_pq), + VAR(frame_hist), +#undef VAR +}; + +struct sh_color_map_obj { + // Tone map state + struct { + struct pl_tone_map_params params; + pl_shader_obj lut; + } tone; + + // Gamut map state + struct { + pl_shader_obj lut; + } gamut; + + // Peak detection state + struct { + struct pl_peak_detect_params params; // currently active parameters + pl_buf buf; // pending peak detection buffer + pl_buf readback; // readback buffer (fallback) + float avg_pq; // current (smoothed) values + float max_pq; + } peak; +}; + +// Excluding size, since this is checked by sh_lut +static uint64_t gamut_map_signature(const struct pl_gamut_map_params *par) +{ + uint64_t sig = CACHE_KEY_GAMUT_LUT; + pl_hash_merge(&sig, pl_str0_hash(par->function->name)); + pl_hash_merge(&sig, pl_var_hash(par->input_gamut)); + pl_hash_merge(&sig, pl_var_hash(par->output_gamut)); + pl_hash_merge(&sig, pl_var_hash(par->min_luma)); + pl_hash_merge(&sig, pl_var_hash(par->max_luma)); + pl_hash_merge(&sig, pl_var_hash(par->constants)); + return sig; +} + +static void sh_color_map_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_color_map_obj *obj = ptr; + pl_shader_obj_destroy(&obj->tone.lut); + pl_shader_obj_destroy(&obj->gamut.lut); + pl_buf_destroy(gpu, &obj->peak.buf); + pl_buf_destroy(gpu, &obj->peak.readback); + memset(obj, 0, sizeof(*obj)); +} + +static inline float iir_coeff(float rate) +{ + if (!rate) + return 1.0f; + return 1.0f - expf(-1.0f / rate); +} + +static float measure_peak(const struct peak_buf_data *data, float percentile) +{ + unsigned frame_max_pq = data->frame_max_pq[0]; + for (int k = 1; k < SLICES; k++) + frame_max_pq = PL_MAX(frame_max_pq, data->frame_max_pq[k]); + const float frame_max = (float) frame_max_pq / PQ_MAX; + if (percentile <= 0 || percentile >= 100) + return frame_max; + unsigned total_pixels = 0; + for (int k = 0; k < 
SLICES; k++) { + for (int i = 0; i < HIST_BINS; i++) + total_pixels += data->frame_hist[k][i]; + } + if (!total_pixels) // no histogram data available? + return frame_max; + + const unsigned target_pixel = ceilf(percentile / 100.0f * total_pixels); + if (target_pixel >= total_pixels) + return frame_max; + + unsigned sum = 0; + for (int i = 0; i < HIST_BINS; i++) { + unsigned next = sum; + for (int k = 0; k < SLICES; k++) + next += data->frame_hist[k][i]; + if (next < target_pixel) { + sum = next; + continue; + } + + // Upper and lower frequency boundaries of the matching histogram bin + const unsigned count_low = sum; // last pixel of previous bin + const unsigned count_high = next + 1; // first pixel of next bin + pl_assert(count_low < target_pixel && target_pixel < count_high); + + // PQ luminance associated with count_low/high respectively + const float pq_low = (float) HIST_PQ(i) / PQ_MAX; + float pq_high = (float) HIST_PQ(i + 1) / PQ_MAX; + if (count_high > total_pixels) // special case for last histogram bin + pq_high = frame_max; + + // Position of `target_pixel` inside this bin, assumes pixels are + // equidistributed inside a histogram bin + const float ratio = (float) (target_pixel - count_low) / + (count_high - count_low); + return PL_MIX(pq_low, pq_high, ratio); + } + + pl_unreachable(); +} + +// if `force` is true, ensures the buffer is read, even if `allow_delayed` +static void update_peak_buf(pl_gpu gpu, struct sh_color_map_obj *obj, bool force) +{ + const struct pl_peak_detect_params *params = &obj->peak.params; + if (!obj->peak.buf) + return; + + if (!force && params->allow_delayed && pl_buf_poll(gpu, obj->peak.buf, 0)) + return; // buffer not ready yet + + bool ok; + struct peak_buf_data data = {0}; + if (obj->peak.readback) { + pl_buf_copy(gpu, obj->peak.readback, 0, obj->peak.buf, 0, sizeof(data)); + ok = pl_buf_read(gpu, obj->peak.readback, 0, &data, sizeof(data)); + } else { + ok = pl_buf_read(gpu, obj->peak.buf, 0, &data, sizeof(data)); + } + if (ok && data.frame_wg_count[0] > 0) { + // Peak detection completed successfully + pl_buf_destroy(gpu, &obj->peak.buf); + } else { + // No data read? Possibly this peak obj has not been executed yet + if (!ok) { + PL_ERR(gpu, "Failed reading peak detection buffer!"); + } else if (params->allow_delayed) { + PL_TRACE(gpu, "Peak detection buffer not yet ready, ignoring.."); + } else { + PL_WARN(gpu, "Peak detection usage error: attempted detecting peak " + "and using detected peak in the same shader program, " + "but `params->allow_delayed` is false! 
Ignoring, but " + "expect incorrect output."); + } + if (force || !ok) + pl_buf_destroy(gpu, &obj->peak.buf); + return; + } + + uint64_t frame_sum_pq = 0u, frame_wg_count = 0u, frame_wg_active = 0u; + for (int k = 0; k < SLICES; k++) { + frame_sum_pq += data.frame_sum_pq[k]; + frame_wg_count += data.frame_wg_count[k]; + frame_wg_active += data.frame_wg_active[k]; + } + float avg_pq, max_pq; + if (frame_wg_active) { + avg_pq = (float) frame_sum_pq / (frame_wg_active * PQ_MAX); + max_pq = measure_peak(&data, params->percentile); + } else { + // Solid black frame + avg_pq = max_pq = PL_COLOR_HDR_BLACK; + } + + if (!obj->peak.avg_pq) { + // Set the initial value accordingly if it contains no data + obj->peak.avg_pq = avg_pq; + obj->peak.max_pq = max_pq; + } else { + // Ignore small deviations from existing peak (rounding error) + static const float epsilon = 1.0f / PQ_MAX; + if (fabsf(avg_pq - obj->peak.avg_pq) < epsilon) + avg_pq = obj->peak.avg_pq; + if (fabsf(max_pq - obj->peak.max_pq) < epsilon) + max_pq = obj->peak.max_pq; + } + + // Use an IIR low-pass filter to smooth out the detected values + const float coeff = iir_coeff(params->smoothing_period); + obj->peak.avg_pq += coeff * (avg_pq - obj->peak.avg_pq); + obj->peak.max_pq += coeff * (max_pq - obj->peak.max_pq); + + // Scene change hysteresis + if (params->scene_threshold_low > 0 && params->scene_threshold_high > 0) { + const float log10_pq = 1e-2f; // experimentally determined approximate + const float thresh_low = params->scene_threshold_low * log10_pq; + const float thresh_high = params->scene_threshold_high * log10_pq; + const float bias = (float) frame_wg_active / frame_wg_count; + const float delta = bias * fabsf(avg_pq - obj->peak.avg_pq); + const float mix_coeff = pl_smoothstep(thresh_low, thresh_high, delta); + obj->peak.avg_pq = PL_MIX(obj->peak.avg_pq, avg_pq, mix_coeff); + obj->peak.max_pq = PL_MIX(obj->peak.max_pq, max_pq, mix_coeff); + } +} + +bool pl_shader_detect_peak(pl_shader sh, struct pl_color_space csp, + pl_shader_obj *state, + const struct pl_peak_detect_params *params) +{ + params = PL_DEF(params, &pl_peak_detect_default_params); + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return false; + + pl_gpu gpu = SH_GPU(sh); + if (!gpu || gpu->limits.max_ssbo_size < sizeof(struct peak_buf_data)) { + PL_ERR(sh, "HDR peak detection requires a GPU with support for at " + "least %zu bytes of SSBO data (supported: %zu)", + sizeof(struct peak_buf_data), gpu ? gpu->limits.max_ssbo_size : 0); + return false; + } + + const bool use_histogram = params->percentile > 0 && params->percentile < 100; + size_t shmem_req = 3 * sizeof(uint32_t); + if (use_histogram) + shmem_req += sizeof(uint32_t[HIST_BINS]); + + if (!sh_try_compute(sh, 16, 16, true, shmem_req)) { + PL_ERR(sh, "HDR peak detection requires compute shaders with support " + "for at least %zu bytes of shared memory! 
(avail: %zu)", + shmem_req, sh_glsl(sh).max_shmem_size); + return false; + } + + struct sh_color_map_obj *obj; + obj = SH_OBJ(sh, state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj, + sh_color_map_uninit); + if (!obj) + return false; + + if (peak_detect_params_eq(&obj->peak.params, params)) { + update_peak_buf(gpu, obj, true); // prevent over-writing previous frame + } else { + pl_reset_detected_peak(*state); + } + + pl_assert(!obj->peak.buf); + static const struct peak_buf_data zero = {0}; + +retry_ssbo: + if (obj->peak.readback) { + obj->peak.buf = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(struct peak_buf_data), + .storable = true, + .initial_data = &zero, + )); + } else { + obj->peak.buf = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(struct peak_buf_data), + .memory_type = PL_BUF_MEM_DEVICE, + .host_readable = true, + .storable = true, + .initial_data = &zero, + )); + } + + if (!obj->peak.buf && !obj->peak.readback) { + PL_WARN(sh, "Failed creating host-readable peak detection SSBO, " + "retrying with fallback buffer"); + obj->peak.readback = pl_buf_create(gpu, pl_buf_params( + .size = sizeof(struct peak_buf_data), + .host_readable = true, + )); + if (obj->peak.readback) + goto retry_ssbo; + } + + if (!obj->peak.buf) { + SH_FAIL(sh, "Failed creating peak detection SSBO!"); + return false; + } + + obj->peak.params = *params; + + sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = "PeakBuf", + .type = PL_DESC_BUF_STORAGE, + .access = PL_DESC_ACCESS_READWRITE, + }, + .binding.object = obj->peak.buf, + .buffer_vars = (struct pl_buffer_var *) peak_buf_vars, + .num_buffer_vars = PL_ARRAY_SIZE(peak_buf_vars), + }); + + sh_describe(sh, "peak detection"); + GLSL("// pl_shader_detect_peak \n" + "{ \n" + "const uint wg_size = gl_WorkGroupSize.x * gl_WorkGroupSize.y; \n" + "uint wg_idx = gl_WorkGroupID.y * gl_NumWorkGroups.x + \n" + " gl_WorkGroupID.x; \n" + "uint slice = wg_idx %% %du; \n" + "vec4 color_orig = color; \n", + SLICES); + + // For performance, we want to do as few atomic operations on global + // memory as possible, so use an atomic in shmem for the work group. 
+ ident_t wg_sum = sh_fresh(sh, "wg_sum"), + wg_max = sh_fresh(sh, "wg_max"), + wg_black = sh_fresh(sh, "wg_black"), + wg_hist = NULL_IDENT; + GLSLH("shared uint "$", "$", "$"; \n", wg_sum, wg_max, wg_black); + if (use_histogram) { + wg_hist = sh_fresh(sh, "wg_hist"); + GLSLH("shared uint "$"[%u]; \n", wg_hist, HIST_BINS); + GLSL("for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n" + " "$"[i] = 0u; \n", + HIST_BINS, wg_hist); + } + GLSL($" = 0u; "$" = 0u; "$" = 0u; \n" + "barrier(); \n", + wg_sum, wg_max, wg_black); + + // Decode color into linear light representation + pl_color_space_infer(&csp); + pl_shader_linearize(sh, &csp); + + // Measure luminance as N-bit PQ + GLSL("float luma = dot("$", color.rgb); \n" + "luma *= %f; \n" + "luma = pow(clamp(luma, 0.0, 1.0), %f); \n" + "luma = (%f + %f * luma) / (1.0 + %f * luma); \n" + "luma = pow(luma, %f); \n" + "luma *= smoothstep(0.0, 1e-2, luma); \n" + "uint y_pq = uint(%d.0 * luma); \n", + sh_luma_coeffs(sh, &csp), + PL_COLOR_SDR_WHITE / 10000.0, + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2, + PQ_MAX); + + // Update the work group's shared atomics + bool has_subgroups = sh_glsl(sh).subgroup_size > 0; + if (use_histogram) { + GLSL("int bin = (int(y_pq) >> %d) - %d; \n" + "bin = clamp(bin, 0, %d); \n", + PQ_BITS - HIST_BITS, HIST_BIAS, + HIST_BINS - 1); + if (has_subgroups) { + // Optimize for the very common case of identical histogram bins + GLSL("if (subgroupAllEqual(bin)) { \n" + " if (subgroupElect()) \n" + " atomicAdd("$"[bin], gl_SubgroupSize); \n" + "} else { \n" + " atomicAdd("$"[bin], 1u); \n" + "} \n", + wg_hist, wg_hist); + } else { + GLSL("atomicAdd("$"[bin], 1u); \n", wg_hist); + } + } + + if (has_subgroups) { + GLSL("uint group_sum = subgroupAdd(y_pq); \n" + "uint group_max = subgroupMax(y_pq); \n" + "uvec4 b = subgroupBallot(y_pq == 0u); \n" + "if (subgroupElect()) { \n" + " atomicAdd("$", group_sum); \n" + " atomicMax("$", group_max); \n" + " atomicAdd("$", subgroupBallotBitCount(b));\n" + "} \n" + "barrier(); \n", + wg_sum, wg_max, wg_black); + } else { + GLSL("atomicAdd("$", y_pq); \n" + "atomicMax("$", y_pq); \n" + "if (y_pq == 0u) \n" + " atomicAdd("$", 1u); \n" + "barrier(); \n", + wg_sum, wg_max, wg_black); + } + + if (use_histogram) { + GLSL("if (gl_LocalInvocationIndex == 0u) \n" + " "$"[0] -= "$"; \n" + "for (uint i = gl_LocalInvocationIndex; i < %du; i += wg_size) \n" + " atomicAdd(frame_hist[slice * %du + i], "$"[i]); \n", + wg_hist, wg_black, + HIST_BINS, + HIST_BINS, wg_hist); + } + + // Have one thread per work group update the global atomics + GLSL("if (gl_LocalInvocationIndex == 0u) { \n" + " uint num = wg_size - "$"; \n" + " atomicAdd(frame_wg_count[slice], 1u); \n" + " atomicAdd(frame_wg_active[slice], min(num, 1u)); \n" + " if (num > 0u) { \n" + " atomicAdd(frame_sum_pq[slice], "$" / num); \n" + " atomicMax(frame_max_pq[slice], "$"); \n" + " } \n" + "} \n" + "color = color_orig; \n" + "} \n", + wg_black, wg_sum, wg_max); + + return true; +} + +bool pl_get_detected_hdr_metadata(const pl_shader_obj state, + struct pl_hdr_metadata *out) +{ + if (!state || state->type != PL_SHADER_OBJ_COLOR_MAP) + return false; + + struct sh_color_map_obj *obj = state->priv; + update_peak_buf(state->gpu, obj, false); + if (!obj->peak.avg_pq) + return false; + + out->max_pq_y = obj->peak.max_pq; + out->avg_pq_y = obj->peak.avg_pq; + return true; +} + +bool pl_get_detected_peak(const pl_shader_obj state, + float *out_peak, float *out_avg) +{ + struct pl_hdr_metadata data; + if (!pl_get_detected_hdr_metadata(state, &data)) + 
return false; + + // Preserves old behavior + *out_peak = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, data.max_pq_y); + *out_avg = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, data.avg_pq_y); + return true; +} + +void pl_reset_detected_peak(pl_shader_obj state) +{ + if (!state || state->type != PL_SHADER_OBJ_COLOR_MAP) + return; + + struct sh_color_map_obj *obj = state->priv; + pl_buf readback = obj->peak.readback; + pl_buf_destroy(state->gpu, &obj->peak.buf); + memset(&obj->peak, 0, sizeof(obj->peak)); + obj->peak.readback = readback; +} + +void pl_shader_extract_features(pl_shader sh, struct pl_color_space csp) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + sh_describe(sh, "feature extraction"); + pl_shader_linearize(sh, &csp); + GLSL("// pl_shader_extract_features \n" + "{ \n" + "vec3 lms = %f * "$" * color.rgb; \n" + "lms = pow(max(lms, 0.0), vec3(%f)); \n" + "lms = (vec3(%f) + %f * lms) \n" + " / (vec3(1.0) + %f * lms); \n" + "lms = pow(lms, vec3(%f)); \n" + "float I = dot(vec3(%f, %f, %f), lms); \n" + "color = vec4(I, 0.0, 0.0, 1.0); \n" + "} \n", + PL_COLOR_SDR_WHITE / 10000, + SH_MAT3(pl_ipt_rgb2lms(pl_raw_primaries_get(csp.primaries))), + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2, + pl_ipt_lms2ipt.m[0][0], pl_ipt_lms2ipt.m[0][1], pl_ipt_lms2ipt.m[0][2]); +} + +const struct pl_color_map_params pl_color_map_default_params = { PL_COLOR_MAP_DEFAULTS }; +const struct pl_color_map_params pl_color_map_high_quality_params = { PL_COLOR_MAP_HQ_DEFAULTS }; + +static ident_t rect_pos(pl_shader sh, pl_rect2df rc) +{ + if (!rc.x0 && !rc.x1) + rc.x1 = 1.0f; + if (!rc.y0 && !rc.y1) + rc.y1 = 1.0f; + + return sh_attr_vec2(sh, "tone_map_coords", &(pl_rect2df) { + .x0 = -rc.x0 / (rc.x1 - rc.x0), + .x1 = (1.0f - rc.x0) / (rc.x1 - rc.x0), + .y0 = -rc.y1 / (rc.y0 - rc.y1), + .y1 = (1.0f - rc.y1) / (rc.y0 - rc.y1), + }); +} + +static void visualize_tone_map(pl_shader sh, pl_rect2df rc, float alpha, + const struct pl_tone_map_params *params) +{ + pl_assert(params->input_scaling == PL_HDR_PQ); + pl_assert(params->output_scaling == PL_HDR_PQ); + + GLSL("// Visualize tone mapping \n" + "{ \n" + "vec2 pos = "$"; \n" + "if (min(pos.x, pos.y) >= 0.0 && \n" // visualizer rect + " max(pos.x, pos.y) <= 1.0) \n" + "{ \n" + "float xmin = "$"; \n" + "float xmax = "$"; \n" + "float xavg = "$"; \n" + "float ymin = "$"; \n" + "float ymax = "$"; \n" + "float alpha = 0.8 * "$"; \n" + "vec3 viz = color.rgb; \n" + "float vv = tone_map(pos.x); \n" + // Color based on region + "if (pos.x < xmin || pos.x > xmax) { \n" // outside source + "} else if (pos.y < ymin || pos.y > ymax) {\n" // outside target + " if (pos.y < xmin || pos.y > xmax) { \n" // and also source + " viz = vec3(0.1, 0.1, 0.5); \n" + " } else { \n" + " viz = vec3(0.2, 0.05, 0.05); \n" // but inside source + " } \n" + "} else { \n" // inside domain + " if (abs(pos.x - pos.y) < 1e-3) { \n" // main diagonal + " viz = vec3(0.2); \n" + " } else if (pos.y < vv) { \n" // inside function + " alpha *= 0.6; \n" + " viz = vec3(0.05); \n" + " if (vv > pos.x && pos.y > pos.x) \n" // output brighter than input + " viz.rg = vec2(0.5, 0.7); \n" + " } else { \n" // outside function + " if (vv < pos.x && pos.y < pos.x) \n" // output darker than input + " viz = vec3(0.0, 0.1, 0.2); \n" + " } \n" + " if (pos.y > xmax) { \n" // inverse tone-mapping region + " vec3 hi = vec3(0.2, 0.5, 0.8); \n" + " viz = mix(viz, hi, 0.5); \n" + " } else if (pos.y < xmin) { \n" // black point region + " viz = mix(viz, vec3(0.0), 0.3); \n" + " } \n" + " if (xavg > 0.0 && abs(pos.x - xavg) < 
1e-3)\n" // source avg brightness + " viz = vec3(0.5); \n" + "} \n" + "color.rgb = mix(color.rgb, viz, alpha); \n" + "} \n" + "} \n", + rect_pos(sh, rc), + SH_FLOAT_DYN(params->input_min), + SH_FLOAT_DYN(params->input_max), + SH_FLOAT_DYN(params->input_avg), + SH_FLOAT(params->output_min), + SH_FLOAT_DYN(params->output_max), + SH_FLOAT_DYN(alpha)); +} + +static void visualize_gamut_map(pl_shader sh, pl_rect2df rc, + ident_t lut, float hue, float theta, + const struct pl_gamut_map_params *params) +{ + ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms); + ident_t lms2rgb_src = SH_MAT3(pl_ipt_lms2rgb(&params->input_gamut)); + ident_t lms2rgb_dst = SH_MAT3(pl_ipt_lms2rgb(&params->output_gamut)); + + GLSL("// Visualize gamut mapping \n" + "vec2 pos = "$"; \n" + "float pqmin = "$"; \n" + "float pqmax = "$"; \n" + "float rgbmin = "$"; \n" + "float rgbmax = "$"; \n" + "vec3 orig = ipt; \n" + "if (min(pos.x, pos.y) >= 0.0 && \n" + " max(pos.x, pos.y) <= 1.0) \n" + "{ \n" + // Source color to visualize + "float mid = mix(pqmin, pqmax, 0.6); \n" + "vec3 base = vec3(0.5, 0.0, 0.0); \n" + "float hue = "$", theta = "$"; \n" + "base.x = mix(base.x, mid, sin(theta)); \n" + "mat3 rot1 = mat3(1.0, 0.0, 0.0, \n" + " 0.0, cos(hue), sin(hue), \n" + " 0.0, -sin(hue), cos(hue)); \n" + "mat3 rot2 = mat3( cos(theta), 0.0, sin(theta), \n" + " 0.0, 1.0, 0.0, \n" + " -sin(theta), 0.0, cos(theta)); \n" + "vec3 dir = vec3(pos.yx - vec2(0.5), 0.0); \n" + "ipt = base + rot1 * rot2 * dir; \n" + // Convert back to RGB (for gamut boundary testing) + "lmspq = "$" * ipt; \n" + "lms = pow(max(lmspq, 0.0), vec3(1.0/%f)); \n" + "lms = max(lms - vec3(%f), 0.0) \n" + " / (vec3(%f) - %f * lms); \n" + "lms = pow(lms, vec3(1.0/%f)); \n" + "lms *= %f; \n" + // Check against src/dst gamut boundaries + "vec3 rgbsrc = "$" * lms; \n" + "vec3 rgbdst = "$" * lms; \n" + "bool insrc, indst; \n" + "insrc = all(lessThan(rgbsrc, vec3(rgbmax))) && \n" + " all(greaterThan(rgbsrc, vec3(rgbmin))); \n" + "indst = all(lessThan(rgbdst, vec3(rgbmax))) && \n" + " all(greaterThan(rgbdst, vec3(rgbmin))); \n" + // Sample from gamut mapping 3DLUT + "idx.x = (ipt.x - pqmin) / (pqmax - pqmin); \n" + "idx.y = 2.0 * length(ipt.yz); \n" + "idx.z = %f * atan(ipt.z, ipt.y) + 0.5; \n" + "vec3 mapped = "$"(idx).xyz; \n" + "mapped.yz -= vec2(32768.0/65535.0); \n" + "float mappedhue = atan(mapped.z, mapped.y); \n" + "float mappedchroma = length(mapped.yz); \n" + "ipt = mapped; \n" + // Visualize gamuts + "if (!insrc && !indst) { \n" + " ipt = orig; \n" + "} else if (insrc && !indst) { \n" + " ipt.x -= 0.1; \n" + "} else if (indst && !insrc) { \n" + " ipt.x += 0.1; \n" + "} \n" + // Visualize iso-luminance and iso-hue lines + "vec3 line; \n" + "if (insrc && fract(50.0 * mapped.x) < 1e-1) { \n" + " float k = smoothstep(0.1, 0.0, abs(sin(theta))); \n" + " line.x = mix(mapped.x, 0.3, 0.5); \n" + " line.yz = sqrt(length(mapped.yz)) * \n" + " normalize(mapped.yz); \n" + " ipt = mix(ipt, line, k); \n" + "} \n" + "if (insrc && fract(10.0 * (mappedhue - hue)) < 1e-1) {\n" + " float k = smoothstep(0.3, 0.0, abs(cos(theta))); \n" + " line.x = mapped.x - 0.05; \n" + " line.yz = 1.2 * mapped.yz; \n" + " ipt = mix(ipt, line, k); \n" + "} \n" + "if (insrc && fract(100.0 * mappedchroma) < 1e-1) { \n" + " line.x = mapped.x + 0.1; \n" + " line.yz = 0.4 * mapped.yz; \n" + " ipt = mix(ipt, line, 0.5); \n" + "} \n" + "} \n", + rect_pos(sh, rc), + SH_FLOAT(params->min_luma), SH_FLOAT(params->max_luma), + SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, params->min_luma)), + SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, 
PL_HDR_NORM, params->max_luma)), + SH_FLOAT_DYN(hue), SH_FLOAT_DYN(theta), + ipt2lms, + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, + 10000 / PL_COLOR_SDR_WHITE, + lms2rgb_src, + lms2rgb_dst, + 0.5f / M_PI, + lut); +} + +static void fill_tone_lut(void *data, const struct sh_lut_params *params) +{ + const struct pl_tone_map_params *lut_params = params->priv; + pl_tone_map_generate(data, lut_params); +} + +static void fill_gamut_lut(void *data, const struct sh_lut_params *params) +{ + const struct pl_gamut_map_params *lut_params = params->priv; + const int lut_size = params->width * params->height * params->depth; + void *tmp = pl_alloc(NULL, lut_size * sizeof(float) * lut_params->lut_stride); + pl_gamut_map_generate(tmp, lut_params); + + // Convert to 16-bit unsigned integer for GPU texture + const float *in = tmp; + uint16_t *out = data; + pl_assert(lut_params->lut_stride == 3); + pl_assert(params->comps == 4); + for (int i = 0; i < lut_size; i++) { + out[0] = roundf(in[0] * UINT16_MAX); + out[1] = roundf(in[1] * UINT16_MAX + (UINT16_MAX >> 1)); + out[2] = roundf(in[2] * UINT16_MAX + (UINT16_MAX >> 1)); + in += 3; + out += 4; + } + + pl_free(tmp); +} + +void pl_shader_color_map_ex(pl_shader sh, const struct pl_color_map_params *params, + const struct pl_color_map_args *args) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + struct pl_color_space src = args->src, dst = args->dst; + pl_color_space_infer_map(&src, &dst); + if (pl_color_space_equal(&src, &dst)) { + if (args->prelinearized) + pl_shader_delinearize(sh, &dst); + return; + } + + struct sh_color_map_obj *obj = NULL; + if (args->state) { + pl_get_detected_hdr_metadata(*args->state, &src.hdr); + obj = SH_OBJ(sh, args->state, PL_SHADER_OBJ_COLOR_MAP, struct sh_color_map_obj, + sh_color_map_uninit); + if (!obj) + return; + } + + params = PL_DEF(params, &pl_color_map_default_params); + GLSL("// pl_shader_color_map \n" + "{ \n"); + + struct pl_tone_map_params tone = { + .function = PL_DEF(params->tone_mapping_function, &pl_tone_map_clip), + .constants = params->tone_constants, + .param = params->tone_mapping_param, + .input_scaling = PL_HDR_PQ, + .output_scaling = PL_HDR_PQ, + .lut_size = PL_DEF(params->lut_size, pl_color_map_default_params.lut_size), + .hdr = src.hdr, + }; + + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &src, + .metadata = params->metadata, + .scaling = tone.input_scaling, + .out_min = &tone.input_min, + .out_max = &tone.input_max, + .out_avg = &tone.input_avg, + )); + + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &dst, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = tone.output_scaling, + .out_min = &tone.output_min, + .out_max = &tone.output_max, + )); + + pl_tone_map_params_infer(&tone); + + // Round sufficiently similar values + if (fabs(tone.input_max - tone.output_max) < 1e-6) + tone.output_max = tone.input_max; + if (fabs(tone.input_min - tone.output_min) < 1e-6) + tone.output_min = tone.input_min; + + if (!params->inverse_tone_mapping) { + // Never exceed the source unless requested, but still allow + // black point adaptation + tone.output_max = PL_MIN(tone.output_max, tone.input_max); + } + + const int *lut3d_size_def = pl_color_map_default_params.lut3d_size; + struct pl_gamut_map_params gamut = { + .function = PL_DEF(params->gamut_mapping, &pl_gamut_map_clip), + .constants = params->gamut_constants, + .input_gamut = src.hdr.prim, + .output_gamut = dst.hdr.prim, + .lut_size_I = PL_DEF(params->lut3d_size[0], lut3d_size_def[0]), + .lut_size_C = 
PL_DEF(params->lut3d_size[1], lut3d_size_def[1]), + .lut_size_h = PL_DEF(params->lut3d_size[2], lut3d_size_def[2]), + .lut_stride = 3, + }; + + float src_peak_static; + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &src, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_PQ, + .out_max = &src_peak_static, + )); + + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( + .color = &dst, + .metadata = PL_HDR_METADATA_HDR10, + .scaling = PL_HDR_PQ, + .out_min = &gamut.min_luma, + .out_max = &gamut.max_luma, + )); + + // Clip the gamut mapping output to the input gamut if disabled + if (!params->gamut_expansion && gamut.function->bidirectional) { + if (pl_primaries_compatible(&gamut.input_gamut, &gamut.output_gamut)) { + gamut.output_gamut = pl_primaries_clip(&gamut.output_gamut, + &gamut.input_gamut); + } + } + + // Backwards compatibility with older API + switch (params->gamut_mode) { + case PL_GAMUT_CLIP: + switch (params->intent) { + case PL_INTENT_AUTO: + case PL_INTENT_PERCEPTUAL: + case PL_INTENT_RELATIVE_COLORIMETRIC: + break; // leave default + case PL_INTENT_SATURATION: + gamut.function = &pl_gamut_map_saturation; + break; + case PL_INTENT_ABSOLUTE_COLORIMETRIC: + gamut.function = &pl_gamut_map_absolute; + break; + } + break; + case PL_GAMUT_DARKEN: + gamut.function = &pl_gamut_map_darken; + break; + case PL_GAMUT_WARN: + gamut.function = &pl_gamut_map_highlight; + break; + case PL_GAMUT_DESATURATE: + gamut.function = &pl_gamut_map_desaturate; + break; + case PL_GAMUT_MODE_COUNT: + pl_unreachable(); + } + + bool can_fast = !params->force_tone_mapping_lut; + if (!args->state) { + // No state object provided, forcibly disable advanced methods + can_fast = true; + if (tone.function != &pl_tone_map_clip) + tone.function = &pl_tone_map_linear; + if (gamut.function != &pl_gamut_map_clip) + gamut.function = &pl_gamut_map_saturation; + } + + pl_fmt gamut_fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR); + if (!gamut_fmt) { + gamut.function = &pl_gamut_map_saturation; + can_fast = true; + } + + bool need_tone_map = !pl_tone_map_params_noop(&tone); + bool need_gamut_map = !pl_gamut_map_params_noop(&gamut); + + if (!args->prelinearized) + pl_shader_linearize(sh, &src); + + pl_matrix3x3 rgb2lms = pl_ipt_rgb2lms(pl_raw_primaries_get(src.primaries)); + pl_matrix3x3 lms2rgb = pl_ipt_lms2rgb(pl_raw_primaries_get(dst.primaries)); + ident_t lms2ipt = SH_MAT3(pl_ipt_lms2ipt); + ident_t ipt2lms = SH_MAT3(pl_ipt_ipt2lms); + + if (need_gamut_map && gamut.function == &pl_gamut_map_saturation && can_fast) { + const pl_matrix3x3 lms2src = pl_ipt_lms2rgb(&gamut.input_gamut); + const pl_matrix3x3 dst2lms = pl_ipt_rgb2lms(&gamut.output_gamut); + sh_describe(sh, "gamut map (saturation)"); + pl_matrix3x3_mul(&lms2rgb, &dst2lms); + pl_matrix3x3_mul(&lms2rgb, &lms2src); + need_gamut_map = false; + } + + // Fast path: simply convert between primaries (if needed) + if (!need_tone_map && !need_gamut_map) { + if (src.primaries != dst.primaries) { + sh_describe(sh, "colorspace conversion"); + pl_matrix3x3_mul(&lms2rgb, &rgb2lms); + GLSL("color.rgb = "$" * color.rgb; \n", SH_MAT3(lms2rgb)); + } + goto done; + } + + // Full path: convert input from normalized RGB to IPT + GLSL("vec3 lms = "$" * color.rgb; \n" + "vec3 lmspq = %f * lms; \n" + "lmspq = pow(max(lmspq, 0.0), vec3(%f)); \n" + "lmspq = (vec3(%f) + %f * lmspq) \n" + " / (vec3(1.0) + %f * lmspq); \n" + "lmspq = pow(lmspq, vec3(%f)); \n" + "vec3 ipt = "$" * lmspq; \n" + "float i_orig = ipt.x; \n", + 
SH_MAT3(rgb2lms), + PL_COLOR_SDR_WHITE / 10000, + PQ_M1, PQ_C1, PQ_C2, PQ_C3, PQ_M2, + lms2ipt); + + if (params->show_clipping) { + const float eps = 1e-6f; + GLSL("bool clip_hi, clip_lo; \n" + "clip_hi = any(greaterThan(color.rgb, vec3("$"))); \n" + "clip_lo = any(lessThan(color.rgb, vec3("$"))); \n" + "clip_hi = clip_hi || ipt.x > "$"; \n" + "clip_lo = clip_lo || ipt.x < "$"; \n", + SH_FLOAT_DYN(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_max) + eps), + SH_FLOAT(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, tone.input_min) - eps), + SH_FLOAT_DYN(tone.input_max + eps), + SH_FLOAT(tone.input_min - eps)); + } + + if (need_tone_map) { + const struct pl_tone_map_function *fun = tone.function; + sh_describef(sh, "%s tone map (%.0f -> %.0f)", fun->name, + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.input_max), + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, tone.output_max)); + + if (fun == &pl_tone_map_clip && can_fast) { + + GLSL("#define tone_map(x) clamp((x), "$", "$") \n", + SH_FLOAT(tone.input_min), + SH_FLOAT_DYN(tone.input_max)); + + } else if (fun == &pl_tone_map_linear && can_fast) { + + const float gain = tone.constants.exposure; + const float scale = tone.input_max - tone.input_min; + + ident_t linfun = sh_fresh(sh, "linear_pq"); + GLSLH("float "$"(float x) { \n" + // Stretch the input range (while clipping) + " x = "$" * x + "$"; \n" + " x = clamp(x, 0.0, 1.0); \n" + " x = "$" * x + "$"; \n" + " return x; \n" + "} \n", + linfun, + SH_FLOAT_DYN(gain / scale), + SH_FLOAT_DYN(-gain / scale * tone.input_min), + SH_FLOAT_DYN(tone.output_max - tone.output_min), + SH_FLOAT(tone.output_min)); + + GLSL("#define tone_map(x) ("$"(x)) \n", linfun); + + } else { + + pl_assert(obj); + ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->tone.lut, + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_AUTO, + .method = SH_LUT_LINEAR, + .width = tone.lut_size, + .comps = 1, + .update = !pl_tone_map_params_equal(&tone, &obj->tone.params), + .dynamic = tone.input_avg > 0, // dynamic metadata + .fill = fill_tone_lut, + .priv = &tone, + )); + obj->tone.params = tone; + if (!lut) { + SH_FAIL(sh, "Failed generating tone-mapping LUT!"); + return; + } + + const float lut_range = tone.input_max - tone.input_min; + GLSL("#define tone_map(x) ("$"("$" * (x) + "$")) \n", + lut, SH_FLOAT_DYN(1.0f / lut_range), + SH_FLOAT_DYN(-tone.input_min / lut_range)); + + } + + bool need_recovery = tone.input_max >= tone.output_max; + if (need_recovery && params->contrast_recovery && args->feature_map) { + ident_t pos, pt; + ident_t lowres = sh_bind(sh, args->feature_map, PL_TEX_ADDRESS_CLAMP, + PL_TEX_SAMPLE_LINEAR, "feature_map", + NULL, &pos, &pt); + + // Obtain HF detail map from bicubic interpolation of LF features + GLSL("vec2 lpos = "$"; \n" + "vec2 lpt = "$"; \n" + "vec2 lsize = vec2(textureSize("$", 0)); \n" + "vec2 frac = fract(lpos * lsize + vec2(0.5)); \n" + "vec2 frac2 = frac * frac; \n" + "vec2 inv = vec2(1.0) - frac; \n" + "vec2 inv2 = inv * inv; \n" + "vec2 w0 = 1.0/6.0 * inv2 * inv; \n" + "vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \n" + "vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \n" + "vec2 w3 = 1.0/6.0 * frac2 * frac; \n" + "vec4 g = vec4(w0 + w1, w2 + w3); \n" + "vec4 h = vec4(w1, w3) / g + inv.xyxy; \n" + "h.xy -= vec2(2.0); \n" + "vec4 p = lpos.xyxy + lpt.xyxy * h; \n" + "float l00 = textureLod("$", p.xy, 0.0).r; \n" + "float l01 = textureLod("$", p.xw, 0.0).r; \n" + "float l0 = mix(l01, l00, g.y); \n" + "float l10 = textureLod("$", p.zy, 0.0).r; \n" + "float l11 = textureLod("$", p.zw, 0.0).r; \n" + 
"float l1 = mix(l11, l10, g.y); \n" + "float luma = mix(l1, l0, g.x); \n" + // Mix low-resolution tone mapped image with high-resolution + // tone mapped image according to desired strength. + "float highres = clamp(ipt.x, 0.0, 1.0); \n" + "float lowres = clamp(luma, 0.0, 1.0); \n" + "float detail = highres - lowres; \n" + "float base = tone_map(highres); \n" + "float sharp = tone_map(lowres) + detail; \n" + "ipt.x = clamp(mix(base, sharp, "$"), "$", "$"); \n", + pos, pt, lowres, + lowres, lowres, lowres, lowres, + SH_FLOAT(params->contrast_recovery), + SH_FLOAT(tone.output_min), SH_FLOAT_DYN(tone.output_max)); + + } else { + + GLSL("ipt.x = tone_map(ipt.x); \n"); + } + + // Avoid raising saturation excessively when raising brightness, and + // also desaturate when reducing brightness greatly to account for the + // reduction in gamut volume. + GLSL("vec2 hull = vec2(i_orig, ipt.x); \n" + "hull = ((hull - 6.0) * hull + 9.0) * hull; \n" + "ipt.yz *= min(i_orig / ipt.x, hull.y / hull.x); \n"); + } + + if (need_gamut_map) { + const struct pl_gamut_map_function *fun = gamut.function; + sh_describef(sh, "gamut map (%s)", fun->name); + + pl_assert(obj); + ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->gamut.lut, + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .fmt = gamut_fmt, + .method = params->lut3d_tricubic ? SH_LUT_CUBIC : SH_LUT_LINEAR, + .width = gamut.lut_size_I, + .height = gamut.lut_size_C, + .depth = gamut.lut_size_h, + .comps = 4, + .signature = gamut_map_signature(&gamut), + .cache = SH_CACHE(sh), + .fill = fill_gamut_lut, + .priv = &gamut, + )); + if (!lut) { + SH_FAIL(sh, "Failed generating gamut-mapping LUT!"); + return; + } + + // 3D LUT lookup (in ICh space) + const float lut_range = gamut.max_luma - gamut.min_luma; + GLSL("vec3 idx; \n" + "idx.x = "$" * ipt.x + "$"; \n" + "idx.y = 2.0 * length(ipt.yz); \n" + "idx.z = %f * atan(ipt.z, ipt.y) + 0.5;\n" + "ipt = "$"(idx).xyz; \n" + "ipt.yz -= vec2(32768.0/65535.0); \n", + SH_FLOAT(1.0f / lut_range), + SH_FLOAT(-gamut.min_luma / lut_range), + 0.5f / M_PI, lut); + + if (params->show_clipping) { + GLSL("clip_lo = clip_lo || any(lessThan(idx, vec3(0.0))); \n" + "clip_hi = clip_hi || any(greaterThan(idx, vec3(1.0))); \n"); + } + + if (params->visualize_lut) { + visualize_gamut_map(sh, params->visualize_rect, lut, + params->visualize_hue, params->visualize_theta, + &gamut); + } + } + + // Convert IPT back to linear RGB + GLSL("lmspq = "$" * ipt; \n" + "lms = pow(max(lmspq, 0.0), vec3(1.0/%f)); \n" + "lms = max(lms - vec3(%f), 0.0) \n" + " / (vec3(%f) - %f * lms); \n" + "lms = pow(lms, vec3(1.0/%f)); \n" + "lms *= %f; \n" + "color.rgb = "$" * lms; \n", + ipt2lms, + PQ_M2, PQ_C1, PQ_C2, PQ_C3, PQ_M1, + 10000 / PL_COLOR_SDR_WHITE, + SH_MAT3(lms2rgb)); + + if (params->show_clipping) { + GLSL("if (clip_hi) { \n" + " float k = dot(color.rgb, vec3(2.0 / 3.0)); \n" + " color.rgb = clamp(vec3(k) - color.rgb, 0.0, 1.0); \n" + " float cmin = min(min(color.r, color.g), color.b); \n" + " float cmax = max(max(color.r, color.g), color.b); \n" + " float delta = cmax - cmin; \n" + " vec3 sat = smoothstep(cmin - 1e-6, cmax, color.rgb); \n" + " const vec3 red = vec3(1.0, 0.0, 0.0); \n" + " color.rgb = mix(red, sat, smoothstep(0.0, 0.3, delta)); \n" + "} else if (clip_lo) { \n" + " vec3 hi = vec3(0.0, 0.3, 0.3); \n" + " color.rgb = mix(color.rgb, hi, 0.5); \n" + "} \n"); + } + + if (need_tone_map) { + if (params->visualize_lut) { + float alpha = need_gamut_map ? 
powf(cosf(params->visualize_theta), 5.0f) : 1.0f; + visualize_tone_map(sh, params->visualize_rect, alpha, &tone); + } + GLSL("#undef tone_map \n"); + } + +done: + pl_shader_delinearize(sh, &dst); + GLSL("}\n"); +} + +// Backwards compatibility wrapper around `pl_shader_color_map_ex` +void pl_shader_color_map(pl_shader sh, const struct pl_color_map_params *params, + struct pl_color_space src, struct pl_color_space dst, + pl_shader_obj *state, bool prelinearized) +{ + pl_shader_color_map_ex(sh, params, pl_color_map_args( + .src = src, + .dst = dst, + .prelinearized = prelinearized, + .state = state, + .feature_map = NULL + )); +} + +void pl_shader_cone_distort(pl_shader sh, struct pl_color_space csp, + const struct pl_cone_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + if (!params || !params->cones) + return; + + sh_describe(sh, "cone distortion"); + GLSL("// pl_shader_cone_distort\n"); + GLSL("{\n"); + + pl_color_space_infer(&csp); + pl_shader_linearize(sh, &csp); + + pl_matrix3x3 cone_mat; + cone_mat = pl_get_cone_matrix(params, pl_raw_primaries_get(csp.primaries)); + GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("cone_mat"), + .data = PL_TRANSPOSE_3X3(cone_mat.m), + })); + + pl_shader_delinearize(sh, &csp); + GLSL("}\n"); +} diff --git a/src/shaders/custom.c b/src/shaders/custom.c new file mode 100644 index 0000000..3f03e57 --- /dev/null +++ b/src/shaders/custom.c @@ -0,0 +1,89 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "shaders.h" + +#include <libplacebo/shaders/custom.h> + +bool pl_shader_custom(pl_shader sh, const struct pl_custom_shader *params) +{ + if (params->compute) { + int bw = PL_DEF(params->compute_group_size[0], 16); + int bh = PL_DEF(params->compute_group_size[1], 16); + bool flex = !params->compute_group_size[0] || + !params->compute_group_size[1]; + if (!sh_try_compute(sh, bw, bh, flex, params->compute_shmem)) + return false; + } + + if (!sh_require(sh, params->input, params->output_w, params->output_h)) + return false; + + sh->output = params->output; + + for (int i = 0; i < params->num_variables; i++) { + struct pl_shader_var sv = params->variables[i]; + GLSLP("#define %s "$"\n", sv.var.name, sh_var(sh, sv)); + } + + for (int i = 0; i < params->num_descriptors; i++) { + struct pl_shader_desc sd = params->descriptors[i]; + GLSLP("#define %s "$"\n", sd.desc.name, sh_desc(sh, sd)); + } + + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct pl_shader_va sva = params->vertex_attribs[i]; + GLSLP("#define %s "$"\n", sva.attr.name, sh_attr(sh, sva)); + } + + for (int i = 0; i < params->num_constants; i++) { + struct pl_shader_const sc = params->constants[i]; + GLSLP("#define %s "$"\n", sc.name, sh_const(sh, sc)); + } + + if (params->prelude) + GLSLP("// pl_shader_custom prelude: \n%s\n", params->prelude); + if (params->header) + GLSLH("// pl_shader_custom header: \n%s\n", params->header); + + if (params->description) + sh_describef(sh, "%s", params->description); + + if (params->body) { + const char *output_decl = ""; + if (params->output != params->input) { + switch (params->output) { + case PL_SHADER_SIG_NONE: break; + case PL_SHADER_SIG_COLOR: + output_decl = "vec4 color = vec4(0.0);"; + break; + + case PL_SHADER_SIG_SAMPLER: + pl_unreachable(); + } + } + + GLSL("// pl_shader_custom \n" + "%s \n" + "{ \n" + "%s \n" + "} \n", + output_decl, params->body); + } + + return true; +} diff --git a/src/shaders/custom_mpv.c b/src/shaders/custom_mpv.c new file mode 100644 index 0000000..4ef0817 --- /dev/null +++ b/src/shaders/custom_mpv.c @@ -0,0 +1,1768 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> +#include <limits.h> + +#include "gpu.h" +#include "shaders.h" + +#include <libplacebo/shaders/colorspace.h> +#include <libplacebo/shaders/custom.h> + +// Hard-coded size limits, mainly for convenience (to avoid dynamic memory) +#define SHADER_MAX_HOOKS 16 +#define SHADER_MAX_BINDS 16 +#define MAX_SHEXP_SIZE 32 + +enum shexp_op { + SHEXP_OP_ADD, + SHEXP_OP_SUB, + SHEXP_OP_MUL, + SHEXP_OP_DIV, + SHEXP_OP_MOD, + SHEXP_OP_NOT, + SHEXP_OP_GT, + SHEXP_OP_LT, + SHEXP_OP_EQ, +}; + +enum shexp_tag { + SHEXP_END = 0, // End of an RPN expression + SHEXP_CONST, // Push a constant value onto the stack + SHEXP_TEX_W, // Get the width/height of a named texture (variable) + SHEXP_TEX_H, + SHEXP_OP2, // Pop two elements and push the result of a dyadic operation + SHEXP_OP1, // Pop one element and push the result of a monadic operation + SHEXP_VAR, // Arbitrary variable (e.g. shader parameters) +}; + +struct shexp { + enum shexp_tag tag; + union { + float cval; + pl_str varname; + enum shexp_op op; + } val; +}; + +struct custom_shader_hook { + // Variable/literal names of textures + pl_str pass_desc; + pl_str hook_tex[SHADER_MAX_HOOKS]; + pl_str bind_tex[SHADER_MAX_BINDS]; + pl_str save_tex; + + // Shader body itself + metadata + pl_str pass_body; + float offset[2]; + bool offset_align; + int comps; + + // Special expressions governing the output size and execution conditions + struct shexp width[MAX_SHEXP_SIZE]; + struct shexp height[MAX_SHEXP_SIZE]; + struct shexp cond[MAX_SHEXP_SIZE]; + + // Special metadata for compute shaders + bool is_compute; + int block_w, block_h; // Block size (each block corresponds to one WG) + int threads_w, threads_h; // How many threads form a WG +}; + +static bool parse_rpn_shexpr(pl_str line, struct shexp out[MAX_SHEXP_SIZE]) +{ + int pos = 0; + + while (line.len > 0) { + pl_str word = pl_str_split_char(line, ' ', &line); + if (word.len == 0) + continue; + + if (pos >= MAX_SHEXP_SIZE) + return false; + + struct shexp *exp = &out[pos++]; + + if (pl_str_eatend0(&word, ".w") || pl_str_eatend0(&word, ".width")) { + exp->tag = SHEXP_TEX_W; + exp->val.varname = word; + continue; + } + + if (pl_str_eatend0(&word, ".h") || pl_str_eatend0(&word, ".height")) { + exp->tag = SHEXP_TEX_H; + exp->val.varname = word; + continue; + } + + switch (word.buf[0]) { + case '+': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_ADD; continue; + case '-': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_SUB; continue; + case '*': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_MUL; continue; + case '/': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_DIV; continue; + case '%': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_MOD; continue; + case '!': exp->tag = SHEXP_OP1; exp->val.op = SHEXP_OP_NOT; continue; + case '>': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_GT; continue; + case '<': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_LT; continue; + case '=': exp->tag = SHEXP_OP2; exp->val.op = SHEXP_OP_EQ; continue; + } + + if (word.buf[0] >= '0' && word.buf[0] <= '9') { + exp->tag = SHEXP_CONST; + if (!pl_str_parse_float(word, &exp->val.cval)) + return false; + continue; + } + + // Treat as generic variable + exp->tag = SHEXP_VAR; + exp->val.varname = word; + } + + return true; +} + +static inline pl_str split_magic(pl_str *body) +{ + pl_str ret = pl_str_split_str0(*body, "//!", body); + if (body->len) { + // Make sure the separator is included in the remainder + body->buf -= 3; + body->len += 3; + } + + return ret; +} + +static bool parse_hook(pl_log log, pl_str *body, struct 
custom_shader_hook *out) +{ + *out = (struct custom_shader_hook){ + .pass_desc = pl_str0("unknown user shader"), + .width = {{ SHEXP_TEX_W, { .varname = pl_str0("HOOKED") }}}, + .height = {{ SHEXP_TEX_H, { .varname = pl_str0("HOOKED") }}}, + .cond = {{ SHEXP_CONST, { .cval = 1.0 }}}, + }; + + int hook_idx = 0; + int bind_idx = 0; + + // Parse all headers + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + // Check for the presence of the magic line beginning + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + // Parse the supported commands + if (pl_str_eatstart0(&line, "HOOK")) { + if (hook_idx == SHADER_MAX_HOOKS) { + pl_err(log, "Passes may only hook up to %d textures!", + SHADER_MAX_HOOKS); + return false; + } + out->hook_tex[hook_idx++] = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "BIND")) { + if (bind_idx == SHADER_MAX_BINDS) { + pl_err(log, "Passes may only bind up to %d textures!", + SHADER_MAX_BINDS); + return false; + } + out->bind_tex[bind_idx++] = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "SAVE")) { + pl_str save_tex = pl_str_strip(line); + if (pl_str_equals0(save_tex, "HOOKED")) { + // This is a special name that means "overwrite existing" + // texture, which we just signal by not having any `save_tex` + // name set. + out->save_tex = (pl_str) {0}; + } else if (pl_str_equals0(save_tex, "MAIN")) { + // Compatibility alias + out->save_tex = pl_str0("MAINPRESUB"); + } else { + out->save_tex = save_tex; + }; + continue; + } + + if (pl_str_eatstart0(&line, "DESC")) { + out->pass_desc = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "OFFSET")) { + line = pl_str_strip(line); + if (pl_str_equals0(line, "ALIGN")) { + out->offset_align = true; + } else { + if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &out->offset[0]) || + !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &out->offset[1]) || + line.len) + { + pl_err(log, "Error while parsing OFFSET!"); + return false; + } + } + continue; + } + + if (pl_str_eatstart0(&line, "WIDTH")) { + if (!parse_rpn_shexpr(line, out->width)) { + pl_err(log, "Error while parsing WIDTH!"); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "HEIGHT")) { + if (!parse_rpn_shexpr(line, out->height)) { + pl_err(log, "Error while parsing HEIGHT!"); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "WHEN")) { + if (!parse_rpn_shexpr(line, out->cond)) { + pl_err(log, "Error while parsing WHEN!"); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "COMPONENTS")) { + if (!pl_str_parse_int(pl_str_strip(line), &out->comps)) { + pl_err(log, "Error parsing COMPONENTS: '%.*s'", PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "COMPUTE")) { + line = pl_str_strip(line); + bool ok = pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->block_w) && + pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->block_h); + + line = pl_str_strip(line); + if (ok && line.len) { + ok = pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->threads_w) && + pl_str_parse_int(pl_str_split_char(line, ' ', &line), &out->threads_h) && + !line.len; + } else { + out->threads_w = out->block_w; + out->threads_h = out->block_h; + } + + if (!ok) { + pl_err(log, "Error while parsing COMPUTE!"); + return false; + } + + out->is_compute = true; + continue; + } + + // Unknown command type + pl_err(log, "Unrecognized command '%.*s'!", 
PL_STR_FMT(line)); + return false; + } + + // The rest of the file up until the next magic line beginning (if any) + // shall be the shader body + out->pass_body = split_magic(body); + + // Sanity checking + if (hook_idx == 0) + pl_warn(log, "Pass has no hooked textures (will be ignored)!"); + + return true; +} + +static bool parse_tex(pl_gpu gpu, void *alloc, pl_str *body, + struct pl_shader_desc *out) +{ + *out = (struct pl_shader_desc) { + .desc = { + .name = "USER_TEX", + .type = PL_DESC_SAMPLED_TEX, + }, + }; + + struct pl_tex_params params = { + .w = 1, .h = 1, .d = 0, + .sampleable = true, + .debug_tag = PL_DEBUG_TAG, + }; + + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + if (pl_str_eatstart0(&line, "TEXTURE")) { + out->desc.name = pl_strdup0(alloc, pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "SIZE")) { + line = pl_str_strip(line); + int dims = 0; + int dim[4]; // extra space to catch invalid extra entries + while (line.len && dims < PL_ARRAY_SIZE(dim)) { + if (!pl_str_parse_int(pl_str_split_char(line, ' ', &line), &dim[dims++])) { + PL_ERR(gpu, "Error while parsing SIZE!"); + return false; + } + } + + uint32_t lim = dims == 1 ? gpu->limits.max_tex_1d_dim + : dims == 2 ? gpu->limits.max_tex_2d_dim + : dims == 3 ? gpu->limits.max_tex_3d_dim + : 0; + + // Sanity check against GPU size limits + switch (dims) { + case 3: + params.d = dim[2]; + if (params.d < 1 || params.d > lim) { + PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!", + params.d, lim); + return false; + } + // fall through + case 2: + params.h = dim[1]; + if (params.h < 1 || params.h > lim) { + PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!", + params.h, lim); + return false; + } + // fall through + case 1: + params.w = dim[0]; + if (params.w < 1 || params.w > lim) { + PL_ERR(gpu, "SIZE %d exceeds GPU's texture size limits (%d)!", + params.w, lim); + return false; + } + break; + + default: + PL_ERR(gpu, "Invalid number of texture dimensions!"); + return false; + }; + + // Clear out the superfluous components + if (dims < 3) + params.d = 0; + if (dims < 2) + params.h = 0; + continue; + } + + if (pl_str_eatstart0(&line, "FORMAT")) { + line = pl_str_strip(line); + params.format = NULL; + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + if (pl_str_equals0(line, fmt->name)) { + params.format = fmt; + break; + } + } + + if (!params.format || params.format->opaque) { + PL_ERR(gpu, "Unrecognized/unavailable FORMAT name: '%.*s'!", + PL_STR_FMT(line)); + return false; + } + + if (!(params.format->caps & PL_FMT_CAP_SAMPLEABLE)) { + PL_ERR(gpu, "Chosen FORMAT '%.*s' is not sampleable!", + PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "FILTER")) { + line = pl_str_strip(line); + if (pl_str_equals0(line, "LINEAR")) { + out->binding.sample_mode = PL_TEX_SAMPLE_LINEAR; + } else if (pl_str_equals0(line, "NEAREST")) { + out->binding.sample_mode = PL_TEX_SAMPLE_NEAREST; + } else { + PL_ERR(gpu, "Unrecognized FILTER: '%.*s'!", PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "BORDER")) { + line = pl_str_strip(line); + if (pl_str_equals0(line, "CLAMP")) { + out->binding.address_mode = PL_TEX_ADDRESS_CLAMP; + } else if (pl_str_equals0(line, "REPEAT")) { + out->binding.address_mode = PL_TEX_ADDRESS_REPEAT; + } else if (pl_str_equals0(line, "MIRROR")) { + 
out->binding.address_mode = PL_TEX_ADDRESS_MIRROR; + } else { + PL_ERR(gpu, "Unrecognized BORDER: '%.*s'!", PL_STR_FMT(line)); + return false; + } + continue; + } + + if (pl_str_eatstart0(&line, "STORAGE")) { + params.storable = true; + out->desc.type = PL_DESC_STORAGE_IMG; + out->desc.access = PL_DESC_ACCESS_READWRITE; + out->memory = PL_MEMORY_COHERENT; + continue; + } + + PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(line)); + return false; + } + + if (!params.format) { + PL_ERR(gpu, "No FORMAT specified!"); + return false; + } + + int caps = params.format->caps; + if (out->binding.sample_mode == PL_TEX_SAMPLE_LINEAR && !(caps & PL_FMT_CAP_LINEAR)) { + PL_ERR(gpu, "The specified texture format cannot be linear filtered!"); + return false; + } + + // Decode the rest of the section (up to the next //! marker) as raw hex + // data for the texture + pl_str tex, hexdata = split_magic(body); + if (!pl_str_decode_hex(NULL, pl_str_strip(hexdata), &tex)) { + PL_ERR(gpu, "Error while parsing TEXTURE body: must be a valid " + "hexadecimal sequence!"); + return false; + } + + int texels = params.w * PL_DEF(params.h, 1) * PL_DEF(params.d, 1); + size_t expected_len = texels * params.format->texel_size; + if (tex.len == 0 && params.storable) { + // In this case, it's okay that the texture has no initial data + pl_free_ptr(&tex.buf); + } else if (tex.len != expected_len) { + PL_ERR(gpu, "Shader TEXTURE size mismatch: got %zu bytes, expected %zu!", + tex.len, expected_len); + pl_free(tex.buf); + return false; + } + + params.initial_data = tex.buf; + out->binding.object = pl_tex_create(gpu, ¶ms); + pl_free(tex.buf); + + if (!out->binding.object) { + PL_ERR(gpu, "Failed creating custom texture!"); + return false; + } + + return true; +} + +static bool parse_buf(pl_gpu gpu, void *alloc, pl_str *body, + struct pl_shader_desc *out) +{ + *out = (struct pl_shader_desc) { + .desc = { + .name = "USER_BUF", + .type = PL_DESC_BUF_UNIFORM, + }, + }; + + // Temporary, to allow deferring variable placement until all headers + // have been processed (in order to e.g. 
determine buffer type) + void *tmp = pl_tmp(alloc); // will be freed automatically on failure + PL_ARRAY(struct pl_var) vars = {0}; + + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + if (pl_str_eatstart0(&line, "BUFFER")) { + out->desc.name = pl_strdup0(alloc, pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "STORAGE")) { + out->desc.type = PL_DESC_BUF_STORAGE; + out->desc.access = PL_DESC_ACCESS_READWRITE; + out->memory = PL_MEMORY_COHERENT; + continue; + } + + if (pl_str_eatstart0(&line, "VAR")) { + pl_str type_name = pl_str_split_char(pl_str_strip(line), ' ', &line); + struct pl_var var = {0}; + for (const struct pl_named_var *nv = pl_var_glsl_types; nv->glsl_name; nv++) { + if (pl_str_equals0(type_name, nv->glsl_name)) { + var = nv->var; + break; + } + } + + if (!var.type) { + // No type found + PL_ERR(gpu, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(type_name)); + return false; + } + + pl_str var_name = pl_str_split_char(line, '[', &line); + if (line.len > 0) { + // Parse array dimension + if (!pl_str_parse_int(pl_str_split_char(line, ']', NULL), &var.dim_a)) { + PL_ERR(gpu, "Failed parsing array dimension from [%.*s!", + PL_STR_FMT(line)); + return false; + } + + if (var.dim_a < 1) { + PL_ERR(gpu, "Invalid array dimension %d!", var.dim_a); + return false; + } + } + + var.name = pl_strdup0(alloc, pl_str_strip(var_name)); + PL_ARRAY_APPEND(tmp, vars, var); + continue; + } + + PL_ERR(gpu, "Unrecognized command '%.*s'!", PL_STR_FMT(line)); + return false; + } + + // Try placing all of the buffer variables + for (int i = 0; i < vars.num; i++) { + if (!sh_buf_desc_append(alloc, gpu, out, NULL, vars.elem[i])) { + PL_ERR(gpu, "Custom buffer exceeds GPU limitations!"); + return false; + } + } + + // Decode the rest of the section (up to the next //! marker) as raw hex + // data for the buffer + pl_str data, hexdata = split_magic(body); + if (!pl_str_decode_hex(tmp, pl_str_strip(hexdata), &data)) { + PL_ERR(gpu, "Error while parsing BUFFER body: must be a valid " + "hexadecimal sequence!"); + return false; + } + + size_t buf_size = sh_buf_desc_size(out); + if (data.len == 0 && out->desc.type == PL_DESC_BUF_STORAGE) { + // In this case, it's okay that the buffer has no initial data + } else if (data.len != buf_size) { + PL_ERR(gpu, "Shader BUFFER size mismatch: got %zu bytes, expected %zu!", + data.len, buf_size); + return false; + } + + out->binding.object = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .uniform = out->desc.type == PL_DESC_BUF_UNIFORM, + .storable = out->desc.type == PL_DESC_BUF_STORAGE, + .initial_data = data.len ? 
data.buf : NULL, + )); + + if (!out->binding.object) { + PL_ERR(gpu, "Failed creating custom buffer!"); + return false; + } + + pl_free(tmp); + return true; +} + +static bool parse_var(pl_log log, pl_str str, enum pl_var_type type, pl_var_data *out) +{ + if (!str.len) + return true; + + pl_str buf = str; + bool ok = false; + switch (type) { + case PL_VAR_SINT: + ok = pl_str_parse_int(pl_str_split_char(buf, ' ', &buf), &out->i); + break; + case PL_VAR_UINT: + ok = pl_str_parse_uint(pl_str_split_char(buf, ' ', &buf), &out->u); + break; + case PL_VAR_FLOAT: + ok = pl_str_parse_float(pl_str_split_char(buf, ' ', &buf), &out->f); + break; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + + if (pl_str_strip(buf).len > 0) + ok = false; // left-over garbage + + if (!ok) { + pl_err(log, "Failed parsing variable data: %.*s", PL_STR_FMT(str)); + return false; + } + + return true; +} + +static bool check_bounds(pl_log log, enum pl_var_type type, const pl_var_data data, + const pl_var_data minimum, const pl_var_data maximum) +{ +#define CHECK_BOUNDS(v, fmt) do \ +{ \ + if (data.v < minimum.v) { \ + pl_err(log, "Initial value "fmt" below declared minimum "fmt"!", \ + data.v, minimum.v); \ + return false; \ + } \ + if (data.v > maximum.v) { \ + pl_err(log, "Initial value "fmt" above declared maximum "fmt"!", \ + data.v, maximum.v); \ + return false; \ + } \ +} while (0) + + switch (type) { + case PL_VAR_SINT: + CHECK_BOUNDS(i, "%d"); + break; + case PL_VAR_UINT: + CHECK_BOUNDS(u, "%u"); + break; + case PL_VAR_FLOAT: + CHECK_BOUNDS(f, "%f"); + break; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + +#undef CHECK_BOUNDS + return true; +} + +static bool parse_param(pl_log log, void *alloc, pl_str *body, + struct pl_hook_par *out) +{ + *out = (struct pl_hook_par) {0}; + pl_str minimum = {0}; + pl_str maximum = {0}; + bool is_enum = false; + + while (true) { + pl_str rest; + pl_str line = pl_str_strip(pl_str_getline(*body, &rest)); + + if (!pl_str_eatstart0(&line, "//!")) + break; + + *body = rest; + + if (pl_str_eatstart0(&line, "PARAM")) { + out->name = pl_strdup0(alloc, pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "DESC")) { + out->description = pl_strdup0(alloc, pl_str_strip(line)); + continue; + } + + if (pl_str_eatstart0(&line, "MINIMUM")) { + minimum = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "MAXIMUM")) { + maximum = pl_str_strip(line); + continue; + } + + if (pl_str_eatstart0(&line, "TYPE")) { + line = pl_str_strip(line); + is_enum = pl_str_eatstart0(&line, "ENUM"); + line = pl_str_strip(line); + if (pl_str_eatstart0(&line, "DYNAMIC")) { + out->mode = PL_HOOK_PAR_DYNAMIC; + } else if (pl_str_eatstart0(&line, "CONSTANT")) { + out->mode = PL_HOOK_PAR_CONSTANT; + } else if (pl_str_eatstart0(&line, "DEFINE")) { + out->mode = PL_HOOK_PAR_DEFINE; + out->type = PL_VAR_SINT; + if (pl_str_strip(line).len > 0) { + pl_err(log, "TYPE DEFINE does not take any extra arguments, " + "unexpected: '%.*s'", PL_STR_FMT(line)); + return false; + } + continue; + } else { + out->mode = PL_HOOK_PAR_VARIABLE; + } + + line = pl_str_strip(line); + for (const struct pl_named_var *nv = pl_var_glsl_types; + nv->glsl_name; nv++) + { + if (pl_str_equals0(line, nv->glsl_name)) { + if (nv->var.dim_v > 1 || nv->var.dim_m > 1) { + pl_err(log, "GLSL type '%s' is incompatible with " + "shader parameters, must be scalar type!", + nv->glsl_name); + return false; + } + + out->type = nv->var.type; + if (is_enum && out->type != PL_VAR_SINT) { 
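+ // (ENUM parameters are backed by a plain int index into the //!PARAM
+ // value-name list; see below, where minimum/maximum become 0 and
+ // names.num - 1, so no other scalar type can represent them.)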
+ pl_err(log, "ENUM is only compatible with type int/DEFINE!"); + return false; + } + goto next; + } + } + + pl_err(log, "Unrecognized GLSL type '%.*s'!", PL_STR_FMT(line)); + return false; + } + + pl_err(log, "Unrecognized command '%.*s'!", PL_STR_FMT(line)); + return false; + +next: ; + } + + switch (out->type) { + case PL_VAR_INVALID: + pl_err(log, "Missing variable type!"); + return false; + case PL_VAR_SINT: + out->minimum.i = INT_MIN; + out->maximum.i = INT_MAX; + break; + case PL_VAR_UINT: + out->minimum.u = 0; + out->maximum.u = UINT_MAX; + break; + case PL_VAR_FLOAT: + out->minimum.f = -INFINITY; + out->maximum.f = INFINITY; + break; + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + + pl_str initial = pl_str_strip(split_magic(body)); + if (!initial.len) { + pl_err(log, "Missing initial parameter value!"); + return false; + } + + if (is_enum) { + PL_ARRAY(const char *) names = {0}; + pl_assert(out->type == PL_VAR_SINT); + do { + pl_str line = pl_str_strip(pl_str_getline(initial, &initial)); + if (!line.len) + continue; + PL_ARRAY_APPEND(alloc, names, pl_strdup0(alloc, line)); + } while (initial.len); + + pl_assert(names.num >= 1); + out->initial.i = 0; + out->minimum.i = 0; + out->maximum.i = names.num - 1; + out->names = names.elem; + } else { + if (!parse_var(log, initial, out->type, &out->initial)) + return false; + if (!parse_var(log, minimum, out->type, &out->minimum)) + return false; + if (!parse_var(log, maximum, out->type, &out->maximum)) + return false; + if (!check_bounds(log, out->type, out->initial, out->minimum, out->maximum)) + return false; + } + + out->data = pl_memdup(alloc, &out->initial, sizeof(out->initial)); + return true; +} + +static enum pl_hook_stage mp_stage_to_pl(pl_str stage) +{ + if (pl_str_equals0(stage, "RGB")) + return PL_HOOK_RGB_INPUT; + if (pl_str_equals0(stage, "LUMA")) + return PL_HOOK_LUMA_INPUT; + if (pl_str_equals0(stage, "CHROMA")) + return PL_HOOK_CHROMA_INPUT; + if (pl_str_equals0(stage, "ALPHA")) + return PL_HOOK_ALPHA_INPUT; + if (pl_str_equals0(stage, "XYZ")) + return PL_HOOK_XYZ_INPUT; + + if (pl_str_equals0(stage, "CHROMA_SCALED")) + return PL_HOOK_CHROMA_SCALED; + if (pl_str_equals0(stage, "ALPHA_SCALED")) + return PL_HOOK_ALPHA_SCALED; + + if (pl_str_equals0(stage, "NATIVE")) + return PL_HOOK_NATIVE; + if (pl_str_equals0(stage, "MAINPRESUB")) + return PL_HOOK_RGB; + if (pl_str_equals0(stage, "MAIN")) + return PL_HOOK_RGB; // Note: conflicts with above! 
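+ // Illustration: because "MAIN" and "MAINPRESUB" both resolve to
+ // PL_HOOK_RGB, a pass containing both "//!HOOK MAIN" and
+ // "//!HOOK MAINPRESUB" simply ends up registered for PL_HOOK_RGB once;
+ // the distinction only exists for compatibility with mpv user shaders.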
+ + if (pl_str_equals0(stage, "LINEAR")) + return PL_HOOK_LINEAR; + if (pl_str_equals0(stage, "SIGMOID")) + return PL_HOOK_SIGMOID; + if (pl_str_equals0(stage, "PREKERNEL")) + return PL_HOOK_PRE_KERNEL; + if (pl_str_equals0(stage, "POSTKERNEL")) + return PL_HOOK_POST_KERNEL; + + if (pl_str_equals0(stage, "SCALED")) + return PL_HOOK_SCALED; + if (pl_str_equals0(stage, "PREOUTPUT")) + return PL_HOOK_PRE_OUTPUT; + if (pl_str_equals0(stage, "OUTPUT")) + return PL_HOOK_OUTPUT; + + return 0; +} + +static pl_str pl_stage_to_mp(enum pl_hook_stage stage) +{ + switch (stage) { + case PL_HOOK_RGB_INPUT: return pl_str0("RGB"); + case PL_HOOK_LUMA_INPUT: return pl_str0("LUMA"); + case PL_HOOK_CHROMA_INPUT: return pl_str0("CHROMA"); + case PL_HOOK_ALPHA_INPUT: return pl_str0("ALPHA"); + case PL_HOOK_XYZ_INPUT: return pl_str0("XYZ"); + + case PL_HOOK_CHROMA_SCALED: return pl_str0("CHROMA_SCALED"); + case PL_HOOK_ALPHA_SCALED: return pl_str0("ALPHA_SCALED"); + + case PL_HOOK_NATIVE: return pl_str0("NATIVE"); + case PL_HOOK_RGB: return pl_str0("MAINPRESUB"); + + case PL_HOOK_LINEAR: return pl_str0("LINEAR"); + case PL_HOOK_SIGMOID: return pl_str0("SIGMOID"); + case PL_HOOK_PRE_KERNEL: return pl_str0("PREKERNEL"); + case PL_HOOK_POST_KERNEL: return pl_str0("POSTKERNEL"); + + case PL_HOOK_SCALED: return pl_str0("SCALED"); + case PL_HOOK_PRE_OUTPUT: return pl_str0("PREOUTPUT"); + case PL_HOOK_OUTPUT: return pl_str0("OUTPUT"); + }; + + pl_unreachable(); +} + +struct hook_pass { + enum pl_hook_stage exec_stages; + struct custom_shader_hook hook; +}; + +struct pass_tex { + pl_str name; + pl_tex tex; + + // Metadata + pl_rect2df rect; + struct pl_color_repr repr; + struct pl_color_space color; + int comps; +}; + +struct hook_priv { + pl_log log; + pl_gpu gpu; + void *alloc; + + PL_ARRAY(struct hook_pass) hook_passes; + PL_ARRAY(struct pl_hook_par) hook_params; + + // Fixed (for shader-local resources) + PL_ARRAY(struct pl_shader_desc) descriptors; + + // Dynamic per pass + enum pl_hook_stage save_stages; + PL_ARRAY(struct pass_tex) pass_textures; + pl_shader trc_helper; + + // State for PRNG/frame count + int frame_count; + uint64_t prng_state[4]; +}; + +static void hook_reset(void *priv) +{ + struct hook_priv *p = priv; + p->pass_textures.num = 0; +} + +// Context during execution of a hook +struct hook_ctx { + struct hook_priv *priv; + const struct pl_hook_params *params; + struct pass_tex hooked; +}; + +static bool lookup_tex(struct hook_ctx *ctx, pl_str var, float size[2]) +{ + struct hook_priv *p = ctx->priv; + const struct pl_hook_params *params = ctx->params; + + if (pl_str_equals0(var, "HOOKED")) { + pl_assert(ctx->hooked.tex); + size[0] = ctx->hooked.tex->params.w; + size[1] = ctx->hooked.tex->params.h; + return true; + } + + if (pl_str_equals0(var, "NATIVE_CROPPED")) { + size[0] = fabs(pl_rect_w(params->src_rect)); + size[1] = fabs(pl_rect_h(params->src_rect)); + return true; + } + + if (pl_str_equals0(var, "OUTPUT")) { + size[0] = abs(pl_rect_w(params->dst_rect)); + size[1] = abs(pl_rect_h(params->dst_rect)); + return true; + } + + if (pl_str_equals0(var, "MAIN")) + var = pl_str0("MAINPRESUB"); + + for (int i = 0; i < p->pass_textures.num; i++) { + if (pl_str_equals(var, p->pass_textures.elem[i].name)) { + pl_tex tex = p->pass_textures.elem[i].tex; + size[0] = tex->params.w; + size[1] = tex->params.h; + return true; + } + } + + return false; +} + +static bool lookup_var(struct hook_ctx *ctx, pl_str var, float *val) +{ + struct hook_priv *p = ctx->priv; + for (int i = 0; i < p->hook_params.num; i++) { + 
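+ // Illustration: given a hypothetical parameter declared as
+ // "//!PARAM intensity", a header such as "//!WHEN intensity 0 >"
+ // parses to the RPN sequence { VAR "intensity", CONST 0.0, OP2 '>' };
+ // eval_shexpr() calls lookup_var() for the VAR element, and each
+ // iteration here matches one //!PARAM by name (or, for enums, by one
+ // of its value names).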
const struct pl_hook_par *hp = &p->hook_params.elem[i]; + if (pl_str_equals0(var, hp->name)) { + switch (hp->type) { + case PL_VAR_SINT: *val = hp->data->i; return true; + case PL_VAR_UINT: *val = hp->data->u; return true; + case PL_VAR_FLOAT: *val = hp->data->f; return true; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + break; + } + + pl_unreachable(); + } + + if (hp->names) { + for (int j = hp->minimum.i; j <= hp->maximum.i; j++) { + if (pl_str_equals0(var, hp->names[j])) { + *val = j; + return true; + } + } + } + } + + PL_WARN(p, "Variable '%.*s' not found in RPN expression!", PL_STR_FMT(var)); + return false; +} + +// Returns whether successful. 'result' is left untouched on failure +static bool eval_shexpr(struct hook_ctx *ctx, + const struct shexp expr[MAX_SHEXP_SIZE], + float *result) +{ + struct hook_priv *p = ctx->priv; + float stack[MAX_SHEXP_SIZE] = {0}; + int idx = 0; // points to next element to push + + for (int i = 0; i < MAX_SHEXP_SIZE; i++) { + switch (expr[i].tag) { + case SHEXP_END: + goto done; + + case SHEXP_CONST: + // Since our SHEXPs are bound by MAX_SHEXP_SIZE, it should be + // impossible to overflow the stack + assert(idx < MAX_SHEXP_SIZE); + stack[idx++] = expr[i].val.cval; + continue; + + case SHEXP_OP1: + if (idx < 1) { + PL_WARN(p, "Stack underflow in RPN expression!"); + return false; + } + + switch (expr[i].val.op) { + case SHEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break; + default: pl_unreachable(); + } + continue; + + case SHEXP_OP2: + if (idx < 2) { + PL_WARN(p, "Stack underflow in RPN expression!"); + return false; + } + + // Pop the operands in reverse order + float op2 = stack[--idx]; + float op1 = stack[--idx]; + float res = 0.0; + switch (expr[i].val.op) { + case SHEXP_OP_ADD: res = op1 + op2; break; + case SHEXP_OP_SUB: res = op1 - op2; break; + case SHEXP_OP_MUL: res = op1 * op2; break; + case SHEXP_OP_DIV: res = op1 / op2; break; + case SHEXP_OP_MOD: res = fmodf(op1, op2); break; + case SHEXP_OP_GT: res = op1 > op2; break; + case SHEXP_OP_LT: res = op1 < op2; break; + case SHEXP_OP_EQ: res = fabsf(op1 - op2) <= 1e-6 * fmaxf(op1, op2); break; + case SHEXP_OP_NOT: pl_unreachable(); + } + + if (!isfinite(res)) { + PL_WARN(p, "Illegal operation in RPN expression!"); + return false; + } + + stack[idx++] = res; + continue; + + case SHEXP_TEX_W: + case SHEXP_TEX_H: { + pl_str name = expr[i].val.varname; + float size[2]; + + if (!lookup_tex(ctx, name, size)) { + PL_WARN(p, "Variable '%.*s' not found in RPN expression!", + PL_STR_FMT(name)); + return false; + } + + stack[idx++] = (expr[i].tag == SHEXP_TEX_W) ? 
size[0] : size[1]; + continue; + } + + case SHEXP_VAR: { + pl_str name = expr[i].val.varname; + float val; + if (!lookup_var(ctx, name, &val)) + return false; + stack[idx++] = val; + continue; + } + } + } + +done: + // Return the single stack element + if (idx != 1) { + PL_WARN(p, "Malformed stack after RPN expression!"); + return false; + } + + *result = stack[0]; + return true; +} + +static double prng_step(uint64_t s[4]) +{ + const uint64_t result = s[0] + s[3]; + const uint64_t t = s[1] << 17; + + s[2] ^= s[0]; + s[3] ^= s[1]; + s[1] ^= s[2]; + s[0] ^= s[3]; + + s[2] ^= t; + s[3] = (s[3] << 45) | (s[3] >> (64 - 45)); + return (result >> 11) * 0x1.0p-53; +} + +static bool bind_pass_tex(pl_shader sh, pl_str name, + const struct pass_tex *ptex, + const pl_rect2df *rect, + bool hooked, bool mainpresub) +{ + ident_t id, pos, pt; + + // Compatibility with mpv texture binding semantics + id = sh_bind(sh, ptex->tex, PL_TEX_ADDRESS_CLAMP, PL_TEX_SAMPLE_LINEAR, + "hook_tex", rect, &pos, &pt); + if (!id) + return false; + + GLSLH("#define %.*s_raw "$" \n", PL_STR_FMT(name), id); + GLSLH("#define %.*s_pos "$" \n", PL_STR_FMT(name), pos); + GLSLH("#define %.*s_map "$"_map \n", PL_STR_FMT(name), pos); + GLSLH("#define %.*s_size vec2(textureSize("$", 0)) \n", PL_STR_FMT(name), id); + GLSLH("#define %.*s_pt "$" \n", PL_STR_FMT(name), pt); + + float off[2] = { ptex->rect.x0, ptex->rect.y0 }; + GLSLH("#define %.*s_off "$" \n", PL_STR_FMT(name), + sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("offset"), + .data = off, + })); + + struct pl_color_repr repr = ptex->repr; + ident_t scale = SH_FLOAT(pl_color_repr_normalize(&repr)); + GLSLH("#define %.*s_mul "$" \n", PL_STR_FMT(name), scale); + + // Compatibility with mpv + GLSLH("#define %.*s_rot mat2(1.0, 0.0, 0.0, 1.0) \n", PL_STR_FMT(name)); + + // Sampling function boilerplate + GLSLH("#define %.*s_tex(pos) ("$" * vec4(textureLod("$", pos, 0.0))) \n", + PL_STR_FMT(name), scale, id); + GLSLH("#define %.*s_texOff(off) (%.*s_tex("$" + "$" * vec2(off))) \n", + PL_STR_FMT(name), PL_STR_FMT(name), pos, pt); + + bool can_gather = ptex->tex->params.format->gatherable; + if (can_gather) { + GLSLH("#define %.*s_gather(pos, c) ("$" * vec4(textureGather("$", pos, c))) \n", + PL_STR_FMT(name), scale, id); + } + + if (hooked) { + GLSLH("#define HOOKED_raw %.*s_raw \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_pos %.*s_pos \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_size %.*s_size \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_rot %.*s_rot \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_off %.*s_off \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_pt %.*s_pt \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_map %.*s_map \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_mul %.*s_mul \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_tex %.*s_tex \n", PL_STR_FMT(name)); + GLSLH("#define HOOKED_texOff %.*s_texOff \n", PL_STR_FMT(name)); + if (can_gather) + GLSLH("#define HOOKED_gather %.*s_gather \n", PL_STR_FMT(name)); + } + + if (mainpresub) { + GLSLH("#define MAIN_raw MAINPRESUB_raw \n"); + GLSLH("#define MAIN_pos MAINPRESUB_pos \n"); + GLSLH("#define MAIN_size MAINPRESUB_size \n"); + GLSLH("#define MAIN_rot MAINPRESUB_rot \n"); + GLSLH("#define MAIN_off MAINPRESUB_off \n"); + GLSLH("#define MAIN_pt MAINPRESUB_pt \n"); + GLSLH("#define MAIN_map MAINPRESUB_map \n"); + GLSLH("#define MAIN_mul MAINPRESUB_mul \n"); + GLSLH("#define MAIN_tex MAINPRESUB_tex \n"); + GLSLH("#define MAIN_texOff MAINPRESUB_texOff \n"); + if (can_gather) + GLSLH("#define MAIN_gather 
MAINPRESUB_gather \n"); + } + + return true; +} + +static void save_pass_tex(struct hook_priv *p, struct pass_tex ptex) +{ + + for (int i = 0; i < p->pass_textures.num; i++) { + if (!pl_str_equals(p->pass_textures.elem[i].name, ptex.name)) + continue; + + p->pass_textures.elem[i] = ptex; + return; + } + + // No texture with this name yet, append new one + PL_ARRAY_APPEND(p->alloc, p->pass_textures, ptex); +} + +static struct pl_hook_res hook_hook(void *priv, const struct pl_hook_params *params) +{ + struct hook_priv *p = priv; + pl_str stage = pl_stage_to_mp(params->stage); + struct pl_hook_res res = {0}; + + pl_shader sh = NULL; + struct hook_ctx ctx = { + .priv = p, + .params = params, + .hooked = { + .name = stage, + .tex = params->tex, + .rect = params->rect, + .repr = params->repr, + .color = params->color, + .comps = params->components, + }, + }; + + // Save the input texture if needed + if (p->save_stages & params->stage) { + PL_TRACE(p, "Saving input texture '%.*s' for binding", + PL_STR_FMT(ctx.hooked.name)); + save_pass_tex(p, ctx.hooked); + } + + for (int n = 0; n < p->hook_passes.num; n++) { + const struct hook_pass *pass = &p->hook_passes.elem[n]; + if (!(pass->exec_stages & params->stage)) + continue; + + const struct custom_shader_hook *hook = &pass->hook; + PL_TRACE(p, "Executing hook pass %d on stage '%.*s': %.*s", + n, PL_STR_FMT(stage), PL_STR_FMT(hook->pass_desc)); + + // Test for execution condition + float run = 0; + if (!eval_shexpr(&ctx, hook->cond, &run)) + goto error; + + if (!run) { + PL_TRACE(p, "Skipping hook due to condition"); + continue; + } + + // Generate a new shader object + sh = pl_dispatch_begin(params->dispatch); + + // Bind all necessary input textures + for (int i = 0; i < PL_ARRAY_SIZE(hook->bind_tex); i++) { + pl_str texname = hook->bind_tex[i]; + if (!texname.len) + break; + + // Convenience alias, to allow writing shaders that are oblivious + // of the exact stage they hooked. This simply translates to + // whatever stage actually fired the hook. + bool hooked = false, mainpresub = false; + if (pl_str_equals0(texname, "HOOKED")) { + // Continue with binding this, under the new name + texname = stage; + hooked = true; + } + + // Compatibility alias, because MAIN and MAINPRESUB mean the same + // thing to libplacebo, but user shaders are still written as + // though they can be different concepts. + if (pl_str_equals0(texname, "MAIN") || + pl_str_equals0(texname, "MAINPRESUB")) + { + texname = pl_str0("MAINPRESUB"); + mainpresub = true; + } + + for (int j = 0; j < p->descriptors.num; j++) { + if (pl_str_equals0(texname, p->descriptors.elem[j].desc.name)) { + // Directly bind this, no need to bother with all the + // `bind_pass_tex` boilerplate + ident_t id = sh_desc(sh, p->descriptors.elem[j]); + GLSLH("#define %.*s "$" \n", PL_STR_FMT(texname), id); + + if (p->descriptors.elem[j].desc.type == PL_DESC_SAMPLED_TEX) { + GLSLH("#define %.*s_tex(pos) (textureLod("$", pos, 0.0)) \n", + PL_STR_FMT(texname), id); + } + goto next_bind; + } + } + + for (int j = 0; j < p->pass_textures.num; j++) { + if (pl_str_equals(texname, p->pass_textures.elem[j].name)) { + // Note: We bind the whole texture, rather than + // hooked.rect, because user shaders in general are not + // designed to handle cropped input textures. 
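+ // (For reference: bind_pass_tex() below then emits the NAME_tex,
+ // NAME_pos, ... macros, so e.g. HOOKED_tex(pos) is equivalent to
+ // HOOKED_mul * textureLod(<bound sampler>, pos, 0.0), and the MAIN_*
+ // aliases simply forward to MAINPRESUB_*.)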
+ const struct pass_tex *ptex = &p->pass_textures.elem[j]; + pl_rect2df rect = { + 0, 0, ptex->tex->params.w, ptex->tex->params.h, + }; + + if (hook->offset_align && pl_str_equals(texname, stage)) { + float sx = pl_rect_w(ctx.hooked.rect) / pl_rect_w(params->src_rect), + sy = pl_rect_h(ctx.hooked.rect) / pl_rect_h(params->src_rect), + ox = ctx.hooked.rect.x0 - sx * params->src_rect.x0, + oy = ctx.hooked.rect.y0 - sy * params->src_rect.y0; + + PL_TRACE(p, "Aligning plane with ref: %f %f", ox, oy); + pl_rect2df_offset(&rect, ox, oy); + } + + if (!bind_pass_tex(sh, texname, &p->pass_textures.elem[j], + &rect, hooked, mainpresub)) + { + goto error; + } + goto next_bind; + } + } + + // If none of the above matched, this is an unknown texture name, + // so silently ignore this pass to match the mpv behavior + PL_TRACE(p, "Skipping hook due to no texture named '%.*s'.", + PL_STR_FMT(texname)); + pl_dispatch_abort(params->dispatch, &sh); + goto next_pass; + + next_bind: ; // outer 'continue' + } + + // Set up the input variables + p->frame_count++; + GLSLH("#define frame "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_int("frame"), + .data = &p->frame_count, + .dynamic = true, + })); + + float random = prng_step(p->prng_state); + GLSLH("#define random "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("random"), + .data = &random, + .dynamic = true, + })); + + float src_size[2] = { pl_rect_w(params->src_rect), pl_rect_h(params->src_rect) }; + GLSLH("#define input_size "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("input_size"), + .data = src_size, + })); + + float dst_size[2] = { pl_rect_w(params->dst_rect), pl_rect_h(params->dst_rect) }; + GLSLH("#define target_size "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("target_size"), + .data = dst_size, + })); + + float tex_off[2] = { params->src_rect.x0, params->src_rect.y0 }; + GLSLH("#define tex_offset "$" \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tex_offset"), + .data = tex_off, + })); + + // Custom parameters + for (int i = 0; i < p->hook_params.num; i++) { + const struct pl_hook_par *hp = &p->hook_params.elem[i]; + switch (hp->mode) { + case PL_HOOK_PAR_VARIABLE: + case PL_HOOK_PAR_DYNAMIC: + GLSLH("#define %s "$" \n", hp->name, + sh_var(sh, (struct pl_shader_var) { + .var = { + .name = hp->name, + .type = hp->type, + .dim_v = 1, + .dim_m = 1, + .dim_a = 1, + }, + .data = hp->data, + .dynamic = hp->mode == PL_HOOK_PAR_DYNAMIC, + })); + break; + + case PL_HOOK_PAR_CONSTANT: + GLSLH("#define %s "$" \n", hp->name, + sh_const(sh, (struct pl_shader_const) { + .name = hp->name, + .type = hp->type, + .data = hp->data, + .compile_time = true, + })); + break; + + case PL_HOOK_PAR_DEFINE: + GLSLH("#define %s %d \n", hp->name, hp->data->i); + break; + + case PL_HOOK_PAR_MODE_COUNT: + pl_unreachable(); + } + + if (hp->names) { + for (int j = hp->minimum.i; j <= hp->maximum.i; j++) + GLSLH("#define %s %d \n", hp->names[j], j); + } + } + + // Helper sub-shaders + uint64_t sh_id = SH_PARAMS(sh).id; + pl_shader_reset(p->trc_helper, pl_shader_params( + .id = ++sh_id, + .gpu = p->gpu, + )); + pl_shader_linearize(p->trc_helper, params->orig_color); + GLSLH("#define linearize "$" \n", sh_subpass(sh, p->trc_helper)); + + pl_shader_reset(p->trc_helper, pl_shader_params( + .id = ++sh_id, + .gpu = p->gpu, + )); + pl_shader_delinearize(p->trc_helper, params->orig_color); + GLSLH("#define delinearize "$" \n", sh_subpass(sh, p->trc_helper)); + + // Load and run the user shader itself + 
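+ // Illustration, a minimal hypothetical user pass:
+ //
+ //   //!HOOK LUMA
+ //   //!BIND HOOKED
+ //   //!DESC invert (example)
+ //   vec4 hook() { return vec4(1.0) - HOOKED_tex(HOOKED_pos); }
+ //
+ // Only the part after the //! headers (the hook() definition) is what
+ // gets appended here as hook->pass_body; besides the bound textures it
+ // can use the helpers set up above (`frame`, `random`, `input_size`,
+ // `target_size`, `tex_offset`, //!PARAM names, and
+ // `linearize()`/`delinearize()` for the frame's original transfer).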
sh_append_str(sh, SH_BUF_HEADER, hook->pass_body); + sh_describef(sh, "%.*s", PL_STR_FMT(hook->pass_desc)); + + // Resolve output size and create framebuffer + float out_size[2] = {0}; + if (!eval_shexpr(&ctx, hook->width, &out_size[0]) || + !eval_shexpr(&ctx, hook->height, &out_size[1])) + { + goto error; + } + + int out_w = roundf(out_size[0]), + out_h = roundf(out_size[1]); + + if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h)) + goto error; + + // Generate a new texture to store the render result + pl_tex fbo; + fbo = params->get_tex(params->priv, out_w, out_h); + if (!fbo) { + PL_ERR(p, "Failed dispatching hook: `get_tex` callback failed?"); + goto error; + } + + bool ok; + if (hook->is_compute) { + + if (!sh_try_compute(sh, hook->threads_w, hook->threads_h, false, 0) || + !fbo->params.storable) + { + PL_ERR(p, "Failed dispatching COMPUTE shader"); + goto error; + } + + GLSLP("#define out_image "$" \n", sh_desc(sh, (struct pl_shader_desc) { + .binding.object = fbo, + .desc = { + .name = "out_image", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + })); + + sh->output = PL_SHADER_SIG_NONE; + + GLSL("hook(); \n"); + ok = pl_dispatch_compute(params->dispatch, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = { + // Round up as many blocks as are needed to cover the image + PL_DIV_UP(out_w, hook->block_w), + PL_DIV_UP(out_h, hook->block_h), + 1, + }, + .width = out_w, + .height = out_h, + )); + + } else { + + // Default non-COMPUTE shaders to explicitly use fragment shaders + // only, to avoid breaking things like fwidth() + sh->type = PL_DEF(sh->type, SH_FRAGMENT); + + GLSL("vec4 color = hook(); \n"); + ok = pl_dispatch_finish(params->dispatch, pl_dispatch_params( + .shader = &sh, + .target = fbo, + )); + + } + + if (!ok) + goto error; + + float sx = (float) out_w / ctx.hooked.tex->params.w, + sy = (float) out_h / ctx.hooked.tex->params.h, + x0 = sx * ctx.hooked.rect.x0 + hook->offset[0], + y0 = sy * ctx.hooked.rect.y0 + hook->offset[1]; + + pl_rect2df new_rect = { + x0, + y0, + x0 + sx * pl_rect_w(ctx.hooked.rect), + y0 + sy * pl_rect_h(ctx.hooked.rect), + }; + + if (hook->offset_align) { + float rx = pl_rect_w(new_rect) / pl_rect_w(params->src_rect), + ry = pl_rect_h(new_rect) / pl_rect_h(params->src_rect), + ox = rx * params->src_rect.x0 - sx * ctx.hooked.rect.x0, + oy = ry * params->src_rect.y0 - sy * ctx.hooked.rect.y0; + + pl_rect2df_offset(&new_rect, ox, oy); + } + + // Save the result of this shader invocation + struct pass_tex ptex = { + .name = hook->save_tex.len ? 
hook->save_tex : stage, + .tex = fbo, + .repr = ctx.hooked.repr, + .color = ctx.hooked.color, + .comps = PL_DEF(hook->comps, ctx.hooked.comps), + .rect = new_rect, + }; + + // It's assumed that users will correctly normalize the input + pl_color_repr_normalize(&ptex.repr); + + PL_TRACE(p, "Saving output texture '%.*s' from hook execution on '%.*s'", + PL_STR_FMT(ptex.name), PL_STR_FMT(stage)); + + save_pass_tex(p, ptex); + + // Update the result object, unless we saved to a different name + if (pl_str_equals(ptex.name, stage)) { + ctx.hooked = ptex; + res = (struct pl_hook_res) { + .output = PL_HOOK_SIG_TEX, + .tex = fbo, + .repr = ptex.repr, + .color = ptex.color, + .components = ptex.comps, + .rect = new_rect, + }; + } + +next_pass: ; + } + + return res; + +error: + pl_dispatch_abort(params->dispatch, &sh); + return (struct pl_hook_res) { .failed = true }; +} + +const struct pl_hook *pl_mpv_user_shader_parse(pl_gpu gpu, + const char *shader_text, + size_t shader_len) +{ + if (!shader_len) + return NULL; + + pl_str shader = { (uint8_t *) shader_text, shader_len }; + + struct pl_hook *hook = pl_zalloc_obj(NULL, hook, struct hook_priv); + struct hook_priv *p = PL_PRIV(hook); + + *hook = (struct pl_hook) { + .input = PL_HOOK_SIG_TEX, + .priv = p, + .reset = hook_reset, + .hook = hook_hook, + .signature = pl_str_hash(shader), + }; + + *p = (struct hook_priv) { + .log = gpu->log, + .gpu = gpu, + .alloc = hook, + .trc_helper = pl_shader_alloc(gpu->log, NULL), + .prng_state = { + // Determined by fair die roll + 0xb76d71f9443c228allu, 0x93a02092fc4807e8llu, + 0x06d81748f838bd07llu, 0x9381ee129dddce6cllu, + }, + }; + + shader = pl_strdup(hook, shader); + + // Skip all garbage (e.g. comments) before the first header + int pos = pl_str_find(shader, pl_str0("//!")); + if (pos < 0) { + PL_ERR(gpu, "Shader appears to contain no headers?"); + goto error; + } + shader = pl_str_drop(shader, pos); + + // Loop over the file + while (shader.len > 0) + { + // Peek at the first header to dispatch the right type + if (pl_str_startswith0(shader, "//!TEXTURE")) { + struct pl_shader_desc sd; + if (!parse_tex(gpu, hook, &shader, &sd)) + goto error; + + PL_INFO(gpu, "Registering named texture '%s'", sd.desc.name); + PL_ARRAY_APPEND(hook, p->descriptors, sd); + continue; + } + + if (pl_str_startswith0(shader, "//!BUFFER")) { + struct pl_shader_desc sd; + if (!parse_buf(gpu, hook, &shader, &sd)) + goto error; + + PL_INFO(gpu, "Registering named buffer '%s'", sd.desc.name); + PL_ARRAY_APPEND(hook, p->descriptors, sd); + continue; + } + + if (pl_str_startswith0(shader, "//!PARAM")) { + struct pl_hook_par hp; + if (!parse_param(gpu->log, hook, &shader, &hp)) + goto error; + + PL_INFO(gpu, "Registering named parameter '%s'", hp.name); + PL_ARRAY_APPEND(hook, p->hook_params, hp); + continue; + } + + struct custom_shader_hook h; + if (!parse_hook(gpu->log, &shader, &h)) + goto error; + + struct hook_pass pass = { + .exec_stages = 0, + .hook = h, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(h.hook_tex); i++) + pass.exec_stages |= mp_stage_to_pl(h.hook_tex[i]); + for (int i = 0; i < PL_ARRAY_SIZE(h.bind_tex); i++) { + p->save_stages |= mp_stage_to_pl(h.bind_tex[i]); + if (pl_str_equals0(h.bind_tex[i], "HOOKED")) + p->save_stages |= pass.exec_stages; + } + + // As an extra precaution, this avoids errors when trying to run + // conditions against planes that were never hooked. As a sole + // exception, OUTPUT is special because it's hard-coded to return the + // dst_rect even before it was hooked. 
(This is an apparently + // undocumented mpv quirk, but shaders rely on it in practice) + enum pl_hook_stage rpn_stages = 0; + for (int i = 0; i < PL_ARRAY_SIZE(h.width); i++) { + if (h.width[i].tag == SHEXP_TEX_W || h.width[i].tag == SHEXP_TEX_H) + rpn_stages |= mp_stage_to_pl(h.width[i].val.varname); + } + for (int i = 0; i < PL_ARRAY_SIZE(h.height); i++) { + if (h.height[i].tag == SHEXP_TEX_W || h.height[i].tag == SHEXP_TEX_H) + rpn_stages |= mp_stage_to_pl(h.height[i].val.varname); + } + for (int i = 0; i < PL_ARRAY_SIZE(h.cond); i++) { + if (h.cond[i].tag == SHEXP_TEX_W || h.cond[i].tag == SHEXP_TEX_H) + rpn_stages |= mp_stage_to_pl(h.cond[i].val.varname); + } + + p->save_stages |= rpn_stages & ~PL_HOOK_OUTPUT; + + PL_INFO(gpu, "Registering hook pass: %.*s", PL_STR_FMT(h.pass_desc)); + PL_ARRAY_APPEND(hook, p->hook_passes, pass); + } + + // We need to hook on both the exec and save stages, so that we can keep + // track of any textures we might need + hook->stages |= p->save_stages; + for (int i = 0; i < p->hook_passes.num; i++) + hook->stages |= p->hook_passes.elem[i].exec_stages; + + hook->parameters = p->hook_params.elem; + hook->num_parameters = p->hook_params.num; + + PL_MSG(gpu, PL_LOG_DEBUG, "Loaded user shader:"); + pl_msg_source(gpu->log, PL_LOG_DEBUG, shader_text); + + return hook; + +error: + pl_mpv_user_shader_destroy((const struct pl_hook **) &hook); + PL_MSG(gpu, PL_LOG_ERR, "Failed to parse user shader:"); + pl_msg_source(gpu->log, PL_LOG_ERR, shader_text); + pl_log_stack_trace(gpu->log, PL_LOG_ERR); + return NULL; +} + +void pl_mpv_user_shader_destroy(const struct pl_hook **hookp) +{ + const struct pl_hook *hook = *hookp; + if (!hook) + return; + + struct hook_priv *p = PL_PRIV(hook); + for (int i = 0; i < p->descriptors.num; i++) { + switch (p->descriptors.elem[i].desc.type) { + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = p->descriptors.elem[i].binding.object; + pl_buf_destroy(p->gpu, &buf); + break; + } + + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: { + pl_tex tex = p->descriptors.elem[i].binding.object; + pl_tex_destroy(p->gpu, &tex); + break; + + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + pl_unreachable(); + } + } + } + + pl_shader_free(&p->trc_helper); + pl_free((void *) hook); + *hookp = NULL; +} diff --git a/src/shaders/deinterlacing.c b/src/shaders/deinterlacing.c new file mode 100644 index 0000000..5c85138 --- /dev/null +++ b/src/shaders/deinterlacing.c @@ -0,0 +1,260 @@ +/* + * This file is part of libplacebo, but also based on vf_yadif_cuda.cu: + * Copyright (C) 2018 Philip Langdale <philipl@overt.org> + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "shaders.h" + +#include <libplacebo/shaders/deinterlacing.h> + +const struct pl_deinterlace_params pl_deinterlace_default_params = { PL_DEINTERLACE_DEFAULTS }; + +void pl_shader_deinterlace(pl_shader sh, const struct pl_deinterlace_source *src, + const struct pl_deinterlace_params *params) +{ + params = PL_DEF(params, &pl_deinterlace_default_params); + + const struct pl_tex_params *texparams = &src->cur.top->params; + if (!sh_require(sh, PL_SHADER_SIG_NONE, texparams->w, texparams->h)) + return; + + sh_describe(sh, "deinterlacing"); + GLSL("vec4 color = vec4(0,0,0,1); \n" + "// pl_shader_deinterlace \n" + "{ \n"); + + uint8_t comp_mask = PL_DEF(src->component_mask, 0xFu); + comp_mask &= (1u << texparams->format->num_components) - 1u; + if (!comp_mask) { + SH_FAIL(sh, "pl_shader_deinterlace: empty component mask?"); + return; + } + + const uint8_t num_comps = sh_num_comps(comp_mask); + const char *swiz = sh_swizzle(comp_mask); + GLSL("#define T %s \n", sh_float_type(comp_mask)); + + ident_t pos, pt; + ident_t cur = sh_bind(sh, src->cur.top, PL_TEX_ADDRESS_MIRROR, + PL_TEX_SAMPLE_NEAREST, "cur", NULL, &pos, &pt); + if (!cur) + return; + + GLSL("#define GET(TEX, X, Y) \\\n" + " (textureLod(TEX, pos + pt * vec2(X, Y), 0.0).%s) \n" + "vec2 pos = "$"; \n" + "vec2 pt = "$"; \n" + "T res; \n", + swiz, pos, pt); + + if (src->field == PL_FIELD_NONE) { + GLSL("res = GET("$", 0, 0); \n", cur); + goto done; + } + + // Don't modify the primary field + GLSL("int yh = textureSize("$", 0).y; \n" + "int yo = int("$".y * float(yh)); \n" + "if (yo %% 2 == %d) { \n" + " res = GET("$", 0, 0); \n" + "} else { \n", + cur, pos, + src->field == PL_FIELD_TOP ? 0 : 1, + cur); + + switch (params->algo) { + case PL_DEINTERLACE_WEAVE: + GLSL("res = GET("$", 0, 0); \n", cur); + break; + + case PL_DEINTERLACE_BOB: + GLSL("res = GET("$", 0, %d); \n", cur, + src->field == PL_FIELD_TOP ? -1 : 1); + break; + + + case PL_DEINTERLACE_YADIF: { + // Try using a compute shader for this, for the sole reason of + // optimizing for thread group synchronicity. Otherwise, because we + // alternate between lines output as-is and lines output deinterlaced, + // half of our thread group will be mostly idle at any point in time. + const int bw = PL_DEF(sh_glsl(sh).subgroup_size, 32); + sh_try_compute(sh, bw, 1, true, 0); + + // This magic constant is hard-coded in the original implementation as + // '1' on an 8-bit scale. Since we work with arbitrary bit depth + // floating point textures, we have to convert this somehow. Hard-code + // it as 1/255 under the assumption that the original intent was to be + // roughly 1 unit of brightness increment on an 8-bit source. This may + // or may not produce suboptimal results on higher-bit-depth content. 
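+ // (For a sense of scale: 1/255 is about 0.0039 of full range, while one
+ // code value of a 10-bit source is 1/1023, about 0.00098, so on deeper
+ // content this bias corresponds to roughly four source code values.)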
+ static const float spatial_bias = 1 / 255.0f; + + // Calculate spatial prediction + ident_t spatial_pred = sh_fresh(sh, "spatial_predictor"); + GLSLH("float "$"(float a, float b, float c, float d, float e, float f, float g, \n" + " float h, float i, float j, float k, float l, float m, float n) \n" + "{ \n" + " float spatial_pred = (d + k) / 2.0; \n" + " float spatial_score = abs(c - j) + abs(d - k) + abs(e - l) - %f; \n" + + " float score = abs(b - k) + abs(c - l) + abs(d - m); \n" + " if (score < spatial_score) { \n" + " spatial_pred = (c + l) / 2.0; \n" + " spatial_score = score; \n" + " score = abs(a - l) + abs(b - m) + abs(c - n); \n" + " if (score < spatial_score) { \n" + " spatial_pred = (b + m) / 2.0; \n" + " spatial_score = score; \n" + " } \n" + " } \n" + " score = abs(d - i) + abs(e - j) + abs(f - k); \n" + " if (score < spatial_score) { \n" + " spatial_pred = (e + j) / 2.0; \n" + " spatial_score = score; \n" + " score = abs(e - h) + abs(f - i) + abs(g - j); \n" + " if (score < spatial_score) { \n" + " spatial_pred = (f + i) / 2.0; \n" + " spatial_score = score; \n" + " } \n" + " } \n" + " return spatial_pred; \n" + "} \n", + spatial_pred, spatial_bias); + + GLSL("T a = GET("$", -3, -1); \n" + "T b = GET("$", -2, -1); \n" + "T c = GET("$", -1, -1); \n" + "T d = GET("$", 0, -1); \n" + "T e = GET("$", +1, -1); \n" + "T f = GET("$", +2, -1); \n" + "T g = GET("$", +3, -1); \n" + "T h = GET("$", -3, +1); \n" + "T i = GET("$", -2, +1); \n" + "T j = GET("$", -1, +1); \n" + "T k = GET("$", 0, +1); \n" + "T l = GET("$", +1, +1); \n" + "T m = GET("$", +2, +1); \n" + "T n = GET("$", +3, +1); \n", + cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur, cur); + + if (num_comps == 1) { + GLSL("res = "$"(a, b, c, d, e, f, g, h, i, j, k, l, m, n); \n", spatial_pred); + } else { + for (uint8_t i = 0; i < num_comps; i++) { + char c = "xyzw"[i]; + GLSL("res.%c = "$"(a.%c, b.%c, c.%c, d.%c, e.%c, f.%c, g.%c, \n" + " h.%c, i.%c, j.%c, k.%c, l.%c, m.%c, n.%c); \n", + c, spatial_pred, c, c, c, c, c, c, c, c, c, c, c, c, c, c); + } + } + + // Calculate temporal prediction + ident_t temporal_pred = sh_fresh(sh, "temporal_predictor"); + GLSLH("float "$"(float A, float B, float C, float D, float E, float F, \n" + " float G, float H, float I, float J, float K, float L, \n" + " float spatial_pred) \n" + "{ \n" + " float p0 = (C + H) / 2.0; \n" + " float p1 = F; \n" + " float p2 = (D + I) / 2.0; \n" + " float p3 = G; \n" + " float p4 = (E + J) / 2.0; \n" + + " float tdiff0 = abs(D - I) / 2.0; \n" + " float tdiff1 = (abs(A - F) + abs(B - G)) / 2.0; \n" + " float tdiff2 = (abs(K - F) + abs(G - L)) / 2.0; \n" + " float diff = max(tdiff0, max(tdiff1, tdiff2)); \n", + temporal_pred); + if (!params->skip_spatial_check) { + GLSLH("float maxi = max(p2 - min(p3, p1), min(p0 - p1, p4 - p3)); \n" + "float mini = min(p2 - max(p3, p1), max(p0 - p1, p4 - p3)); \n" + "diff = max(diff, max(mini, -maxi)); \n"); + } + GLSLH(" if (spatial_pred > p2 + diff) \n" + " spatial_pred = p2 + diff; \n" + " if (spatial_pred < p2 - diff) \n" + " spatial_pred = p2 - diff; \n" + " return spatial_pred; \n" + "} \n"); + + ident_t prev2 = cur, next2 = cur; + if (src->prev.top && src->prev.top != src->cur.top) { + pl_assert(src->prev.top->params.w == texparams->w); + pl_assert(src->prev.top->params.h == texparams->h); + prev2 = sh_bind(sh, src->prev.top, PL_TEX_ADDRESS_MIRROR, + PL_TEX_SAMPLE_NEAREST, "prev", NULL, NULL, NULL); + if (!prev2) + return; + } + + if (src->next.top && src->next.top != src->cur.top) { + 
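+ // (Same as for the previous frame above: when no distinct next frame is
+ // supplied, next2 stays aliased to the current frame, so the temporal
+ // prediction still works at the very start or end of a stream.)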
pl_assert(src->next.top->params.w == texparams->w); + pl_assert(src->next.top->params.h == texparams->h); + next2 = sh_bind(sh, src->next.top, PL_TEX_ADDRESS_MIRROR, + PL_TEX_SAMPLE_NEAREST, "next", NULL, NULL, NULL); + if (!next2) + return; + } + + enum pl_field first_field = PL_DEF(src->first_field, PL_FIELD_TOP); + ident_t prev1 = src->field == first_field ? prev2 : cur; + ident_t next1 = src->field == first_field ? cur : next2; + + GLSL("T A = GET("$", 0, -1); \n" + "T B = GET("$", 0, 1); \n" + "T C = GET("$", 0, -2); \n" + "T D = GET("$", 0, 0); \n" + "T E = GET("$", 0, +2); \n" + "T F = GET("$", 0, -1); \n" + "T G = GET("$", 0, +1); \n" + "T H = GET("$", 0, -2); \n" + "T I = GET("$", 0, 0); \n" + "T J = GET("$", 0, +2); \n" + "T K = GET("$", 0, -1); \n" + "T L = GET("$", 0, +1); \n", + prev2, prev2, + prev1, prev1, prev1, + cur, cur, + next1, next1, next1, + next2, next2); + + if (num_comps == 1) { + GLSL("res = "$"(A, B, C, D, E, F, G, H, I, J, K, L, res); \n", temporal_pred); + } else { + for (uint8_t i = 0; i < num_comps; i++) { + char c = "xyzw"[i]; + GLSL("res.%c = "$"(A.%c, B.%c, C.%c, D.%c, E.%c, F.%c, \n" + " G.%c, H.%c, I.%c, J.%c, K.%c, L.%c, \n" + " res.%c); \n", + c, temporal_pred, c, c, c, c, c, c, c, c, c, c, c, c, c); + } + } + break; + } + + case PL_DEINTERLACE_ALGORITHM_COUNT: + pl_unreachable(); + } + + GLSL("}\n"); // End of primary/secondary field branch + +done: + GLSL("color.%s = res; \n" + "#undef T \n" + "#undef GET \n" + "} \n", + swiz); +} diff --git a/src/shaders/dithering.c b/src/shaders/dithering.c new file mode 100644 index 0000000..4485d11 --- /dev/null +++ b/src/shaders/dithering.c @@ -0,0 +1,527 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> +#include "shaders.h" + +#include <libplacebo/shaders/dithering.h> + +const struct pl_dither_params pl_dither_default_params = { PL_DITHER_DEFAULTS }; + +struct sh_dither_obj { + pl_shader_obj lut; +}; + +static void sh_dither_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_dither_obj *obj = ptr; + pl_shader_obj_destroy(&obj->lut); + *obj = (struct sh_dither_obj) {0}; +} + +static void fill_dither_matrix(void *data, const struct sh_lut_params *params) +{ + pl_assert(params->width > 0 && params->height > 0 && params->comps == 1); + + const struct pl_dither_params *dpar = params->priv; + switch (dpar->method) { + case PL_DITHER_ORDERED_LUT: + pl_assert(params->width == params->height); + pl_generate_bayer_matrix(data, params->width); + return; + + case PL_DITHER_BLUE_NOISE: + pl_assert(params->width == params->height); + pl_generate_blue_noise(data, params->width); + return; + + case PL_DITHER_ORDERED_FIXED: + case PL_DITHER_WHITE_NOISE: + case PL_DITHER_METHOD_COUNT: + return; + } + + pl_unreachable(); +} + +static bool dither_method_is_lut(enum pl_dither_method method) +{ + switch (method) { + case PL_DITHER_BLUE_NOISE: + case PL_DITHER_ORDERED_LUT: + return true; + case PL_DITHER_ORDERED_FIXED: + case PL_DITHER_WHITE_NOISE: + return false; + case PL_DITHER_METHOD_COUNT: + break; + } + + pl_unreachable(); +} + +static inline float approx_gamma(enum pl_color_transfer trc) +{ + switch (trc) { + case PL_COLOR_TRC_UNKNOWN: return 1.0f; + case PL_COLOR_TRC_LINEAR: return 1.0f; + case PL_COLOR_TRC_PRO_PHOTO:return 1.8f; + case PL_COLOR_TRC_GAMMA18: return 1.8f; + case PL_COLOR_TRC_GAMMA20: return 2.0f; + case PL_COLOR_TRC_GAMMA24: return 2.4f; + case PL_COLOR_TRC_GAMMA26: return 2.6f; + case PL_COLOR_TRC_ST428: return 2.6f; + case PL_COLOR_TRC_GAMMA28: return 2.8f; + + case PL_COLOR_TRC_SRGB: + case PL_COLOR_TRC_BT_1886: + case PL_COLOR_TRC_GAMMA22: + return 2.2f; + + case PL_COLOR_TRC_PQ: + case PL_COLOR_TRC_HLG: + case PL_COLOR_TRC_V_LOG: + case PL_COLOR_TRC_S_LOG1: + case PL_COLOR_TRC_S_LOG2: + return 2.0f; // TODO: handle this better + + case PL_COLOR_TRC_COUNT: break; + } + + pl_unreachable(); +} + +void pl_shader_dither(pl_shader sh, int new_depth, + pl_shader_obj *dither_state, + const struct pl_dither_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + if (new_depth <= 0 || new_depth > 256) { + PL_WARN(sh, "Invalid dither depth: %d.. 
ignoring", new_depth); + return; + } + + sh_describef(sh, "dithering (%d bits)", new_depth); + GLSL("// pl_shader_dither \n" + "{ \n" + "float bias; \n"); + + params = PL_DEF(params, &pl_dither_default_params); + if (params->lut_size < 0 || params->lut_size > 8) { + SH_FAIL(sh, "Invalid `lut_size` specified: %d", params->lut_size); + return; + } + + enum pl_dither_method method = params->method; + ident_t lut = NULL_IDENT; + int lut_size = 0; + + if (dither_method_is_lut(method)) { + if (!dither_state) { + PL_WARN(sh, "LUT-based dither method specified but no dither state " + "object given, falling back to non-LUT based methods."); + goto fallback; + } + + struct sh_dither_obj *obj; + obj = SH_OBJ(sh, dither_state, PL_SHADER_OBJ_DITHER, + struct sh_dither_obj, sh_dither_uninit); + if (!obj) + goto fallback; + + bool cache = method == PL_DITHER_BLUE_NOISE; + lut_size = 1 << PL_DEF(params->lut_size, pl_dither_default_params.lut_size); + lut = sh_lut(sh, sh_lut_params( + .object = &obj->lut, + .var_type = PL_VAR_FLOAT, + .width = lut_size, + .height = lut_size, + .comps = 1, + .fill = fill_dither_matrix, + .signature = (CACHE_KEY_DITHER ^ method) * lut_size, + .cache = cache ? SH_CACHE(sh) : NULL, + .priv = (void *) params, + )); + if (!lut) + goto fallback; + } + + goto done; + +fallback: + method = PL_DITHER_ORDERED_FIXED; + // fall through + +done: ; + + int size = 0; + if (lut) { + size = lut_size; + } else if (method == PL_DITHER_ORDERED_FIXED) { + size = 16; // hard-coded size + } + + if (size) { + // Transform the screen position to the cyclic range [0,1) + GLSL("vec2 pos = fract(gl_FragCoord.xy * 1.0/"$"); \n", SH_FLOAT(size)); + + if (params->temporal) { + int phase = SH_PARAMS(sh).index % 8; + float r = phase * (M_PI / 2); // rotate + float m = phase < 4 ? 1 : -1; // mirror + float mat[2][2] = { + {cos(r), -sin(r) }, + {sin(r) * m, cos(r) * m}, + }; + + ident_t rot = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat2("dither_rot"), + .data = &mat[0][0], + .dynamic = true, + }); + GLSL("pos = fract("$" * pos + vec2(1.0));\n", rot); + } + } + + switch (method) { + case PL_DITHER_WHITE_NOISE: { + ident_t prng = sh_prng(sh, params->temporal, NULL); + GLSL("bias = "$".x;\n", prng); + break; + } + + case PL_DITHER_ORDERED_FIXED: + // Bitwise ordered dither using only 32-bit uints + GLSL("uvec2 xy = uvec2(pos * 16.0) %% 16u; \n" + // Bitwise merge (morton number) + "xy.x = xy.x ^ xy.y; \n" + "xy = (xy | xy << 2) & uvec2(0x33333333); \n" + "xy = (xy | xy << 1) & uvec2(0x55555555); \n" + // Bitwise inversion + "uint b = xy.x + (xy.y << 1); \n" + "b = (b * 0x0802u & 0x22110u) | \n" + " (b * 0x8020u & 0x88440u); \n" + "b = 0x10101u * b; \n" + "b = (b >> 16) & 0xFFu; \n" + // Generate bias value + "bias = float(b) * 1.0/256.0; \n"); + break; + + case PL_DITHER_BLUE_NOISE: + case PL_DITHER_ORDERED_LUT: + pl_assert(lut); + GLSL("bias = "$"(ivec2(pos * "$"));\n", lut, SH_FLOAT(lut_size)); + break; + + case PL_DITHER_METHOD_COUNT: + pl_unreachable(); + } + + // Scale factor for dither rounding + GLSL("const float scale = %llu.0; \n", (1LLU << new_depth) - 1); + + const float gamma = approx_gamma(params->transfer); + if (gamma != 1.0f && new_depth <= 4) { + GLSL("const float gamma = "$"; \n" + "vec4 color_lin = pow(color, vec4(gamma)); \n", + SH_FLOAT(gamma)); + + if (new_depth == 1) { + // Special case for bit depth 1 dithering, in this case we can just + // ignore the low/high rounding because we know we are always + // dithering between 0.0 and 1.0. 
+ GLSL("const vec4 low = vec4(0.0); \n" + "const vec4 high = vec4(1.0); \n" + "vec4 offset = color_lin; \n"); + } else { + // Linearize the low, high and current color values + GLSL("vec4 low = floor(color * scale) / scale; \n" + "vec4 high = ceil(color * scale) / scale; \n" + "vec4 low_lin = pow(low, vec4(gamma)); \n" + "vec4 high_lin = pow(high, vec4(gamma)); \n" + "vec4 range = high_lin - low_lin; \n" + "vec4 offset = (color_lin - low_lin) / \n" + " max(range, 1e-6); \n"); + } + + // Mix in the correct ratio corresponding to the offset and bias + GLSL("color = mix(low, high, greaterThan(offset, vec4(bias))); \n"); + } else { + // Approximate each gamma segment as a straight line, this simplifies + // the process of dithering down to a single scale and (biased) round. + GLSL("color = scale * color + vec4(bias); \n" + "color = floor(color) * (1.0 / scale); \n"); + } + + GLSL("} \n"); +} + +/* Error diffusion code is taken from mpv, original copyright (c) 2019 Bin Jin + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +// After a (y, x) -> (y, x + y * shift) mapping, find the right most column that +// will be affected by the current column. +static int compute_rightmost_shifted_column(const struct pl_error_diffusion_kernel *k) +{ + int ret = 0; + for (int y = 0; y <= PL_EDF_MAX_DY; y++) { + for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) { + if (k->pattern[y][x - PL_EDF_MIN_DX] != 0) { + int shifted_x = x + y * k->shift; + + // The shift mapping guarantees current column (or left of it) + // won't be affected by error diffusion. + assert(shifted_x > 0); + + ret = PL_MAX(ret, shifted_x); + } + } + } + return ret; +} + +size_t pl_error_diffusion_shmem_req(const struct pl_error_diffusion_kernel *kernel, + int height) +{ + // We add PL_EDF_MAX_DY empty lines on the bottom to handle errors + // propagated out from bottom side. + int rows = height + PL_EDF_MAX_DY; + int shifted_columns = compute_rightmost_shifted_column(kernel) + 1; + + // The shared memory is an array of size rows*shifted_columns. Each element + // is a single uint for three RGB component. + return rows * shifted_columns * sizeof(uint32_t); +} + +bool pl_shader_error_diffusion(pl_shader sh, const struct pl_error_diffusion_params *params) +{ + const int width = params->input_tex->params.w, height = params->input_tex->params.h; + const struct pl_glsl_version glsl = sh_glsl(sh); + const struct pl_error_diffusion_kernel *kernel = + PL_DEF(params->kernel, &pl_error_diffusion_sierra_lite); + + pl_assert(params->output_tex->params.w == width); + pl_assert(params->output_tex->params.h == height); + if (!sh_require(sh, PL_SHADER_SIG_NONE, width, height)) + return false; + + if (params->new_depth <= 0 || params->new_depth > 256) { + PL_WARN(sh, "Invalid dither depth: %d.. ignoring", params->new_depth); + return false; + } + + // The parallel error diffusion works by applying the shift mapping first. 
+ // Taking the Floyd and Steinberg algorithm for example. After applying + // the (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are + // propagated into the next few columns, which makes parallel processing on + // the same column possible. + // + // X 7/16 X 7/16 + // 3/16 5/16 1/16 ==> 0 0 3/16 5/16 1/16 + + // Figuring out the size of rectangle containing all shifted pixels. + // The rectangle height is not changed. + int shifted_width = width + (height - 1) * kernel->shift; + + // We process all pixels from the shifted rectangles column by column, with + // a single global work group of size |block_size|. + // Figuring out how many block are required to process all pixels. We need + // this explicitly to make the number of barrier() calls match. + int block_size = PL_MIN(glsl.max_group_threads, height); + int blocks = PL_DIV_UP(height * shifted_width, block_size); + + // If we figure out how many of the next columns will be affected while the + // current columns is being processed. We can store errors of only a few + // columns in the shared memory. Using a ring buffer will further save the + // cost while iterating to next column. + // + int ring_buffer_rows = height + PL_EDF_MAX_DY; + int ring_buffer_columns = compute_rightmost_shifted_column(kernel) + 1; + ident_t ring_buffer_size = sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_UINT, + .name = "ring_buffer_size", + .data = &(unsigned) { ring_buffer_rows * ring_buffer_columns }, + .compile_time = true, + }); + + // Compute shared memory requirements and try enabling compute shader. + size_t shmem_req = ring_buffer_rows * ring_buffer_columns * sizeof(uint32_t); + if (!sh_try_compute(sh, block_size, 1, false, shmem_req)) { + PL_ERR(sh, "Cannot execute error diffusion kernel: too old GPU or " + "insufficient compute shader memory!"); + return false; + } + + ident_t in_tex = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->input_tex, + .desc = { + .name = "input_tex", + .type = PL_DESC_SAMPLED_TEX, + }, + }); + + ident_t out_img = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->output_tex, + .desc = { + .name = "output_tex", + .type = PL_DESC_STORAGE_IMG, + .access = PL_DESC_ACCESS_WRITEONLY, + }, + }); + + sh->output = PL_SHADER_SIG_NONE; + sh_describef(sh, "error diffusion (%s, %d bits)", + kernel->name, params->new_depth); + + // Defines the ring buffer in shared memory. + GLSLH("shared uint err_rgb8["$"]; \n", ring_buffer_size); + GLSL("// pl_shader_error_diffusion \n" + // Safeguard against accidental over-execution + "if (gl_WorkGroupID != uvec3(0)) \n" + " return; \n" + // Initialize the ring buffer. + "for (uint i = gl_LocalInvocationIndex; i < "$"; i+=gl_WorkGroupSize.x)\n" + " err_rgb8[i] = 0u; \n" + + // Main block loop, add barrier here to have previous block all + // processed before starting the processing of the next. + "for (uint block_id = 0; block_id < "$"; block_id++) { \n" + "barrier(); \n" + // Compute the coordinate of the pixel we are currently processing, + // both before and after the shift mapping. + "uint id = block_id * gl_WorkGroupSize.x + gl_LocalInvocationIndex; \n" + "const uint height = "$"; \n" + "int y = int(id %% height), x_shifted = int(id / height); \n" + "int x = x_shifted - y * %d; \n" + // Proceed only if we are processing a valid pixel. + "if (x >= 0 && x < "$") { \n" + // The index that the current pixel have on the ring buffer. + "uint idx = uint(x_shifted * "$" + y) %% "$"; \n" + // Fetch the current pixel. 
+ "vec4 pix_orig = texelFetch("$", ivec2(x, y), 0); \n" + "vec3 pix = pix_orig.rgb; \n", + ring_buffer_size, + SH_UINT(blocks), + SH_UINT(height), + kernel->shift, + SH_INT(width), + SH_INT(ring_buffer_rows), + ring_buffer_size, + in_tex); + + // The dithering will quantize pixel value into multiples of 1/dither_quant. + int dither_quant = (1 << params->new_depth) - 1; + + // We encode errors in RGB components into a single 32-bit unsigned integer. + // The error we propagate from the current pixel is in range of + // [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite obvious, the + // sum of all errors been propagated into a pixel is also in the same range. + // It's possible to map errors in this range into [-127, 127], and use an + // unsigned 8-bit integer to store it (using standard two's complement). + // The three 8-bit unsigned integers can then be encoded into a single + // 32-bit unsigned integer, with two 4-bit padding to prevent addition + // operation overflows affecting other component. There are at most 12 + // addition operations on each pixel, so 4-bit padding should be enough. + // The overflow from R component will be discarded. + // + // The following figure is how the encoding looks like. + // + // +------------------------------------+ + // |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB| + // +------------------------------------+ + // + + // The bitshift position for R and G component. + const int bitshift_r = 24, bitshift_g = 12; + // The multiplier we use to map [-0.5, 0.5] to [-127, 127]. + const int uint8_mul = 127 * 2; + + GLSL(// Add the error previously propagated into current pixel, and clear + // it in the ring buffer. + "uint err_u32 = err_rgb8[idx] + %uu; \n" + "pix = pix * %d.0 + vec3(int((err_u32 >> %d) & 0xFFu) - 128, \n" + " int((err_u32 >> %d) & 0xFFu) - 128, \n" + " int( err_u32 & 0xFFu) - 128) / %d.0; \n" + "err_rgb8[idx] = 0u; \n" + // Write the dithered pixel. + "vec3 dithered = round(pix); \n" + "imageStore("$", ivec2(x, y), vec4(dithered / %d.0, pix_orig.a)); \n" + // Prepare for error propagation pass + "vec3 err_divided = (pix - dithered) * %d.0 / %d.0; \n" + "ivec3 tmp; \n", + (128u << bitshift_r) | (128u << bitshift_g) | 128u, + dither_quant, bitshift_r, bitshift_g, uint8_mul, + out_img, dither_quant, + uint8_mul, kernel->divisor); + + // Group error propagation with same weight factor together, in order to + // reduce the number of annoying error encoding. + for (int dividend = 1; dividend <= kernel->divisor; dividend++) { + bool err_assigned = false; + + for (int y = 0; y <= PL_EDF_MAX_DY; y++) { + for (int x = PL_EDF_MIN_DX; x <= PL_EDF_MAX_DX; x++) { + if (kernel->pattern[y][x - PL_EDF_MIN_DX] != dividend) + continue; + + if (!err_assigned) { + err_assigned = true; + + GLSL("tmp = ivec3(round(err_divided * %d.0)); \n" + "err_u32 = (uint(tmp.r & 0xFF) << %d) | \n" + " (uint(tmp.g & 0xFF) << %d) | \n" + " uint(tmp.b & 0xFF); \n", + dividend, + bitshift_r, bitshift_g); + } + + int shifted_x = x + y * kernel->shift; + + // Unlike the right border, errors propagated out from left + // border will remain in the ring buffer. This will produce + // visible artifacts near the left border, especially for + // shift=3 kernels. + if (x < 0) + GLSL("if (x >= %d) \n", -x); + + // Calculate the new position in the ring buffer to propagate + // the error into. 
+ int ring_buffer_delta = shifted_x * ring_buffer_rows + y; + GLSL("atomicAdd(err_rgb8[(idx + %du) %% "$"], err_u32); \n", + ring_buffer_delta, ring_buffer_size); + } + } + } + + GLSL("}} \n"); // end of main loop + valid pixel conditional + return true; +} diff --git a/src/shaders/film_grain.c b/src/shaders/film_grain.c new file mode 100644 index 0000000..b1d25ff --- /dev/null +++ b/src/shaders/film_grain.c @@ -0,0 +1,65 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "shaders.h" +#include "shaders/film_grain.h" + +bool pl_needs_film_grain(const struct pl_film_grain_params *params) +{ + switch (params->data.type) { + case PL_FILM_GRAIN_NONE: return false; + case PL_FILM_GRAIN_AV1: return pl_needs_fg_av1(params); + case PL_FILM_GRAIN_H274: return pl_needs_fg_h274(params); + default: pl_unreachable(); + } +} + +struct sh_grain_obj { + pl_shader_obj av1; + pl_shader_obj h274; +}; + +static void sh_grain_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_grain_obj *obj = ptr; + pl_shader_obj_destroy(&obj->av1); + pl_shader_obj_destroy(&obj->h274); +} + +bool pl_shader_film_grain(pl_shader sh, pl_shader_obj *grain_state, + const struct pl_film_grain_params *params) +{ + if (!pl_needs_film_grain(params)) { + // FIXME: Instead of erroring, sample directly + SH_FAIL(sh, "pl_shader_film_grain called but no film grain needs to be " + "applied, test with `pl_needs_film_grain` first!"); + return false; + } + + struct sh_grain_obj *obj; + obj = SH_OBJ(sh, grain_state, PL_SHADER_OBJ_FILM_GRAIN, + struct sh_grain_obj, sh_grain_uninit); + if (!obj) + return false; + + switch (params->data.type) { + case PL_FILM_GRAIN_NONE: return false; + case PL_FILM_GRAIN_AV1: return pl_shader_fg_av1(sh, &obj->av1, params); + case PL_FILM_GRAIN_H274: return pl_shader_fg_h274(sh, &obj->h274, params); + default: pl_unreachable(); + } +} diff --git a/src/shaders/film_grain.h b/src/shaders/film_grain.h new file mode 100644 index 0000000..f6498c1 --- /dev/null +++ b/src/shaders/film_grain.h @@ -0,0 +1,75 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "common.h" + +#include <libplacebo/shaders/film_grain.h> + +bool pl_needs_fg_av1(const struct pl_film_grain_params *); +bool pl_needs_fg_h274(const struct pl_film_grain_params *); + +bool pl_shader_fg_av1(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *); +bool pl_shader_fg_h274(pl_shader, pl_shader_obj *, const struct pl_film_grain_params *); + +// Common helper function +static inline enum pl_channel channel_map(int i, const struct pl_film_grain_params *params) +{ + static const enum pl_channel map_rgb[3] = { + [PL_CHANNEL_G] = PL_CHANNEL_Y, + [PL_CHANNEL_B] = PL_CHANNEL_CB, + [PL_CHANNEL_R] = PL_CHANNEL_CR, + }; + + static const enum pl_channel map_xyz[3] = { + [1] = PL_CHANNEL_Y, // Y + [2] = PL_CHANNEL_CB, // Z + [0] = PL_CHANNEL_CR, // X + }; + + if (i >= params->components) + return PL_CHANNEL_NONE; + + int comp = params->component_mapping[i]; + if (comp < 0 || comp > 2) + return PL_CHANNEL_NONE; + + switch (params->repr->sys) { + case PL_COLOR_SYSTEM_UNKNOWN: + case PL_COLOR_SYSTEM_RGB: + return map_rgb[comp]; + case PL_COLOR_SYSTEM_XYZ: + return map_xyz[comp]; + + case PL_COLOR_SYSTEM_BT_601: + case PL_COLOR_SYSTEM_BT_709: + case PL_COLOR_SYSTEM_SMPTE_240M: + case PL_COLOR_SYSTEM_BT_2020_NC: + case PL_COLOR_SYSTEM_BT_2020_C: + case PL_COLOR_SYSTEM_BT_2100_PQ: + case PL_COLOR_SYSTEM_BT_2100_HLG: + case PL_COLOR_SYSTEM_DOLBYVISION: + case PL_COLOR_SYSTEM_YCGCO: + return comp; + + case PL_COLOR_SYSTEM_COUNT: + break; + } + + pl_unreachable(); +} diff --git a/src/shaders/film_grain_av1.c b/src/shaders/film_grain_av1.c new file mode 100644 index 0000000..3b11ea3 --- /dev/null +++ b/src/shaders/film_grain_av1.c @@ -0,0 +1,1001 @@ +/* + * This file is part of libplacebo, which is normally licensed under the terms + * of the LGPL v2.1+. However, this file (film_grain_av1.c) is also available + * under the terms of the more permissive MIT license: + * + * Copyright (c) 2018-2019 Niklas Haas + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "shaders.h" +#include "shaders/film_grain.h" + +// Taken from the spec. 
Range is [-2048, 2047], mean is 0 and stddev is 512 +static const int16_t gaussian_sequence[2048] = { + 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, + 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, + 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, + -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, + 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, + 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, + 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, + 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, + 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, + 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, + 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, + -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, + 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, + 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, + -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, + -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, + -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, + -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, + 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, + 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, + 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, + -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, + -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, + -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, + 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, + 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, + 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, + -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, + 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, + -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, + 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, + -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, + 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, + -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, + -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, + -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, + -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, + -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, + 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, + 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, + -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, + -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, + 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, + 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, + -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, + 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, + 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, + -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, + 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, + -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, + 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, + -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, + -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, + 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, + -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, + -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, + 284, -176, 1016, 
204, -864, -648, -248, 356, 972, -584, -204, + 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212, + -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, + 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, + 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, + 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, + -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, + -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, + -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, + 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, + -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, + -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, + -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, + -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, + -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, + 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, + -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, + -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, + 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, + -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, + -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, + -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, + 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, + -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, + 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, + 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, + 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, + -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, + -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, + 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, + 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, + -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, + -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, + -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, + -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, + 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, + 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, + 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, + 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, + 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, + 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, + 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, + -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, + 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, + -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, + -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, + -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, + 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, + -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, + -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, + 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, + 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, + 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, + 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, + 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, + 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, + 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, + -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, + -492, 72, 556, 660, 580, 616, 
772, 436, 424, -32, -324, + -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, + 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, + -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, + -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, + 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, + -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, + 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, + 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, + 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, + -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, + 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, + -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, + 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, + 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, + 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, + 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, + -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, + -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, + 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, + -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, + 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, + 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, + 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, + -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, + -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, + 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, + 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, + 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, + -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, + -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, + 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, + -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, + -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, + -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, + 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, + -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, + 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, + -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, + 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, + -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, + 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, + 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, + 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, + 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, + -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, + -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, + -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, + -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, + 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, + 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, + 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, + 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, + -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, + 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, + -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384, + 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, + 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, + -664, 
-648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, + -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, + -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, + -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, + 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, + -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, + -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, + -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, + -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, + 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, + 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, + -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, + -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, + 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, + 428, -484 +}; + +static inline int get_random_number(int bits, uint16_t *state) +{ + int r = *state; + uint16_t bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; + *state = (r >> 1) | (bit << 15); + + return (*state >> (16 - bits)) & ((1 << bits) - 1); +} + +static inline int round2(int x, int shift) +{ + if (!shift) + return x; + + return (x + (1 << (shift - 1))) >> shift; +} + +enum { + BLOCK_SIZE = 32, + SCALING_LUT_SIZE = 256, + + GRAIN_WIDTH = 82, + GRAIN_HEIGHT = 73, + // On the GPU we only need a subsection of this + GRAIN_WIDTH_LUT = 64, + GRAIN_HEIGHT_LUT = 64, + GRAIN_PAD_LUT = 9, + + // For subsampled grain textures + SUB_GRAIN_WIDTH = 44, + SUB_GRAIN_HEIGHT = 38, + SUB_GRAIN_WIDTH_LUT = GRAIN_WIDTH_LUT >> 1, + SUB_GRAIN_HEIGHT_LUT = GRAIN_HEIGHT_LUT >> 1, + SUB_GRAIN_PAD_LUT = 6, +}; + +// Contains the shift by which the offsets are indexed +enum offset { + OFFSET_TL = 24, + OFFSET_T = 16, + OFFSET_L = 8, + OFFSET_N = 0, +}; + +// Helper function to compute some common constants +struct grain_scale { + int grain_center; + int grain_min; + int grain_max; + float texture_scale; + float grain_scale; +}; + +static inline int bit_depth(const struct pl_color_repr *repr) +{ + int depth = PL_DEF(repr->bits.color_depth, + PL_DEF(repr->bits.sample_depth, 8)); + pl_assert(depth >= 8); + return PL_MIN(depth, 12); +} + +static struct grain_scale get_grain_scale(const struct pl_film_grain_params *params) +{ + int bits = bit_depth(params->repr); + struct grain_scale ret = { + .grain_center = 128 << (bits - 8), + }; + + ret.grain_min = -ret.grain_center; + ret.grain_max = (256 << (bits - 8)) - 1 - ret.grain_center; + + struct pl_color_repr repr = *params->repr; + ret.texture_scale = pl_color_repr_normalize(&repr); + + // Since our color samples are normalized to the range [0, 1], we need to + // scale down grain values from the scale [0, 2^b - 1] to this range. + ret.grain_scale = 1.0 / ((1 << bits) - 1); + + return ret; +} + +// Generates the basic grain table (LumaGrain in the spec). 
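+// The autoregressive pass below walks the causal neighbourhood in raster
+// order; for ar_coeff_lag = L that is 2*L*(L+1) coefficients (e.g. 12 taps
+// when L = 2), with the current sample itself excluded from the sum.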
+static void generate_grain_y(float out[GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT], + int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH], + const struct pl_film_grain_params *params) +{ + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + struct grain_scale scale = get_grain_scale(params); + uint16_t seed = (uint16_t) params->data.seed; + int bits = bit_depth(params->repr); + int shift = 12 - bits + data->grain_scale_shift; + pl_assert(shift >= 0); + + for (int y = 0; y < GRAIN_HEIGHT; y++) { + for (int x = 0; x < GRAIN_WIDTH; x++) { + int16_t value = gaussian_sequence[ get_random_number(11, &seed) ]; + buf[y][x] = round2(value, shift); + } + } + + const int ar_pad = 3; + int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < GRAIN_HEIGHT; y++) { + for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) { + const int8_t *coeff = data->ar_coeffs_y; + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + if (!dx && !dy) + break; + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + int16_t grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + grain = PL_CLAMP(grain, scale.grain_min, scale.grain_max); + buf[y][x] = grain; + } + } + + for (int y = 0; y < GRAIN_HEIGHT_LUT; y++) { + for (int x = 0; x < GRAIN_WIDTH_LUT; x++) { + int16_t grain = buf[y + GRAIN_PAD_LUT][x + GRAIN_PAD_LUT]; + out[y][x] = grain * scale.grain_scale; + } + } +} + +static void generate_grain_uv(float *out, int16_t buf[GRAIN_HEIGHT][GRAIN_WIDTH], + const int16_t buf_y[GRAIN_HEIGHT][GRAIN_WIDTH], + enum pl_channel channel, int sub_x, int sub_y, + const struct pl_film_grain_params *params) +{ + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + struct grain_scale scale = get_grain_scale(params); + int bits = bit_depth(params->repr); + int shift = 12 - bits + data->grain_scale_shift; + pl_assert(shift >= 0); + + uint16_t seed = params->data.seed; + if (channel == PL_CHANNEL_CB) { + seed ^= 0xb524; + } else if (channel == PL_CHANNEL_CR) { + seed ^= 0x49d8; + } + + int chromaW = sub_x ? SUB_GRAIN_WIDTH : GRAIN_WIDTH; + int chromaH = sub_y ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT; + + const int8_t *coeffs[] = { + [PL_CHANNEL_CB] = data->ar_coeffs_uv[0], + [PL_CHANNEL_CR] = data->ar_coeffs_uv[1], + }; + + for (int y = 0; y < chromaH; y++) { + for (int x = 0; x < chromaW; x++) { + int16_t value = gaussian_sequence[ get_random_number(11, &seed) ]; + buf[y][x] = round2(value, shift); + } + } + + const int ar_pad = 3; + int ar_lag = data->ar_coeff_lag; + + for (int y = ar_pad; y < chromaH; y++) { + for (int x = ar_pad; x < chromaW - ar_pad; x++) { + const int8_t *coeff = coeffs[channel]; + pl_assert(coeff); + int sum = 0; + for (int dy = -ar_lag; dy <= 0; dy++) { + for (int dx = -ar_lag; dx <= ar_lag; dx++) { + // For the final (current) pixel, we need to add in the + // contribution from the luma grain texture + if (!dx && !dy) { + if (!data->num_points_y) + break; + int luma = 0; + int lumaX = ((x - ar_pad) << sub_x) + ar_pad; + int lumaY = ((y - ar_pad) << sub_y) + ar_pad; + for (int i = 0; i <= sub_y; i++) { + for (int j = 0; j <= sub_x; j++) { + luma += buf_y[lumaY + i][lumaX + j]; + } + } + luma = round2(luma, sub_x + sub_y); + sum += luma * (*coeff); + break; + } + + sum += *(coeff++) * buf[y + dy][x + dx]; + } + } + + int16_t grain = buf[y][x] + round2(sum, data->ar_coeff_shift); + grain = PL_CLAMP(grain, scale.grain_min, scale.grain_max); + buf[y][x] = grain; + } + } + + int lutW = GRAIN_WIDTH_LUT >> sub_x; + int lutH = GRAIN_HEIGHT_LUT >> sub_y; + int padX = sub_x ? 
SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT; + int padY = sub_y ? SUB_GRAIN_PAD_LUT : GRAIN_PAD_LUT; + + for (int y = 0; y < lutH; y++) { + for (int x = 0; x < lutW; x++) { + int16_t grain = buf[y + padY][x + padX]; + out[y * lutW + x] = grain * scale.grain_scale; + } + } +} + +static void generate_offsets(void *pbuf, const struct sh_lut_params *params) +{ + const struct pl_film_grain_data *data = params->priv; + unsigned int *buf = pbuf; + pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t)); + + for (int y = 0; y < params->height; y++) { + uint16_t state = data->seed; + state ^= ((y * 37 + 178) & 0xFF) << 8; + state ^= ((y * 173 + 105) & 0xFF); + + for (int x = 0; x < params->width; x++) { + unsigned int *offsets = &buf[y * params->width + x]; + + uint8_t val = get_random_number(8, &state); + uint8_t val_l = x ? (offsets - 1)[0] : 0; + uint8_t val_t = y ? (offsets - params->width)[0] : 0; + uint8_t val_tl = x && y ? (offsets - params->width - 1)[0] : 0; + + // Encode four offsets into a single 32-bit integer for the + // convenience of the GPU. That way only one LUT fetch is + // required for the entire block. + *offsets = ((uint32_t) val_tl << OFFSET_TL) + | ((uint32_t) val_t << OFFSET_T) + | ((uint32_t) val_l << OFFSET_L) + | ((uint32_t) val << OFFSET_N); + } + } +} + +static void generate_scaling(void *pdata, const struct sh_lut_params *params) +{ + assert(params->width == SCALING_LUT_SIZE && params->comps == 1); + float *data = pdata; + + struct { + int num; + uint8_t (*points)[2]; + const struct pl_av1_grain_data *data; + } *ctx = params->priv; + + float range = 1 << ctx->data->scaling_shift; + + // Fill up the preceding entries with the initial value + for (int i = 0; i < ctx->points[0][0]; i++) + data[i] = ctx->points[0][1] / range; + + // Linearly interpolate the values in the middle + for (int i = 0; i < ctx->num - 1; i++) { + int bx = ctx->points[i][0]; + int by = ctx->points[i][1]; + int dx = ctx->points[i + 1][0] - bx; + int dy = ctx->points[i + 1][1] - by; + int delta = dy * ((0x10000 + (dx >> 1)) / dx); + for (int x = 0; x < dx; x++) { + int v = by + ((x * delta + 0x8000) >> 16); + data[bx + x] = v / range; + } + } + + // Fill up the remaining entries with the final value + for (int i = ctx->points[ctx->num - 1][0]; i < SCALING_LUT_SIZE; i++) + data[i] = ctx->points[ctx->num - 1][1] / range; +} + +static void sample(pl_shader sh, enum offset off, ident_t lut, int idx, + int sub_x, int sub_y) +{ + int dx = (off & OFFSET_L) ? 1 : 0, + dy = (off & OFFSET_T) ? 1 : 0; + + static const char *index_strs[] = { + [0] = ".x", + [1] = ".y", + }; + + GLSL("offset = uvec2(%du, %du) * uvec2((data >> %d) & 0xFu, \n" + " (data >> %d) & 0xFu);\n" + "pos = offset + local_id.xy + uvec2(%d, %d); \n" + "val = "$"(pos)%s; \n", + sub_x ? 1 : 2, sub_y ? 1 : 2, off + 4, off, + (BLOCK_SIZE >> sub_x) * dx, + (BLOCK_SIZE >> sub_y) * dy, + lut, idx >= 0 ? 
index_strs[idx] : ""); +} + +struct grain_obj_av1 { + // LUT objects for the offsets, grain and scaling luts + pl_shader_obj lut_offsets; + pl_shader_obj lut_grain[2]; + pl_shader_obj lut_scaling[3]; + + // Previous parameters used to check reusability + struct pl_film_grain_data data; + struct pl_color_repr repr; + bool fg_has_y; + bool fg_has_u; + bool fg_has_v; + + // Space to store the temporary arrays, reused + uint32_t *offsets; + float grain[2][GRAIN_HEIGHT_LUT][GRAIN_WIDTH_LUT]; + int16_t grain_tmp_y[GRAIN_HEIGHT][GRAIN_WIDTH]; + int16_t grain_tmp_uv[GRAIN_HEIGHT][GRAIN_WIDTH]; +}; + +static void av1_grain_uninit(pl_gpu gpu, void *ptr) +{ + struct grain_obj_av1 *obj = ptr; + pl_shader_obj_destroy(&obj->lut_offsets); + for (int i = 0; i < PL_ARRAY_SIZE(obj->lut_grain); i++) + pl_shader_obj_destroy(&obj->lut_grain[i]); + for (int i = 0; i < PL_ARRAY_SIZE(obj->lut_scaling); i++) + pl_shader_obj_destroy(&obj->lut_scaling[i]); + *obj = (struct grain_obj_av1) {0}; +} + +bool pl_needs_fg_av1(const struct pl_film_grain_params *params) +{ + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + bool has_y = data->num_points_y > 0; + bool has_u = data->num_points_uv[0] > 0 || data->chroma_scaling_from_luma; + bool has_v = data->num_points_uv[1] > 0 || data->chroma_scaling_from_luma; + + for (int i = 0; i < 3; i++) { + enum pl_channel channel = channel_map(i, params); + if (channel == PL_CHANNEL_Y && has_y) + return true; + if (channel == PL_CHANNEL_CB && has_u) + return true; + if (channel == PL_CHANNEL_CR && has_v) + return true; + } + + return false; +} + +static inline bool av1_grain_data_eq(const struct pl_film_grain_data *da, + const struct pl_film_grain_data *db) +{ + const struct pl_av1_grain_data *a = &da->params.av1, *b = &db->params.av1; + + // Only check the fields that are relevant for grain LUT generation + return da->seed == db->seed && + a->chroma_scaling_from_luma == b->chroma_scaling_from_luma && + a->scaling_shift == b->scaling_shift && + a->ar_coeff_lag == b->ar_coeff_lag && + a->ar_coeff_shift == b->ar_coeff_shift && + a->grain_scale_shift == b->grain_scale_shift && + !memcmp(a->ar_coeffs_y, b->ar_coeffs_y, sizeof(a->ar_coeffs_y)) && + !memcmp(a->ar_coeffs_uv, b->ar_coeffs_uv, sizeof(a->ar_coeffs_uv)); +} + +static void fill_grain_lut(void *data, const struct sh_lut_params *params) +{ + struct grain_obj_av1 *obj = params->priv; + size_t entries = params->width * params->height * params->comps; + memcpy(data, obj->grain, entries * sizeof(float)); +} + +bool pl_shader_fg_av1(pl_shader sh, pl_shader_obj *grain_state, + const struct pl_film_grain_params *params) +{ + int sub_x = 0, sub_y = 0; + int tex_w = params->tex->params.w, + tex_h = params->tex->params.h; + + if (params->luma_tex) { + sub_x = params->luma_tex->params.w > tex_w; + sub_y = params->luma_tex->params.h > tex_h; + } + + const struct pl_av1_grain_data *data = ¶ms->data.params.av1; + bool fg_has_y = data->num_points_y > 0; + bool fg_has_u = data->num_points_uv[0] > 0 || data->chroma_scaling_from_luma; + bool fg_has_v = data->num_points_uv[1] > 0 || data->chroma_scaling_from_luma; + + bool tex_is_y = false, tex_is_cb = false, tex_is_cr = false; + for (int i = 0; i < 3; i++) { + switch (channel_map(i, params)) { + case PL_CHANNEL_Y: tex_is_y = true; break; + case PL_CHANNEL_CB: tex_is_cb = true; break; + case PL_CHANNEL_CR: tex_is_cr = true; break; + default: break; + }; + } + + if (tex_is_y && (sub_x || sub_y)) { + PL_WARN(sh, "pl_film_grain_params.channels includes PL_CHANNEL_Y but " + "plane is 
subsampled, this makes no sense. Continuing anyway " + "but output is likely incorrect."); + } + + if (!sh_require(sh, PL_SHADER_SIG_NONE, tex_w, tex_h)) + return false; + + pl_gpu gpu = SH_GPU(sh); + if (!gpu) { + PL_ERR(sh, "AV1 film grain synthesis requires a non-NULL pl_gpu!"); + return false; + } + + // Disable generation for unneeded component types + fg_has_y &= tex_is_y; + fg_has_u &= tex_is_cb; + fg_has_v &= tex_is_cr; + + int bw = BLOCK_SIZE >> sub_x; + int bh = BLOCK_SIZE >> sub_y; + bool is_compute = sh_try_compute(sh, bw, bh, false, sizeof(uint32_t)); + + struct grain_obj_av1 *obj; + obj = SH_OBJ(sh, grain_state, PL_SHADER_OBJ_AV1_GRAIN, + struct grain_obj_av1, av1_grain_uninit); + if (!obj) + return false; + + // Note: In theory we could check only the parameters related to luma or + // only related to chroma and skip updating for changes to irrelevant + // parts, but this is probably not worth it since the seed is expected to + // change per frame anyway. + bool needs_update = !av1_grain_data_eq(¶ms->data, &obj->data) || + !pl_color_repr_equal(params->repr, &obj->repr) || + fg_has_y != obj->fg_has_y || + fg_has_u != obj->fg_has_u || + fg_has_v != obj->fg_has_v; + + if (needs_update) { + // This is needed even for chroma, so statically generate it + generate_grain_y(obj->grain[0], obj->grain_tmp_y, params); + } + + ident_t lut[3]; + int idx[3] = {-1}; + + if (fg_has_y) { + lut[0] = sh_lut(sh, sh_lut_params( + .object = &obj->lut_grain[0], + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .width = GRAIN_WIDTH_LUT, + .height = GRAIN_HEIGHT_LUT, + .comps = 1, + .update = needs_update, + .dynamic = true, + .fill = fill_grain_lut, + .priv = obj, + )); + + if (!lut[0]) { + SH_FAIL(sh, "Failed generating/uploading luma grain LUT!"); + return false; + } + } + + // Try merging the chroma LUTs into a single texture + int chroma_comps = 0; + if (fg_has_u) { + generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv, + obj->grain_tmp_y, PL_CHANNEL_CB, sub_x, sub_y, + params); + idx[1] = chroma_comps++; + } + if (fg_has_v) { + generate_grain_uv(&obj->grain[chroma_comps][0][0], obj->grain_tmp_uv, + obj->grain_tmp_y, PL_CHANNEL_CR, sub_x, sub_y, + params); + idx[2] = chroma_comps++; + } + + if (chroma_comps > 0) { + lut[1] = lut[2] = sh_lut(sh, sh_lut_params( + .object = &obj->lut_grain[1], + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .width = GRAIN_WIDTH_LUT >> sub_x, + .height = GRAIN_HEIGHT_LUT >> sub_y, + .comps = chroma_comps, + .update = needs_update, + .dynamic = true, + .fill = fill_grain_lut, + .priv = obj, + )); + + if (!lut[1]) { + SH_FAIL(sh, "Failed generating/uploading chroma grain LUT!"); + return false; + } + + if (chroma_comps == 1) + idx[1] = idx[2] = -1; + } + + ident_t offsets = sh_lut(sh, sh_lut_params( + .object = &obj->lut_offsets, + .var_type = PL_VAR_UINT, + .lut_type = SH_LUT_AUTO, + .width = PL_ALIGN2(tex_w << sub_x, 128) / 32, + .height = PL_ALIGN2(tex_h << sub_y, 128) / 32, + .comps = 1, + .update = needs_update, + .dynamic = true, + .fill = generate_offsets, + .priv = (void *) ¶ms->data, + )); + + if (!offsets) { + SH_FAIL(sh, "Failed generating/uploading block offsets LUT!"); + return false; + } + + // For the scaling LUTs, we assume they'll be relatively constant + // throughout the video so doing some extra work to avoid reinitializing + // them constantly is probably worth it. Probably. 
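+    // (Illustrative only, with hypothetical scaling points: given
+    // {(0, 0), (255, 40)} and scaling_shift = 8, generate_scaling() yields a
+    // LUT that ramps linearly from 0.0 up to 40/256 ~= 0.156, i.e. the grain
+    // strength applied per input intensity, as a fraction of full scale.)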
+ const struct pl_av1_grain_data *obj_data = &obj->data.params.av1; + bool scaling_changed = false; + if (fg_has_y || data->chroma_scaling_from_luma) { + scaling_changed |= data->num_points_y != obj_data->num_points_y; + scaling_changed |= memcmp(data->points_y, obj_data->points_y, + sizeof(data->points_y)); + } + + if (fg_has_u && !data->chroma_scaling_from_luma) { + scaling_changed |= data->num_points_uv[0] != obj_data->num_points_uv[0]; + scaling_changed |= memcmp(data->points_uv[0], + obj_data->points_uv[0], + sizeof(data->points_uv[0])); + } + + if (fg_has_v && !data->chroma_scaling_from_luma) { + scaling_changed |= data->num_points_uv[1] != obj_data->num_points_uv[1]; + scaling_changed |= memcmp(data->points_uv[1], + obj_data->points_uv[1], + sizeof(data->points_uv[1])); + } + + ident_t scaling[3] = {0}; + for (int i = 0; i < 3; i++) { + struct { + int num; + const uint8_t (*points)[2]; + const struct pl_av1_grain_data *data; + } priv; + + priv.data = data; + if (i == 0 || data->chroma_scaling_from_luma) { + priv.num = data->num_points_y; + priv.points = &data->points_y[0]; + } else { + priv.num = data->num_points_uv[i - 1]; + priv.points = &data->points_uv[i - 1][0]; + } + + // Skip scaling for unneeded channels + bool has_c[3] = { fg_has_y, fg_has_u, fg_has_v }; + if (has_c[i] && priv.num > 0) { + scaling[i] = sh_lut(sh, sh_lut_params( + .object = &obj->lut_scaling[i], + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_LINEAR, + .width = SCALING_LUT_SIZE, + .comps = 1, + .update = scaling_changed, + .dynamic = true, + .fill = generate_scaling, + .priv = &priv, + )); + + if (!scaling[i]) { + SH_FAIL(sh, "Failed generating/uploading scaling LUTs!"); + return false; + } + } + } + + // Done updating LUTs + obj->data = params->data; + obj->repr = *params->repr; + obj->fg_has_y = fg_has_y; + obj->fg_has_u = fg_has_u; + obj->fg_has_v = fg_has_v; + + sh_describe(sh, "AV1 film grain"); + GLSL("vec4 color; \n" + "// pl_shader_film_grain (AV1) \n" + "{ \n" + "uvec2 offset; \n" + "uvec2 pos; \n" + "float val; \n" + "float grain; \n"); + + if (is_compute) { + GLSL("uvec2 block_id = gl_WorkGroupID.xy; \n" + "uvec2 local_id = gl_LocalInvocationID.xy; \n" + "uvec2 global_id = gl_GlobalInvocationID.xy; \n"); + } else { + GLSL("uvec2 global_id = uvec2(gl_FragCoord); \n" + "uvec2 block_id = global_id / uvec2(%d, %d); \n" + "uvec2 local_id = global_id - uvec2(%d, %d) * block_id; \n", + bw, bh, bw, bh); + } + + // Load the data vector which holds the offsets + if (is_compute) { + ident_t id = sh_fresh(sh, "data"); + GLSLH("shared uint "$"; \n", id); + GLSL("if (gl_LocalInvocationIndex == 0u) \n" + " "$" = uint("$"(block_id)); \n" + "barrier(); \n" + "uint data = "$"; \n", + id, offsets, id); + } else { + GLSL("uint data = uint("$"(block_id)); \n", offsets); + } + + struct grain_scale scale = get_grain_scale(params); + pl_color_repr_normalize(params->repr); + int bits = PL_DEF(params->repr->bits.color_depth, 8); + pl_assert(bits >= 8); + + ident_t minValue, maxLuma, maxChroma; + if (pl_color_levels_guess(params->repr) == PL_COLOR_LEVELS_LIMITED) { + float out_scale = (1 << bits) / ((1 << bits) - 1.0); + minValue = SH_FLOAT(16 / 256.0 * out_scale); + maxLuma = SH_FLOAT(235 / 256.0 * out_scale); + maxChroma = SH_FLOAT(240 / 256.0 * out_scale); + if (!pl_color_system_is_ycbcr_like(params->repr->sys)) + maxChroma = maxLuma; + } else { + minValue = SH_FLOAT(0.0); + maxLuma = SH_FLOAT(1.0); + maxChroma = SH_FLOAT(1.0); + } + + // Load the color value of the tex itself + ident_t tex = sh_desc(sh, (struct 
pl_shader_desc) { + .binding.object = params->tex, + .desc = (struct pl_desc) { + .name = "tex", + .type = PL_DESC_SAMPLED_TEX, + }, + }); + + ident_t tex_scale = SH_FLOAT(scale.texture_scale); + GLSL("color = vec4("$") * texelFetch("$", ivec2(global_id), 0); \n", + tex_scale, tex); + + // If we need access to the external luma plane, load it now + if (tex_is_cb || tex_is_cr) { + GLSL("float averageLuma; \n"); + if (tex_is_y) { + // We already have the luma channel as part of the pre-sampled color + for (int i = 0; i < 3; i++) { + if (channel_map(i, params) == PL_CHANNEL_Y) { + GLSL("averageLuma = color["$"]; \n", SH_INT(i)); + break; + } + } + } else { + // Luma channel not present in image, attach it separately + pl_assert(params->luma_tex); + ident_t luma = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->luma_tex, + .desc = (struct pl_desc) { + .name = "luma", + .type = PL_DESC_SAMPLED_TEX, + }, + }); + + GLSL("pos = global_id * uvec2(%du, %du); \n" + "averageLuma = texelFetch("$", ivec2(pos), 0)["$"]; \n" + "averageLuma *= "$"; \n", + 1 << sub_x, 1 << sub_y, + luma, SH_INT(params->luma_comp), + tex_scale); + } + } + + ident_t grain_min = SH_FLOAT(scale.grain_min * scale.grain_scale); + ident_t grain_max = SH_FLOAT(scale.grain_max * scale.grain_scale); + + for (int i = 0; i < params->components; i++) { + enum pl_channel c = channel_map(i, params); + if (c == PL_CHANNEL_NONE) + continue; + if (!scaling[c]) + continue; + + sample(sh, OFFSET_N, lut[c], idx[c], sub_x, sub_y); + GLSL("grain = val; \n"); + + if (data->overlap) { + const char *weights[] = { "vec2(27.0, 17.0)", "vec2(23.0, 22.0)" }; + + // X-direction overlapping + GLSL("if (block_id.x > 0u && local_id.x < %du) { \n" + "vec2 w = %s / 32.0; \n" + "if (local_id.x == 1u) w.xy = w.yx; \n", + 2 >> sub_x, weights[sub_x]); + sample(sh, OFFSET_L, lut[c], idx[c], sub_x, sub_y); + GLSL("grain = dot(vec2(val, grain), w); \n" + "} \n"); + + // Y-direction overlapping + GLSL("if (block_id.y > 0u && local_id.y < %du) { \n" + "vec2 w = %s / 32.0; \n" + "if (local_id.y == 1u) w.xy = w.yx; \n", + 2 >> sub_y, weights[sub_y]); + + // We need to special-case the top left pixels since these need to + // pre-blend the top-left offset block before blending vertically + GLSL(" if (block_id.x > 0u && local_id.x < %du) {\n" + " vec2 w2 = %s / 32.0; \n" + " if (local_id.x == 1u) w2.xy = w2.yx; \n", + 2 >> sub_x, weights[sub_x]); + sample(sh, OFFSET_TL, lut[c], idx[c], sub_x, sub_y); + GLSL(" float tmp = val; \n"); + sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y); + GLSL(" val = dot(vec2(tmp, val), w2); \n" + " } else { \n"); + sample(sh, OFFSET_T, lut[c], idx[c], sub_x, sub_y); + GLSL(" } \n" + "grain = dot(vec2(val, grain), w); \n" + "} \n"); + + // Correctly clip the interpolated grain + GLSL("grain = clamp(grain, "$", "$"); \n", grain_min, grain_max); + } + + if (c == PL_CHANNEL_Y) { + GLSL("color[%d] += "$"(color[%d]) * grain; \n" + "color[%d] = clamp(color[%d], "$", "$"); \n", + i, scaling[c], i, + i, i, minValue, maxLuma); + } else { + GLSL("val = averageLuma; \n"); + if (!data->chroma_scaling_from_luma) { + // We need to load some extra variables for the mixing. Do this + // using sh_var instead of hard-coding them to avoid shader + // recompilation when these values change. 
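+                // In normalized [0,1] units, the lookup input computed below
+                // works out to roughly
+                //   (uv_mult_luma * luma + uv_mult * chroma) / 64 + uv_offset / 255,
+                // which is then fed through the chroma scaling LUT.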
+ ident_t mult = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("mult"), + .data = &(float[2]){ + data->uv_mult_luma[c - 1] / 64.0, + data->uv_mult[c - 1] / 64.0, + }, + }); + + int c_offset = (unsigned) data->uv_offset[c - 1] << (bits - 8); + ident_t offset = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("offset"), + .data = &(float) { c_offset * scale.grain_scale }, + }); + + GLSL("val = dot(vec2(val, color[%d]), "$"); \n" + "val += "$"; \n", + i, mult, offset); + } + GLSL("color[%d] += "$"(val) * grain; \n" + "color[%d] = clamp(color[%d], "$", "$"); \n", + i, scaling[c], + i, i, minValue, maxChroma); + } + } + + GLSL("} \n"); + return true; +} diff --git a/src/shaders/film_grain_h274.c b/src/shaders/film_grain_h274.c new file mode 100644 index 0000000..6d524da --- /dev/null +++ b/src/shaders/film_grain_h274.c @@ -0,0 +1,815 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "shaders.h" +#include "shaders/film_grain.h" + +static const int8_t Gaussian_LUT[2048+4]; +static const uint32_t Seed_LUT[256]; +static const int8_t R64T[64][64]; + +static void prng_shift(uint32_t *state) +{ + // Primitive polynomial x^31 + x^3 + 1 (modulo 2) + uint32_t x = *state; + uint8_t feedback = 1u ^ (x >> 2) ^ (x >> 30); + *state = (x << 1) | (feedback & 1u); +} + + +static void generate_slice(float *out, size_t out_width, uint8_t h, uint8_t v, + int8_t grain[64][64], int16_t tmp[64][64]) +{ + const uint8_t freq_h = ((h + 3) << 2) - 1; + const uint8_t freq_v = ((v + 3) << 2) - 1; + uint32_t seed = Seed_LUT[h + v * 13]; + + // Initialize with random gaussian values, using the output array as a + // temporary buffer for these intermediate values. 
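+    // Only the lowest (freq_h + 1) x (freq_v + 1) transform coefficients get
+    // populated here; e.g. for h = v = 0 that is a 12x12 block, so each
+    // (h, v) slice ends up as a band-limited 64x64 noise patch.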
+ // + // Note: To make the subsequent matrix multiplication cache friendlier, we + // store each *column* of the starting image in a *row* of `grain` + for (int y = 0; y <= freq_v; y++) { + for (int x = 0; x <= freq_h; x += 4) { + uint16_t offset = seed % 2048; + grain[x + 0][y] = Gaussian_LUT[offset + 0]; + grain[x + 1][y] = Gaussian_LUT[offset + 1]; + grain[x + 2][y] = Gaussian_LUT[offset + 2]; + grain[x + 3][y] = Gaussian_LUT[offset + 3]; + prng_shift(&seed); + } + } + + grain[0][0] = 0; + + // 64x64 inverse integer transform + for (int y = 0; y < 64; y++) { + for (int x = 0; x <= freq_h; x++) { + int32_t sum = 0; + for (int p = 0; p <= freq_v; p++) + sum += R64T[y][p] * grain[x][p]; + tmp[y][x] = (sum + 128) >> 8; + } + } + + for (int y = 0; y < 64; y++) { + for (int x = 0; x < 64; x++) { + int32_t sum = 0; + for (int p = 0; p <= freq_h; p++) + sum += tmp[y][p] * R64T[x][p]; // R64T^T = R64 + sum = (sum + 128) >> 8; + grain[y][x] = PL_CLAMP(sum, -127, 127); + } + } + + static const uint8_t deblock_factors[13] = { + 64, 71, 77, 84, 90, 96, 103, 109, 116, 122, 128, 128, 128 + }; + + // Deblock horizontal edges by simple attentuation of values + const uint8_t deblock_coeff = deblock_factors[v]; + for (int y = 0; y < 64; y++) { + switch (y % 8) { + case 0: case 7: + // Deblock + for (int x = 0; x < 64; x++) + out[x] = ((grain[y][x] * deblock_coeff) >> 7) / 255.0; + break; + + case 1: case 2: + case 3: case 4: + case 5: case 6: + // No deblock + for (int x = 0; x < 64; x++) + out[x] = grain[y][x] / 255.0; + break; + + default: pl_unreachable(); + } + + out += out_width; + } +} + +static void fill_grain_lut(void *data, const struct sh_lut_params *params) +{ + struct { + int8_t grain[64][64]; + int16_t tmp[64][64]; + } *tmp = pl_alloc_ptr(NULL, tmp); + + float *out = data; + assert(params->var_type == PL_VAR_FLOAT); + + for (int h = 0; h < 13; h++) { + for (int v = 0; v < 13; v++) { + float *slice = out + (h * 64) * params->width + (v * 64); + generate_slice(slice, params->width, h, v, tmp->grain, tmp->tmp); + } + } + + pl_free(tmp); +} + +bool pl_needs_fg_h274(const struct pl_film_grain_params *params) +{ + const struct pl_h274_grain_data *data = ¶ms->data.params.h274; + if (data->model_id != 0) + return false; + + for (int i = 0; i < 3; i++) { + enum pl_channel channel = channel_map(i, params); + if (channel < 0 || channel >= 3) + continue; + if (data->component_model_present[channel]) + return true; + } + + return false; +} + +bool pl_shader_fg_h274(pl_shader sh, pl_shader_obj *grain_state, + const struct pl_film_grain_params *params) +{ + if (!sh_require(sh, PL_SHADER_SIG_NONE, params->tex->params.w, params->tex->params.h)) + return false; + + size_t shmem_req = 0; + ident_t group_sum = NULL_IDENT; + + const struct pl_glsl_version glsl = sh_glsl(sh); + if (glsl.subgroup_size < 8*8) { + group_sum = sh_fresh(sh, "group_sum"); + shmem_req += sizeof(int); + GLSLH("shared int "$"; \n", group_sum); + GLSL($" = 0; barrier(); \n", group_sum); + } + + if (!sh_try_compute(sh, 8, 8, false, shmem_req)) { + SH_FAIL(sh, "H.274 film grain synthesis requires compute shaders!"); + return false; + } + + ident_t db = sh_lut(sh, sh_lut_params( + .object = grain_state, + .var_type = PL_VAR_FLOAT, + .lut_type = SH_LUT_TEXTURE, + .width = 13 * 64, + .height = 13 * 64, + .comps = 1, + .fill = fill_grain_lut, + .signature = CACHE_KEY_H274, // doesn't depend on anything + .cache = SH_CACHE(sh), + )); + + sh_describe(sh, "H.274 film grain"); + GLSL("vec4 color; \n" + "// pl_shader_film_grain (H.274) \n" + "{ \n"); 
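/*
 * [Editorial sketch, not part of the upstream patch] The database texture
 * filled by fill_grain_lut() above is one 832x832 (13*64 x 13*64) float
 * plane: each pair of cutoff frequencies (h, v), both in [0, 13), owns a
 * 64x64 slice whose top-left texel sits at row h*64, column v*64. A minimal
 * CPU-side illustration of that indexing; the helper name grain_db_texel is
 * invented here for illustration and does not exist in libplacebo.
 */
static inline float grain_db_texel(const float *db, size_t stride,
                                   int h, int v, int dx, int dy)
{
    /* db: base pointer of the 13*64 x 13*64 LUT, `stride` floats per row;
     * (dx, dy) select a texel inside the 64x64 slice for cutoffs (h, v) */
    return db[(h * 64 + dy) * stride + (v * 64 + dx)];
}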
+ + // Load the color value of the tex itself + ident_t tex = sh_desc(sh, (struct pl_shader_desc) { + .binding.object = params->tex, + .desc = (struct pl_desc) { + .name = "tex", + .type = PL_DESC_SAMPLED_TEX, + }, + }); + + GLSL("ivec2 pos = ivec2(gl_GlobalInvocationID); \n" + "color = vec4("$") * texelFetch("$", pos, 0); \n", + SH_FLOAT(pl_color_repr_normalize(params->repr)), tex); + + const struct pl_h274_grain_data *data = ¶ms->data.params.h274; + ident_t scale_factor = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_float("scale_factor"), + .data = &(float){ 1.0 / (1 << (data->log2_scale_factor + 6)) }, + }); + + // pcg3d (http://www.jcgt.org/published/0009/03/02/) + GLSL("uvec3 pcg = uvec3("$", gl_WorkGroupID.xy / 2u); \n" + "pcg = pcg * 1664525u + 1013904223u; \n" + "pcg.x += pcg.y * pcg.z; \n" + "pcg.y += pcg.z * pcg.x; \n" + "pcg.z += pcg.x * pcg.y; \n" + "pcg ^= pcg >> 16u; \n" + "pcg.x += pcg.y * pcg.z; \n" + "pcg.y += pcg.z * pcg.x; \n" + "pcg.z += pcg.x * pcg.y; \n", + sh_var(sh, (struct pl_shader_var) { + .var = pl_var_uint("seed"), + .data = &(unsigned int){ params->data.seed }, + })); + + for (int idx = 0; idx < params->components; idx++) { + enum pl_channel c = channel_map(idx, params); + if (c == PL_CHANNEL_NONE) + continue; + if (!data->component_model_present[c]) + continue; + + GLSL("// component %d\n{\n", c); + + // Compute the local 8x8 average + GLSL("float avg = color[%d] / 64.0; \n", c); + + const int precision = 10000000; + if (glsl.subgroup_size) { + GLSL("avg = subgroupAdd(avg); \n"); + + if (glsl.subgroup_size < 8*8) { + GLSL("if (subgroupElect()) \n" + " atomicAdd("$", int(avg * %d.0)); \n" + "barrier(); \n" + "avg = float("$") / %d.0; \n", + group_sum, precision, group_sum, precision); + } + } else { + GLSL("atomicAdd("$", int(avg * %d.0)); \n" + "barrier(); \n" + "avg = float("$") / %d.0; \n", + group_sum, precision, group_sum, precision); + } + + // Hard-coded unrolled loop, to avoid having to load a dynamically + // sized array into the shader - and to optimize for the very common + // case of there only being a single intensity interval + GLSL("uint val; \n"); + for (int i = 0; i < data->num_intensity_intervals[c]; i++) { + ident_t bounds = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("bounds"), + .data = &(float[2]) { + data->intensity_interval_lower_bound[c][i] / 255.0, + data->intensity_interval_upper_bound[c][i] / 255.0, + }, + }); + + const uint8_t num_values = data->num_model_values[c]; + uint8_t h = num_values > 1 ? data->comp_model_value[c][i][1] : 8; + uint8_t v = num_values > 2 ? data->comp_model_value[c][i][2] : h; + h = PL_CLAMP(h, 2, 14) - 2; + v = PL_CLAMP(v, 2, 14) - 2; + // FIXME: double h/v for subsampled planes! 
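/*
 * [Editorial sketch, not part of the upstream patch] CPU-side equivalent of
 * the unrolled intensity-interval test emitted by the GLSL above: pick the
 * first interval whose normalized bounds contain the block average, with the
 * shader falling back to val = 0u when nothing matches. The helper name
 * find_intensity_interval is invented for illustration only.
 */
static int find_intensity_interval(const struct pl_h274_grain_data *data,
                                   enum pl_channel c, float avg)
{
    for (int i = 0; i < data->num_intensity_intervals[c]; i++) {
        float lo = data->intensity_interval_lower_bound[c][i] / 255.0f;
        float hi = data->intensity_interval_upper_bound[c][i] / 255.0f;
        if (avg >= lo && avg <= hi)
            return i;
    }
    return -1; /* no matching interval */
}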
+ + // Reduce scale for chroma planes + int16_t scale = data->comp_model_value[c][i][0]; + if (c > 0 && pl_color_system_is_ycbcr_like(params->repr->sys)) + scale >>= 1; + + pl_static_assert(sizeof(unsigned int) >= sizeof(uint32_t)); + ident_t values = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_uint("comp_model_value"), + .data = &(unsigned int) { + (uint16_t) scale << 16 | h << 8 | v, + }, + }); + + GLSL("if (avg >= "$".x && avg <= "$".y) \n" + " val = "$"; else \n", + bounds, bounds, values); + } + GLSL(" val = 0u; \n"); + + // Extract the grain parameters from comp_model_value + GLSL("uvec2 offset = uvec2((val & 0xFF00u) >> 2, \n" + " (val & 0xFFu) << 6); \n" + "float scale = "$" * float(int(val >> 16)); \n" + // Add randomness + "uint rand = pcg[%d]; \n" + "offset.x += (rand >> 16u) %% 52u; \n" + "offset.y += (rand & 0xFFFFu) %% 56u; \n" + "offset.x &= 0xFFFCu; \n" + "offset.y &= 0xFFF8u; \n" + "if ((rand & 1u) == 1u) scale = -scale; \n" + // Add local offset and compute grain + "offset += 8u * (gl_WorkGroupID.xy %% 2u); \n" + "offset += gl_LocalInvocationID.xy; \n" + "float grain = "$"(offset); \n" + "color[%d] += scale * grain; \n", + scale_factor, c, db, c); + + // TODO: Deblocking? + + GLSL("}\n"); + } + + GLSL("} \n"); + return true; +} + +// These tables are all taken from the SMPTE RDD 5-2006 specification +static const int8_t Gaussian_LUT[2048+4] = { + -11, 12, 103, -11, 42, -35, 12, 59, 77, 98, -87, 3, 65, -78, 45, 56, -51, 21, + 13, -11, -20, -19, 33, -127, 17, -6, -105, 18, 19, 71, 48, -10, -38, 42, + -2, 75, -67, 52, -90, 33, -47, 21, -3, -56, 49, 1, -57, -42, -1, 120, -127, + -108, -49, 9, 14, 127, 122, 109, 52, 127, 2, 7, 114, 19, 30, 12, 77, 112, + 82, -61, -127, 111, -52, -29, 2, -49, -24, 58, -29, -73, 12, 112, 67, 79, + -3, -114, -87, -6, -5, 40, 58, -81, 49, -27, -31, -34, -105, 50, 16, -24, + -35, -14, -15, -127, -55, -22, -55, -127, -112, 5, -26, -72, 127, 127, -2, + 41, 87, -65, -16, 55, 19, 91, -81, -65, -64, 35, -7, -54, 99, -7, 88, 125, + -26, 91, 0, 63, 60, -14, -23, 113, -33, 116, 14, 26, 51, -16, 107, -8, 53, + 38, -34, 17, -7, 4, -91, 6, 63, 63, -15, 39, -36, 19, 55, 17, -51, 40, 33, + -37, 126, -39, -118, 17, -30, 0, 19, 98, 60, 101, -12, -73, -17, -52, 98, + 3, 3, 60, 33, -3, -2, 10, -42, -106, -38, 14, 127, 16, -127, -31, -86, -39, + -56, 46, -41, 75, 23, -19, -22, -70, 74, -54, -2, 32, -45, 17, -92, 59, + -64, -67, 56, -102, -29, -87, -34, -92, 68, 5, -74, -61, 93, -43, 14, -26, + -38, -126, -17, 16, -127, 64, 34, 31, 93, 17, -51, -59, 71, 77, 81, 127, + 127, 61, 33, -106, -93, 0, 0, 75, -69, 71, 127, -19, -111, 30, 23, 15, 2, + 39, 92, 5, 42, 2, -6, 38, 15, 114, -30, -37, 50, 44, 106, 27, 119, 7, -80, + 25, -68, -21, 92, -11, -1, 18, 41, -50, 79, -127, -43, 127, 18, 11, -21, + 32, -52, 27, -88, -90, -39, -19, -10, 24, -118, 72, -24, -44, 2, 12, 86, + -107, 39, -33, -127, 47, 51, -24, -22, 46, 0, 15, -35, -69, -2, -74, 24, + -6, 0, 29, -3, 45, 32, -32, 117, -45, 79, -24, -17, -109, -10, -70, 88, + -48, 24, -91, 120, -37, 50, -127, 58, 32, -82, -10, -17, -7, 46, -127, -15, + 89, 127, 17, 98, -39, -33, 37, 42, -40, -32, -21, 105, -19, 19, 19, -59, + -9, 30, 0, -127, 34, 127, -84, 75, 24, -40, -49, -127, -107, -14, 45, -75, + 1, 30, -20, 41, -68, -40, 12, 127, -3, 5, 20, -73, -59, -127, -3, -3, -53, + -6, -119, 93, 120, -80, -50, 0, 20, -46, 67, 78, -12, -22, -127, 36, -41, + 56, 119, -5, -116, -22, 68, -14, -90, 24, -82, -44, -127, 107, -25, -37, + 40, -7, -7, -82, 5, -87, 44, -34, 9, -127, 39, 70, 49, -63, 74, -49, 109, + -27, -89, -47, -39, 44, 
49, -4, 60, -42, 80, 9, -127, -9, -56, -49, 125, + -66, 47, 36, 117, 15, -11, -96, 109, 94, -17, -56, 70, 8, -14, -5, 50, 37, + -45, 120, -30, -76, 40, -46, 6, 3, 69, 17, -78, 1, -79, 6, 127, 43, 26, + 127, -127, 28, -55, -26, 55, 112, 48, 107, -1, -77, -1, 53, -9, -22, -43, + 123, 108, 127, 102, 68, 46, 5, 1, 123, -13, -55, -34, -49, 89, 65, -105, + -5, 94, -53, 62, 45, 30, 46, 18, -35, 15, 41, 47, -98, -24, 94, -75, 127, + -114, 127, -68, 1, -17, 51, -95, 47, 12, 34, -45, -75, 89, -107, -9, -58, + -29, -109, -24, 127, -61, -13, 77, -45, 17, 19, 83, -24, 9, 127, -66, 54, + 4, 26, 13, 111, 43, -113, -22, 10, -24, 83, 67, -14, 75, -123, 59, 127, + -12, 99, -19, 64, -38, 54, 9, 7, 61, -56, 3, -57, 113, -104, -59, 3, -9, + -47, 74, 85, -55, -34, 12, 118, 28, 93, -72, 13, -99, -72, -20, 30, 72, + -94, 19, -54, 64, -12, -63, -25, 65, 72, -10, 127, 0, -127, 103, -20, -73, + -112, -103, -6, 28, -42, -21, -59, -29, -26, 19, -4, -51, 94, -58, -95, + -37, 35, 20, -69, 127, -19, -127, -22, -120, -53, 37, 74, -127, -1, -12, + -119, -53, -28, 38, 69, 17, 16, -114, 89, 62, 24, 37, -23, 49, -101, -32, + -9, -95, -53, 5, 93, -23, -49, -8, 51, 3, -75, -90, -10, -39, 127, -86, + -22, 20, 20, 113, 75, 52, -31, 92, -63, 7, -12, 46, 36, 101, -43, -17, -53, + -7, -38, -76, -31, -21, 62, 31, 62, 20, -127, 31, 64, 36, 102, -85, -10, + 77, 80, 58, -79, -8, 35, 8, 80, -24, -9, 3, -17, 72, 127, 83, -87, 55, 18, + -119, -123, 36, 10, 127, 56, -55, 113, 13, 26, 32, -13, -48, 22, -13, 5, + 58, 27, 24, 26, -11, -36, 37, -92, 78, 81, 9, 51, 14, 67, -13, 0, 32, 45, + -76, 32, -39, -22, -49, -127, -27, 31, -9, 36, 14, 71, 13, 57, 12, -53, + -86, 53, -44, -35, 2, 127, 12, -66, -44, 46, -115, 3, 10, 56, -35, 119, + -19, -61, 52, -59, -127, -49, -23, 4, -5, 17, -82, -6, 127, 25, 79, 67, 64, + -25, 14, -64, -37, -127, -28, 21, -63, 66, -53, -41, 109, -62, 15, -22, 13, + 29, -63, 20, 27, 95, -44, -59, -116, -10, 79, -49, 22, -43, -16, 46, -47, + -120, -36, -29, -52, -44, 29, 127, -13, 49, -9, -127, 75, -28, -23, 88, 59, + 11, -95, 81, -59, 58, 60, -26, 40, -92, -3, -22, -58, -45, -59, -22, -53, + 71, -29, 66, -32, -23, 14, -17, -66, -24, -28, -62, 47, 38, 17, 16, -37, + -24, -11, 8, -27, -19, 59, 45, -49, -47, -4, -22, -81, 30, -67, -127, 74, + 102, 5, -18, 98, 34, -66, 42, -52, 7, -59, 24, -58, -19, -24, -118, -73, + 91, 15, -16, 79, -32, -79, -127, -36, 41, 77, -83, 2, 56, 22, -75, 127, + -16, -21, 12, 31, 56, -113, -127, 90, 55, 61, 12, 55, -14, -113, -14, 32, + 49, -67, -17, 91, -10, 1, 21, 69, -70, 99, -19, -112, 66, -90, -10, -9, + -71, 127, 50, -81, -49, 24, 61, -61, -111, 7, -41, 127, 88, -66, 108, -127, + -6, 36, -14, 41, -50, 14, 14, 73, -101, -28, 77, 127, -8, -100, 88, 38, + 121, 88, -125, -60, 13, -94, -115, 20, -67, -87, -94, -119, 44, -28, -30, + 18, 5, -53, -61, 20, -43, 11, -77, -60, 13, 29, 3, 6, -72, 38, -60, -11, + 108, -53, 41, 66, -12, -127, -127, -49, 24, 29, 46, 36, 91, 34, -33, 116, + -51, -34, -52, 91, 7, -83, 73, -26, -103, 24, -10, 76, 84, 5, 68, -80, -13, + -17, -32, -48, 20, 50, 26, 10, 63, -104, -14, 37, 127, 114, 97, 35, 1, -33, + -55, 127, -124, -33, 61, -7, 119, -32, -127, -53, -42, 63, 3, -5, -26, 70, + -58, -33, -44, -43, 34, -56, -127, 127, 25, -35, -11, 16, -81, 29, -58, 40, + -127, -127, 20, -47, -11, -36, -63, -52, -32, -82, 78, -76, -73, 8, 27, + -72, -9, -74, -85, -86, -57, 25, 78, -10, -97, 35, -65, 8, -59, 14, 1, -42, + 32, -88, -44, 17, -3, -9, 59, 40, 12, -108, -40, 24, 34, 18, -28, 2, 51, + -110, -4, 100, 1, 65, 22, 0, 127, 61, 45, 25, -31, 6, 9, -7, -48, 99, 16, + 44, -2, -40, 32, 
-39, -52, 10, -110, -19, 56, -127, 69, 26, 51, 92, 40, 61, + -52, 45, -38, 13, 85, 122, 27, 66, 45, -111, -83, -3, 31, 37, 19, -36, 58, + 71, 39, -78, -47, 58, -78, 8, -62, -36, -14, 61, 42, -127, 71, -4, 24, -54, + 52, -127, 67, -4, -42, 30, -63, 59, -3, -1, -18, -46, -92, -81, -96, -14, + -53, -10, -11, -77, 13, 1, 8, -67, -127, 127, -28, 26, -14, 18, -13, -26, + 2, 10, -46, -32, -15, 27, -31, -59, 59, 77, -121, 28, 40, -54, -62, -31, + -21, -37, -32, -6, -127, -25, -60, 70, -127, 112, -127, 127, 88, -7, 116, + 110, 53, 87, -127, 3, 16, 23, 74, -106, -51, 3, 74, -82, -112, -74, 65, 81, + 25, 53, 127, -45, -50, -103, -41, -65, -29, 79, -67, 64, -33, -30, -8, 127, + 0, -13, -51, 67, -14, 5, -92, 29, -35, -8, -90, -57, -3, 36, 43, 44, -31, + -69, -7, 36, 39, -51, 43, -81, 58, 6, 127, 12, 57, 66, 46, 59, -43, -42, + 41, -15, -120, 24, 3, -11, 19, -13, 51, 28, 3, 55, -48, -12, -1, 2, 97, + -19, 29, 42, 13, 43, 78, -44, 56, -108, -43, -19, 127, 15, -11, -18, -81, + 83, -37, 77, -109, 15, 65, -50, 43, 12, 13, 27, 28, 61, 57, 30, 26, 106, + -18, 56, 13, 97, 4, -8, -62, -103, 94, 108, -44, 52, 27, -47, -9, 105, -53, + 46, 89, 103, -33, 38, -34, 55, 51, 70, -94, -35, -87, -107, -19, -31, 9, + -19, 79, -14, 77, 5, -19, -107, 85, 21, -45, -39, -42, 9, -29, 74, 47, -75, + 60, -127, 120, -112, -57, -32, 41, 7, 79, 76, 66, 57, 41, -25, 31, 37, -47, + -36, 43, -73, -37, 63, 127, -69, -52, 90, -33, -61, 60, -55, 44, 15, 4, + -67, 13, -92, 64, 29, -39, -3, 83, -2, -38, -85, -86, 58, 35, -69, -61, 29, + -37, -95, -78, 4, 30, -4, -32, -80, -22, -9, -77, 46, 7, -93, -71, 65, 9, + -50, 127, -70, 26, -12, -39, -114, 63, -127, -100, 4, -32, 111, 22, -60, + 65, -101, 26, -42, 21, -59, -27, -74, 2, -94, 6, 126, 5, 76, -88, -9, -43, + -101, 127, 1, 125, 92, -63, 52, 56, 4, 81, -127, 127, 80, 127, -29, 30, + 116, -74, -17, -57, 105, 48, 45, 25, -72, 48, -38, -108, 31, -34, 4, -11, + 41, -127, 52, -104, -43, -37, 52, 2, 47, 87, -9, 77, 27, -41, -25, 90, 86, + -56, 75, 10, 33, 78, 58, 127, 127, -7, -73, 49, -33, -106, -35, 38, 57, 53, + -17, -4, 83, 52, -108, 54, -125, 28, 23, 56, -43, -88, -17, -6, 47, 23, -9, + 0, -13, 111, 75, 27, -52, -38, -34, 39, 30, 66, 39, 38, -64, 38, 3, 21, + -32, -51, -28, 54, -38, -87, 20, 52, 115, 18, -81, -70, 0, -14, -46, -46, + -3, 125, 16, -14, 23, -82, -84, -69, -20, -65, -127, 9, 81, -49, 61, 7, + -36, -45, -42, 57, -26, 47, 20, -85, 46, -13, 41, -37, -75, -60, 86, -78, + -127, 12, 50, 2, -3, 13, 47, 5, 19, -78, -55, -27, 65, -71, 12, -108, 20, + -16, 11, -31, 63, -55, 37, 75, -17, 127, -73, -33, -28, -120, 105, 68, 106, + -103, -106, 71, 61, 2, 23, -3, 33, -5, -15, -67, -15, -23, -54, 15, -63, + 76, 58, -110, 1, 83, -27, 22, 75, -39, -17, -11, 64, -17, -127, -54, -66, + 31, 96, 116, 3, -114, -7, -108, -63, 97, 9, 50, 8, 75, -28, 72, 112, -36, + -112, 95, -50, 23, -13, -19, 55, 21, 23, 92, 91, 22, -49, 16, -75, 23, 9, + -49, -97, -37, 49, -36, 36, -127, -86, 43, 127, -24, -24, 84, 83, -35, -34, + -12, 109, 102, -38, 51, -68, 34, 19, -22, 49, -32, 127, 40, 24, -93, -4, + -3, 105, 3, -58, -18, 8, 127, -18, 125, 68, 69, -62, 30, -36, 54, -57, -24, + 17, 43, -36, -27, -57, -67, -21, -10, -49, 68, 12, 65, 4, 48, 55, 127, -75, + 44, 89, -66, -13, -78, -82, -91, 22, 30, 33, -40, -87, -34, 96, -91, 39, + 10, -64, -3, -12, 127, -50, -37, -56, 23, -35, -36, -54, 90, -91, 2, 50, + 77, -6, -127, 16, 46, -5, -73, 0, -56, -18, -72, 28, 93, 60, 49, 20, 18, + 111, -111, 32, -83, 47, 47, -10, 35, -88, 43, 57, -98, 127, -17, 0, 1, -39, + -127, -2, 0, 63, 93, 0, 36, -66, -61, -19, 39, -127, 58, 50, 
-17, 127, 88, + -43, -108, -51, -16, 7, -36, 68, 46, -14, 107, 40, 57, 7, 19, 8, 3, 88, + -90, -92, -18, -21, -24, 13, 7, -4, -78, -91, -4, 8, -35, -5, 19, 2, -111, + 4, -66, -81, 122, -20, -34, -37, -84, 127, 68, 46, 17, 47, + + // Repeat the beginning of the array to allow wrapping reads + -11, 12, 103, -11, +}; + +static const uint32_t Seed_LUT[256] = { + 747538460, 1088979410, 1744950180, 1767011913, 1403382928, + 521866116, 1060417601, 2110622736, 1557184770, 105289385, 585624216, + 1827676546, 1191843873, 1018104344, 1123590530, 663361569, 2023850500, + 76561770, 1226763489, 80325252, 1992581442, 502705249, 740409860, + 516219202, 557974537, 1883843076, 720112066, 1640137737, 1820967556, + 40667586, 155354121, 1820967557, 1115949072, 1631803309, 98284748, + 287433856, 2119719977, 988742797, 1827432592, 579378475, 1017745956, + 1309377032, 1316535465, 2074315269, 1923385360, 209722667, 1546228260, + 168102420, 135274561, 355958469, 248291472, 2127839491, 146920100, + 585982612, 1611702337, 696506029, 1386498192, 1258072451, 1212240548, + 1043171860, 1217404993, 1090770605, 1386498193, 169093201, 541098240, + 1468005469, 456510673, 1578687785, 1838217424, 2010752065, 2089828354, + 1362717428, 970073673, 854129835, 714793201, 1266069081, 1047060864, + 1991471829, 1098097741, 913883585, 1669598224, 1337918685, 1219264706, + 1799741108, 1834116681, 683417731, 1120274457, 1073098457, 1648396544, + 176642749, 31171789, 718317889, 1266977808, 1400892508, 549749008, + 1808010512, 67112961, 1005669825, 903663673, 1771104465, 1277749632, + 1229754427, 950632997, 1979371465, 2074373264, 305357524, 1049387408, + 1171033360, 1686114305, 2147468765, 1941195985, 117709841, 809550080, + 991480851, 1816248997, 1561503561, 329575568, 780651196, 1659144592, + 1910793616, 604016641, 1665084765, 1530186961, 1870928913, 809550081, + 2079346113, 71307521, 876663040, 1073807360, 832356664, 1573927377, + 204073344, 2026918147, 1702476788, 2043881033, 57949587, 2001393952, + 1197426649, 1186508931, 332056865, 950043140, 890043474, 349099312, + 148914948, 236204097, 2022643605, 1441981517, 498130129, 1443421481, + 924216797, 1817491777, 1913146664, 1411989632, 929068432, 495735097, + 1684636033, 1284520017, 432816184, 1344884865, 210843729, 676364544, + 234449232, 12112337, 1350619139, 1753272996, 2037118872, 1408560528, + 533334916, 1043640385, 357326099, 201376421, 110375493, 541106497, + 416159637, 242512193, 777294080, 1614872576, 1535546636, 870600145, + 910810409, 1821440209, 1605432464, 1145147393, 951695441, 1758494976, + 1506656568, 1557150160, 608221521, 1073840384, 217672017, 684818688, + 1750138880, 16777217, 677990609, 953274371, 1770050213, 1359128393, + 1797602707, 1984616737, 1865815816, 2120835200, 2051677060, 1772234061, + 1579794881, 1652821009, 1742099468, 1887260865, 46468113, 1011925248, + 1134107920, 881643832, 1354774993, 472508800, 1892499769, 1752793472, + 1962502272, 687898625, 883538000, 1354355153, 1761673473, 944820481, + 2020102353, 22020353, 961597696, 1342242816, 964808962, 1355809701, + 17016649, 1386540177, 647682692, 1849012289, 751668241, 1557184768, + 127374604, 1927564752, 1045744913, 1614921984, 43588881, 1016185088, + 1544617984, 1090519041, 136122424, 215038417, 1563027841, 2026918145, + 1688778833, 701530369, 1372639488, 1342242817, 2036945104, 953274369, + 1750192384, 16842753, 964808960, 1359020032, 1358954497 +}; + +// Note: This is pre-transposed, i.e. 
stored column-major order +static const int8_t R64T[64][64] = { + { + 32, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 43, 43, 43, 42, + 42, 41, 41, 40, 40, 39, 39, 38, 38, 37, 36, 36, 35, 34, 34, 33, + 32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, + 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 2, 1, + }, { + 32, 45, 45, 44, 43, 42, 41, 39, 38, 36, 34, 31, 29, 26, 23, 20, + 17, 14, 11, 8, 4, 1, -2, -6, -9, -12, -15, -18, -21, -24, -27, -30, + -32, -34, -36, -38, -40, -41, -43, -44, -44, -45, -45, -45, -45, -45, -44, -43, + -42, -40, -39, -37, -35, -33, -30, -28, -25, -22, -19, -16, -13, -10, -7, -3, + }, { + 32, 45, 44, 42, 40, 37, 34, 30, 25, 20, 15, 10, 4, -1, -7, -12, + -17, -22, -27, -31, -35, -38, -41, -43, -44, -45, -45, -45, -43, -41, -39, -36, + -32, -28, -23, -18, -13, -8, -2, 3, 9, 14, 19, 24, 29, 33, 36, 39, + 42, 44, 45, 45, 45, 44, 43, 40, 38, 34, 30, 26, 21, 16, 11, 6, + }, { + 32, 45, 43, 39, 35, 30, 23, 16, 9, 1, -7, -14, -21, -28, -34, -38, + -42, -44, -45, -45, -43, -40, -36, -31, -25, -18, -11, -3, 4, 12, 19, 26, + 32, 37, 41, 44, 45, 45, 44, 41, 38, 33, 27, 20, 13, 6, -2, -10, + -17, -24, -30, -36, -40, -43, -45, -45, -44, -42, -39, -34, -29, -22, -15, -8, + }, { + 32, 44, 41, 36, 29, 20, 11, 1, -9, -18, -27, -34, -40, -44, -45, -45, + -42, -37, -30, -22, -13, -3, 7, 16, 25, 33, 39, 43, 45, 45, 43, 38, + 32, 24, 15, 6, -4, -14, -23, -31, -38, -42, -45, -45, -43, -39, -34, -26, + -17, -8, 2, 12, 21, 30, 36, 41, 44, 45, 44, 40, 35, 28, 19, 10, + }, { + 32, 44, 39, 31, 21, 10, -2, -14, -25, -34, -41, -45, -45, -42, -36, -28, + -17, -6, 7, 18, 29, 37, 43, 45, 44, 40, 34, 24, 13, 1, -11, -22, + -32, -39, -44, -45, -43, -38, -30, -20, -9, 3, 15, 26, 35, 41, 45, 45, + 42, 36, 27, 16, 4, -8, -19, -30, -38, -43, -45, -44, -40, -33, -23, -12, + }, { + 32, 43, 36, 26, 13, -1, -15, -28, -38, -44, -45, -42, -35, -24, -11, 3, + 17, 30, 39, 44, 45, 41, 34, 22, 9, -6, -19, -31, -40, -45, -45, -40, + -32, -20, -7, 8, 21, 33, 41, 45, 44, 39, 30, 18, 4, -10, -23, -34, + -42, -45, -44, -38, -29, -16, -2, 12, 25, 36, 43, 45, 43, 37, 27, 14, + }, { + 32, 42, 34, 20, 4, -12, -27, -38, -44, -45, -39, -28, -13, 3, 19, 33, + 42, 45, 43, 34, 21, 6, -11, -26, -38, -44, -45, -39, -29, -14, 2, 18, + 32, 41, 45, 43, 35, 22, 7, -10, -25, -37, -44, -45, -40, -30, -15, 1, + 17, 31, 41, 45, 43, 36, 23, 8, -9, -24, -36, -44, -45, -40, -30, -16, + }, { + 32, 41, 30, 14, -4, -22, -36, -44, -44, -37, -23, -6, 13, 30, 41, 45, + 42, 31, 15, -3, -21, -36, -44, -45, -38, -24, -7, 12, 29, 40, 45, 42, + 32, 16, -2, -20, -35, -44, -45, -38, -25, -8, 11, 28, 40, 45, 43, 33, + 17, -1, -19, -34, -43, -45, -39, -26, -9, 10, 27, 39, 45, 43, 34, 18, + }, { + 32, 40, 27, 8, -13, -31, -43, -45, -38, -22, -2, 18, 35, 44, 44, 34, + 17, -3, -23, -38, -45, -42, -30, -12, 9, 28, 41, 45, 40, 26, 7, -14, + -32, -43, -45, -37, -21, -1, 19, 36, 44, 44, 34, 16, -4, -24, -39, -45, + -42, -30, -11, 10, 29, 41, 45, 39, 25, 6, -15, -33, -43, -45, -36, -20, + }, { + 32, 39, 23, 1, -21, -38, -45, -40, -25, -3, 19, 37, 45, 41, 27, 6, + -17, -36, -45, -42, -29, -8, 15, 34, 44, 43, 30, 10, -13, -33, -44, -44, + -32, -12, 11, 31, 43, 44, 34, 14, -9, -30, -43, -45, -35, -16, 7, 28, + 42, 45, 36, 18, -4, -26, -41, -45, -38, -20, 2, 24, 40, 45, 39, 22, + }, { + 32, 38, 19, -6, -29, -43, -44, -31, -9, 16, 36, 45, 40, 22, -2, -26, + -42, -45, -34, -12, 13, 34, 45, 41, 25, 1, -23, -40, -45, -36, -15, 10, + 32, 44, 43, 28, 4, -20, -39, -45, -38, -18, 7, 30, 43, 44, 30, 8, + -17, -37, -45, -39, -21, 3, 27, 42, 44, 33, 11, -14, 
-35, -45, -41, -24, + }, { + 32, 37, 15, -12, -35, -45, -39, -18, 9, 33, 45, 40, 21, -6, -30, -44, + -42, -24, 2, 28, 43, 43, 27, 1, -25, -42, -44, -30, -4, 22, 41, 45, + 32, 8, -19, -39, -45, -34, -11, 16, 38, 45, 36, 14, -13, -36, -45, -38, + -17, 10, 34, 45, 40, 20, -7, -31, -44, -41, -23, 3, 29, 44, 43, 26, + }, { + 32, 36, 11, -18, -40, -45, -30, -3, 25, 43, 43, 24, -4, -31, -45, -39, + -17, 12, 36, 45, 35, 10, -19, -40, -44, -30, -2, 26, 43, 42, 23, -6, + -32, -45, -39, -16, 13, 37, 45, 34, 9, -20, -41, -44, -29, -1, 27, 44, + 42, 22, -7, -33, -45, -38, -15, 14, 38, 45, 34, 8, -21, -41, -44, -28, + }, { + 32, 34, 7, -24, -43, -41, -19, 12, 38, 45, 30, 1, -29, -45, -39, -14, + 17, 40, 44, 26, -4, -33, -45, -36, -9, 22, 43, 42, 21, -10, -36, -45, + -32, -3, 27, 44, 40, 16, -15, -39, -44, -28, 2, 31, 45, 37, 11, -20, + -42, -43, -23, 8, 35, 45, 34, 6, -25, -44, -41, -18, 13, 38, 45, 30, + }, { + 32, 33, 2, -30, -45, -36, -7, 26, 44, 38, 11, -22, -43, -40, -15, 18, + 42, 42, 19, -14, -40, -44, -23, 10, 38, 45, 27, -6, -35, -45, -30, 1, + 32, 45, 34, 3, -29, -45, -36, -8, 25, 44, 39, 12, -21, -43, -41, -16, + 17, 41, 43, 20, -13, -39, -44, -24, 9, 37, 45, 28, -4, -34, -45, -31, + }, { + 32, 31, -2, -34, -45, -28, 7, 37, 44, 24, -11, -39, -43, -20, 15, 41, + 42, 16, -19, -43, -40, -12, 23, 44, 38, 8, -27, -45, -35, -3, 30, 45, + 32, -1, -34, -45, -29, 6, 36, 45, 25, -10, -39, -44, -21, 14, 41, 42, + 17, -18, -43, -40, -13, 22, 44, 38, 9, -26, -45, -36, -4, 30, 45, 33, + }, { + 32, 30, -7, -38, -43, -18, 19, 44, 38, 6, -30, -45, -29, 8, 39, 43, + 17, -20, -44, -37, -4, 31, 45, 28, -9, -39, -43, -16, 21, 44, 36, 3, + -32, -45, -27, 10, 40, 42, 15, -22, -44, -36, -2, 33, 45, 26, -11, -40, + -42, -14, 23, 45, 35, 1, -34, -45, -25, 12, 41, 41, 13, -24, -45, -34, + }, { + 32, 28, -11, -41, -40, -8, 30, 45, 25, -14, -43, -38, -4, 33, 45, 22, + -17, -44, -36, -1, 35, 44, 19, -20, -44, -34, 2, 37, 43, 16, -23, -45, + -32, 6, 39, 42, 13, -26, -45, -30, 9, 40, 41, 10, -29, -45, -27, 12, + 42, 39, 7, -31, -45, -24, 15, 43, 38, 3, -34, -45, -21, 18, 44, 36, + }, { + 32, 26, -15, -44, -35, 3, 39, 41, 9, -31, -45, -20, 21, 45, 30, -10, + -42, -38, -2, 36, 43, 14, -27, -45, -25, 16, 44, 34, -4, -39, -41, -8, + 32, 45, 19, -22, -45, -30, 11, 42, 38, 1, -36, -43, -13, 28, 45, 24, + -17, -44, -34, 6, 40, 40, 7, -33, -44, -18, 23, 45, 29, -12, -43, -37, + }, { + 32, 24, -19, -45, -29, 14, 44, 33, -9, -42, -36, 3, 40, 39, 2, -37, + -42, -8, 34, 44, 13, -30, -45, -18, 25, 45, 23, -20, -45, -28, 15, 44, + 32, -10, -43, -36, 4, 40, 39, 1, -38, -41, -7, 34, 43, 12, -30, -45, + -17, 26, 45, 22, -21, -45, -27, 16, 44, 31, -11, -43, -35, 6, 41, 38, + }, { + 32, 22, -23, -45, -21, 24, 45, 20, -25, -45, -19, 26, 45, 18, -27, -45, + -17, 28, 45, 16, -29, -45, -15, 30, 44, 14, -30, -44, -13, 31, 44, 12, + -32, -44, -11, 33, 43, 10, -34, -43, -9, 34, 43, 8, -35, -42, -7, 36, + 42, 6, -36, -41, -4, 37, 41, 3, -38, -40, -2, 38, 40, 1, -39, -39, + }, { + 32, 20, -27, -45, -13, 33, 43, 6, -38, -39, 2, 41, 35, -10, -44, -30, + 17, 45, 23, -24, -45, -16, 30, 44, 9, -36, -41, -1, 40, 37, -7, -43, + -32, 14, 45, 26, -21, -45, -19, 28, 44, 12, -34, -42, -4, 38, 39, -3, + -42, -34, 11, 44, 29, -18, -45, -22, 25, 45, 15, -31, -43, -8, 36, 40, + }, { + 32, 18, -30, -43, -4, 39, 36, -10, -44, -26, 23, 45, 13, -34, -41, 1, + 42, 33, -15, -45, -21, 28, 44, 8, -38, -38, 7, 44, 29, -20, -45, -16, + 32, 42, 2, -40, -35, 12, 45, 24, -25, -45, -11, 36, 40, -3, -43, -31, + 17, 45, 19, -30, -43, -6, 39, 37, -9, -44, -27, 22, 45, 14, -34, -41, 
+ }, { + 32, 16, -34, -40, 4, 44, 27, -24, -44, -8, 39, 36, -13, -45, -19, 31, + 42, -1, -43, -30, 21, 45, 11, -37, -38, 10, 45, 22, -29, -43, -2, 41, + 32, -18, -45, -14, 35, 39, -7, -44, -25, 26, 44, 6, -40, -34, 15, 45, + 17, -33, -41, 3, 43, 28, -23, -45, -9, 38, 36, -12, -45, -20, 30, 42, + }, { + 32, 14, -36, -37, 13, 45, 15, -36, -38, 12, 45, 16, -35, -38, 11, 45, + 17, -34, -39, 10, 45, 18, -34, -39, 9, 45, 19, -33, -40, 8, 45, 20, + -32, -40, 7, 45, 21, -31, -41, 6, 44, 22, -30, -41, 4, 44, 23, -30, + -42, 3, 44, 24, -29, -42, 2, 44, 25, -28, -43, 1, 43, 26, -27, -43, + }, { + 32, 12, -39, -33, 21, 44, 2, -43, -25, 30, 41, -8, -45, -16, 36, 36, + -17, -45, -7, 41, 29, -26, -43, 3, 44, 20, -34, -38, 13, 45, 11, -39, + -32, 22, 44, 1, -43, -24, 30, 40, -9, -45, -15, 37, 35, -18, -45, -6, + 42, 28, -27, -42, 4, 45, 19, -34, -38, 14, 45, 10, -40, -31, 23, 44, + }, { + 32, 10, -41, -28, 29, 40, -11, -45, -9, 41, 27, -30, -40, 12, 45, 8, + -42, -26, 30, 39, -13, -45, -7, 42, 25, -31, -39, 14, 45, 6, -43, -24, + 32, 38, -15, -45, -4, 43, 23, -33, -38, 16, 45, 3, -43, -22, 34, 37, + -17, -45, -2, 44, 21, -34, -36, 18, 44, 1, -44, -20, 35, 36, -19, -44, + }, { + 32, 8, -43, -22, 35, 34, -23, -42, 9, 45, 7, -43, -21, 36, 34, -24, + -42, 10, 45, 6, -43, -20, 36, 33, -25, -41, 11, 45, 4, -44, -19, 37, + 32, -26, -41, 12, 45, 3, -44, -18, 38, 31, -27, -40, 13, 45, 2, -44, + -17, 38, 30, -28, -40, 14, 45, 1, -44, -16, 39, 30, -29, -39, 15, 45, + }, { + 32, 6, -44, -16, 40, 26, -34, -34, 25, 40, -15, -44, 4, 45, 7, -44, + -17, 39, 27, -33, -35, 24, 41, -14, -44, 3, 45, 8, -43, -18, 39, 28, + -32, -36, 23, 41, -13, -45, 2, 45, 9, -43, -19, 38, 29, -31, -36, 22, + 42, -12, -45, 1, 45, 10, -43, -20, 38, 30, -30, -37, 21, 42, -11, -45, + }, { + 32, 3, -45, -10, 43, 16, -41, -22, 38, 28, -34, -33, 29, 37, -23, -40, + 17, 43, -11, -45, 4, 45, 2, -45, -9, 44, 15, -41, -21, 38, 27, -34, + -32, 30, 36, -24, -40, 18, 43, -12, -44, 6, 45, 1, -45, -8, 44, 14, + -42, -20, 39, 26, -35, -31, 30, 36, -25, -39, 19, 42, -13, -44, 7, 45, + }, { + 32, 1, -45, -3, 45, 6, -45, -8, 44, 10, -44, -12, 43, 14, -43, -16, + 42, 18, -41, -20, 40, 22, -39, -24, 38, 26, -36, -28, 35, 30, -34, -31, + 32, 33, -30, -34, 29, 36, -27, -37, 25, 38, -23, -39, 21, 40, -19, -41, + 17, 42, -15, -43, 13, 44, -11, -44, 9, 45, -7, -45, 4, 45, -2, -45, + }, { + 32, -1, -45, 3, 45, -6, -45, 8, 44, -10, -44, 12, 43, -14, -43, 16, + 42, -18, -41, 20, 40, -22, -39, 24, 38, -26, -36, 28, 35, -30, -34, 31, + 32, -33, -30, 34, 29, -36, -27, 37, 25, -38, -23, 39, 21, -40, -19, 41, + 17, -42, -15, 43, 13, -44, -11, 44, 9, -45, -7, 45, 4, -45, -2, 45, + }, { + 32, -3, -45, 10, 43, -16, -41, 22, 38, -28, -34, 33, 29, -37, -23, 40, + 17, -43, -11, 45, 4, -45, 2, 45, -9, -44, 15, 41, -21, -38, 27, 34, + -32, -30, 36, 24, -40, -18, 43, 12, -44, -6, 45, -1, -45, 8, 44, -14, + -42, 20, 39, -26, -35, 31, 30, -36, -25, 39, 19, -42, -13, 44, 7, -45, + }, { + 32, -6, -44, 16, 40, -26, -34, 34, 25, -40, -15, 44, 4, -45, 7, 44, + -17, -39, 27, 33, -35, -24, 41, 14, -44, -3, 45, -8, -43, 18, 39, -28, + -32, 36, 23, -41, -13, 45, 2, -45, 9, 43, -19, -38, 29, 31, -36, -22, + 42, 12, -45, -1, 45, -10, -43, 20, 38, -30, -30, 37, 21, -42, -11, 45, + }, { + 32, -8, -43, 22, 35, -34, -23, 42, 9, -45, 7, 43, -21, -36, 34, 24, + -42, -10, 45, -6, -43, 20, 36, -33, -25, 41, 11, -45, 4, 44, -19, -37, + 32, 26, -41, -12, 45, -3, -44, 18, 38, -31, -27, 40, 13, -45, 2, 44, + -17, -38, 30, 28, -40, -14, 45, -1, -44, 16, 39, -30, -29, 39, 15, -45, + }, { + 32, -10, -41, 28, 29, 
-40, -11, 45, -9, -41, 27, 30, -40, -12, 45, -8, + -42, 26, 30, -39, -13, 45, -7, -42, 25, 31, -39, -14, 45, -6, -43, 24, + 32, -38, -15, 45, -4, -43, 23, 33, -38, -16, 45, -3, -43, 22, 34, -37, + -17, 45, -2, -44, 21, 34, -36, -18, 44, -1, -44, 20, 35, -36, -19, 44, + }, { + 32, -12, -39, 33, 21, -44, 2, 43, -25, -30, 41, 8, -45, 16, 36, -36, + -17, 45, -7, -41, 29, 26, -43, -3, 44, -20, -34, 38, 13, -45, 11, 39, + -32, -22, 44, -1, -43, 24, 30, -40, -9, 45, -15, -37, 35, 18, -45, 6, + 42, -28, -27, 42, 4, -45, 19, 34, -38, -14, 45, -10, -40, 31, 23, -44, + }, { + 32, -14, -36, 37, 13, -45, 15, 36, -38, -12, 45, -16, -35, 38, 11, -45, + 17, 34, -39, -10, 45, -18, -34, 39, 9, -45, 19, 33, -40, -8, 45, -20, + -32, 40, 7, -45, 21, 31, -41, -6, 44, -22, -30, 41, 4, -44, 23, 30, + -42, -3, 44, -24, -29, 42, 2, -44, 25, 28, -43, -1, 43, -26, -27, 43, + }, { + 32, -16, -34, 40, 4, -44, 27, 24, -44, 8, 39, -36, -13, 45, -19, -31, + 42, 1, -43, 30, 21, -45, 11, 37, -38, -10, 45, -22, -29, 43, -2, -41, + 32, 18, -45, 14, 35, -39, -7, 44, -25, -26, 44, -6, -40, 34, 15, -45, + 17, 33, -41, -3, 43, -28, -23, 45, -9, -38, 36, 12, -45, 20, 30, -42, + }, { + 32, -18, -30, 43, -4, -39, 36, 10, -44, 26, 23, -45, 13, 34, -41, -1, + 42, -33, -15, 45, -21, -28, 44, -8, -38, 38, 7, -44, 29, 20, -45, 16, + 32, -42, 2, 40, -35, -12, 45, -24, -25, 45, -11, -36, 40, 3, -43, 31, + 17, -45, 19, 30, -43, 6, 39, -37, -9, 44, -27, -22, 45, -14, -34, 41, + }, { + 32, -20, -27, 45, -13, -33, 43, -6, -38, 39, 2, -41, 35, 10, -44, 30, + 17, -45, 23, 24, -45, 16, 30, -44, 9, 36, -41, 1, 40, -37, -7, 43, + -32, -14, 45, -26, -21, 45, -19, -28, 44, -12, -34, 42, -4, -38, 39, 3, + -42, 34, 11, -44, 29, 18, -45, 22, 25, -45, 15, 31, -43, 8, 36, -40, + }, { + 32, -22, -23, 45, -21, -24, 45, -20, -25, 45, -19, -26, 45, -18, -27, 45, + -17, -28, 45, -16, -29, 45, -15, -30, 44, -14, -30, 44, -13, -31, 44, -12, + -32, 44, -11, -33, 43, -10, -34, 43, -9, -34, 43, -8, -35, 42, -7, -36, + 42, -6, -36, 41, -4, -37, 41, -3, -38, 40, -2, -38, 40, -1, -39, 39, + }, { + 32, -24, -19, 45, -29, -14, 44, -33, -9, 42, -36, -3, 40, -39, 2, 37, + -42, 8, 34, -44, 13, 30, -45, 18, 25, -45, 23, 20, -45, 28, 15, -44, + 32, 10, -43, 36, 4, -40, 39, -1, -38, 41, -7, -34, 43, -12, -30, 45, + -17, -26, 45, -22, -21, 45, -27, -16, 44, -31, -11, 43, -35, -6, 41, -38, + }, { + 32, -26, -15, 44, -35, -3, 39, -41, 9, 31, -45, 20, 21, -45, 30, 10, + -42, 38, -2, -36, 43, -14, -27, 45, -25, -16, 44, -34, -4, 39, -41, 8, + 32, -45, 19, 22, -45, 30, 11, -42, 38, -1, -36, 43, -13, -28, 45, -24, + -17, 44, -34, -6, 40, -40, 7, 33, -44, 18, 23, -45, 29, 12, -43, 37, + }, { + 32, -28, -11, 41, -40, 8, 30, -45, 25, 14, -43, 38, -4, -33, 45, -22, + -17, 44, -36, 1, 35, -44, 19, 20, -44, 34, 2, -37, 43, -16, -23, 45, + -32, -6, 39, -42, 13, 26, -45, 30, 9, -40, 41, -10, -29, 45, -27, -12, + 42, -39, 7, 31, -45, 24, 15, -43, 38, -3, -34, 45, -21, -18, 44, -36, + }, { + 32, -30, -7, 38, -43, 18, 19, -44, 38, -6, -30, 45, -29, -8, 39, -43, + 17, 20, -44, 37, -4, -31, 45, -28, -9, 39, -43, 16, 21, -44, 36, -3, + -32, 45, -27, -10, 40, -42, 15, 22, -44, 36, -2, -33, 45, -26, -11, 40, + -42, 14, 23, -45, 35, -1, -34, 45, -25, -12, 41, -41, 13, 24, -45, 34, + }, { + 32, -31, -2, 34, -45, 28, 7, -37, 44, -24, -11, 39, -43, 20, 15, -41, + 42, -16, -19, 43, -40, 12, 23, -44, 38, -8, -27, 45, -35, 3, 30, -45, + 32, 1, -34, 45, -29, -6, 36, -45, 25, 10, -39, 44, -21, -14, 41, -42, + 17, 18, -43, 40, -13, -22, 44, -38, 9, 26, -45, 36, -4, -30, 45, -33, + }, { + 32, -33, 2, 30, -45, 
36, -7, -26, 44, -38, 11, 22, -43, 40, -15, -18, + 42, -42, 19, 14, -40, 44, -23, -10, 38, -45, 27, 6, -35, 45, -30, -1, + 32, -45, 34, -3, -29, 45, -36, 8, 25, -44, 39, -12, -21, 43, -41, 16, + 17, -41, 43, -20, -13, 39, -44, 24, 9, -37, 45, -28, -4, 34, -45, 31, + }, { + 32, -34, 7, 24, -43, 41, -19, -12, 38, -45, 30, -1, -29, 45, -39, 14, + 17, -40, 44, -26, -4, 33, -45, 36, -9, -22, 43, -42, 21, 10, -36, 45, + -32, 3, 27, -44, 40, -16, -15, 39, -44, 28, 2, -31, 45, -37, 11, 20, + -42, 43, -23, -8, 35, -45, 34, -6, -25, 44, -41, 18, 13, -38, 45, -30, + }, { + 32, -36, 11, 18, -40, 45, -30, 3, 25, -43, 43, -24, -4, 31, -45, 39, + -17, -12, 36, -45, 35, -10, -19, 40, -44, 30, -2, -26, 43, -42, 23, 6, + -32, 45, -39, 16, 13, -37, 45, -34, 9, 20, -41, 44, -29, 1, 27, -44, + 42, -22, -7, 33, -45, 38, -15, -14, 38, -45, 34, -8, -21, 41, -44, 28, + }, { + 32, -37, 15, 12, -35, 45, -39, 18, 9, -33, 45, -40, 21, 6, -30, 44, + -42, 24, 2, -28, 43, -43, 27, -1, -25, 42, -44, 30, -4, -22, 41, -45, + 32, -8, -19, 39, -45, 34, -11, -16, 38, -45, 36, -14, -13, 36, -45, 38, + -17, -10, 34, -45, 40, -20, -7, 31, -44, 41, -23, -3, 29, -44, 43, -26, + }, { + 32, -38, 19, 6, -29, 43, -44, 31, -9, -16, 36, -45, 40, -22, -2, 26, + -42, 45, -34, 12, 13, -34, 45, -41, 25, -1, -23, 40, -45, 36, -15, -10, + 32, -44, 43, -28, 4, 20, -39, 45, -38, 18, 7, -30, 43, -44, 30, -8, + -17, 37, -45, 39, -21, -3, 27, -42, 44, -33, 11, 14, -35, 45, -41, 24, + }, { + 32, -39, 23, -1, -21, 38, -45, 40, -25, 3, 19, -37, 45, -41, 27, -6, + -17, 36, -45, 42, -29, 8, 15, -34, 44, -43, 30, -10, -13, 33, -44, 44, + -32, 12, 11, -31, 43, -44, 34, -14, -9, 30, -43, 45, -35, 16, 7, -28, + 42, -45, 36, -18, -4, 26, -41, 45, -38, 20, 2, -24, 40, -45, 39, -22, + }, { + 32, -40, 27, -8, -13, 31, -43, 45, -38, 22, -2, -18, 35, -44, 44, -34, + 17, 3, -23, 38, -45, 42, -30, 12, 9, -28, 41, -45, 40, -26, 7, 14, + -32, 43, -45, 37, -21, 1, 19, -36, 44, -44, 34, -16, -4, 24, -39, 45, + -42, 30, -11, -10, 29, -41, 45, -39, 25, -6, -15, 33, -43, 45, -36, 20, + }, { + 32, -41, 30, -14, -4, 22, -36, 44, -44, 37, -23, 6, 13, -30, 41, -45, + 42, -31, 15, 3, -21, 36, -44, 45, -38, 24, -7, -12, 29, -40, 45, -42, + 32, -16, -2, 20, -35, 44, -45, 38, -25, 8, 11, -28, 40, -45, 43, -33, + 17, 1, -19, 34, -43, 45, -39, 26, -9, -10, 27, -39, 45, -43, 34, -18, + }, { + 32, -42, 34, -20, 4, 12, -27, 38, -44, 45, -39, 28, -13, -3, 19, -33, + 42, -45, 43, -34, 21, -6, -11, 26, -38, 44, -45, 39, -29, 14, 2, -18, + 32, -41, 45, -43, 35, -22, 7, 10, -25, 37, -44, 45, -40, 30, -15, -1, + 17, -31, 41, -45, 43, -36, 23, -8, -9, 24, -36, 44, -45, 40, -30, 16, + }, { + 32, -43, 36, -26, 13, 1, -15, 28, -38, 44, -45, 42, -35, 24, -11, -3, + 17, -30, 39, -44, 45, -41, 34, -22, 9, 6, -19, 31, -40, 45, -45, 40, + -32, 20, -7, -8, 21, -33, 41, -45, 44, -39, 30, -18, 4, 10, -23, 34, + -42, 45, -44, 38, -29, 16, -2, -12, 25, -36, 43, -45, 43, -37, 27, -14, + }, { + 32, -44, 39, -31, 21, -10, -2, 14, -25, 34, -41, 45, -45, 42, -36, 28, + -17, 6, 7, -18, 29, -37, 43, -45, 44, -40, 34, -24, 13, -1, -11, 22, + -32, 39, -44, 45, -43, 38, -30, 20, -9, -3, 15, -26, 35, -41, 45, -45, + 42, -36, 27, -16, 4, 8, -19, 30, -38, 43, -45, 44, -40, 33, -23, 12, + }, { + 32, -44, 41, -36, 29, -20, 11, -1, -9, 18, -27, 34, -40, 44, -45, 45, + -42, 37, -30, 22, -13, 3, 7, -16, 25, -33, 39, -43, 45, -45, 43, -38, + 32, -24, 15, -6, -4, 14, -23, 31, -38, 42, -45, 45, -43, 39, -34, 26, + -17, 8, 2, -12, 21, -30, 36, -41, 44, -45, 44, -40, 35, -28, 19, -10, + }, { + 32, -45, 43, -39, 35, -30, 23, 
-16, 9, -1, -7, 14, -21, 28, -34, 38, + -42, 44, -45, 45, -43, 40, -36, 31, -25, 18, -11, 3, 4, -12, 19, -26, + 32, -37, 41, -44, 45, -45, 44, -41, 38, -33, 27, -20, 13, -6, -2, 10, + -17, 24, -30, 36, -40, 43, -45, 45, -44, 42, -39, 34, -29, 22, -15, 8, + }, { + 32, -45, 44, -42, 40, -37, 34, -30, 25, -20, 15, -10, 4, 1, -7, 12, + -17, 22, -27, 31, -35, 38, -41, 43, -44, 45, -45, 45, -43, 41, -39, 36, + -32, 28, -23, 18, -13, 8, -2, -3, 9, -14, 19, -24, 29, -33, 36, -39, + 42, -44, 45, -45, 45, -44, 43, -40, 38, -34, 30, -26, 21, -16, 11, -6, + }, { + 32, -45, 45, -44, 43, -42, 41, -39, 38, -36, 34, -31, 29, -26, 23, -20, + 17, -14, 11, -8, 4, -1, -2, 6, -9, 12, -15, 18, -21, 24, -27, 30, + -32, 34, -36, 38, -40, 41, -43, 44, -44, 45, -45, 45, -45, 45, -44, 43, + -42, 40, -39, 37, -35, 33, -30, 28, -25, 22, -19, 16, -13, 10, -7, 3, + }, { + 32, -45, 45, -45, 45, -45, 45, -45, 44, -44, 44, -44, 43, -43, 43, -42, + 42, -41, 41, -40, 40, -39, 39, -38, 38, -37, 36, -36, 35, -34, 34, -33, + 32, -31, 30, -30, 29, -28, 27, -26, 25, -24, 23, -22, 21, -20, 19, -18, + 17, -16, 15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 4, -3, 2, -1, + } +}; diff --git a/src/shaders/icc.c b/src/shaders/icc.c new file mode 100644 index 0000000..6a16cfd --- /dev/null +++ b/src/shaders/icc.c @@ -0,0 +1,781 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> +#include "shaders.h" + +#include <libplacebo/tone_mapping.h> +#include <libplacebo/shaders/icc.h> + +const struct pl_icc_params pl_icc_default_params = { PL_ICC_DEFAULTS }; + +#ifdef PL_HAVE_LCMS + +#include <lcms2.h> +#include <lcms2_plugin.h> + +struct icc_priv { + pl_log log; + pl_cache cache; // for backwards compatibility + cmsContext cms; + cmsHPROFILE profile; + cmsHPROFILE approx; // approximation profile + float a, b, scale; // approxmation tone curve parameters and scaling + cmsCIEXYZ black; + float gamma_stddev; + uint64_t lut_sig; +}; + +static void error_callback(cmsContext cms, cmsUInt32Number code, + const char *msg) +{ + pl_log log = cmsGetContextUserData(cms); + pl_err(log, "lcms2: [%d] %s", (int) code, msg); +} + +static void set_callback(void *priv, pl_cache_obj obj) +{ + pl_icc_object icc = priv; + icc->params.cache_save(icc->params.cache_priv, obj.key, obj.data, obj.size); +} + +static pl_cache_obj get_callback(void *priv, uint64_t key) +{ + pl_icc_object icc = priv; + int s_r = icc->params.size_r, s_g = icc->params.size_g, s_b = icc->params.size_b; + size_t data_size = s_r * s_g * s_b * sizeof(uint16_t[4]); + void *data = pl_alloc(NULL, data_size); + bool ok = icc->params.cache_load(icc->params.cache_priv, key, data, data_size); + if (!ok) { + pl_free(data); + return (pl_cache_obj) {0}; + } + + return (pl_cache_obj) { + .key = key, + .data = data, + .size = data_size, + .free = pl_free, + }; +} + +void pl_icc_close(pl_icc_object *picc) +{ + pl_icc_object icc = *picc; + if (!icc) + return; + + struct icc_priv *p = PL_PRIV(icc); + cmsCloseProfile(p->approx); + cmsCloseProfile(p->profile); + cmsDeleteContext(p->cms); + pl_cache_destroy(&p->cache); + pl_free_ptr((void **) picc); +} + +static bool detect_csp(pl_icc_object icc, struct pl_raw_primaries *prim, + float *out_gamma) +{ + struct icc_priv *p = PL_PRIV(icc); + cmsHTRANSFORM tf; + cmsHPROFILE xyz = cmsCreateXYZProfileTHR(p->cms); + if (!xyz) + return false; + + // We need to use an unadapted observer to get the raw values + cmsFloat64Number prev_adapt = cmsSetAdaptationStateTHR(p->cms, 0.0); + tf = cmsCreateTransformTHR(p->cms, p->profile, TYPE_RGB_8, xyz, TYPE_XYZ_DBL, + INTENT_ABSOLUTE_COLORIMETRIC, + /* Note: These flags mostly don't do anything + * anyway, but specify them regardless */ + cmsFLAGS_NOCACHE | + cmsFLAGS_NOOPTIMIZE); + cmsSetAdaptationStateTHR(p->cms, prev_adapt); + cmsCloseProfile(xyz); + if (!tf) + return false; + + enum { + RED, + GREEN, + BLUE, + WHITE, + BLACK, + GRAY, + RAMP, + }; + + static const uint8_t test[][3] = { + [RED] = { 0xFF, 0, 0 }, + [GREEN] = { 0, 0xFF, 0 }, + [BLUE] = { 0, 0, 0xFF }, + [WHITE] = { 0xFF, 0xFF, 0xFF }, + [BLACK] = { 0x00, 0x00, 0x00 }, + [GRAY] = { 0x80, 0x80, 0x80 }, + + // Grayscale ramp (excluding endpoints) +#define V(d) { d, d, d } + V(0x01), V(0x02), V(0x03), V(0x04), V(0x05), V(0x06), V(0x07), + V(0x08), V(0x09), V(0x0A), V(0x0B), V(0x0C), V(0x0D), V(0x0E), V(0x0F), + V(0x10), V(0x11), V(0x12), V(0x13), V(0x14), V(0x15), V(0x16), V(0x17), + V(0x18), V(0x19), V(0x1A), V(0x1B), V(0x1C), V(0x1D), V(0x1E), V(0x1F), + V(0x20), V(0x21), V(0x22), V(0x23), V(0x24), V(0x25), V(0x26), V(0x27), + V(0x28), V(0x29), V(0x2A), V(0x2B), V(0x2C), V(0x2D), V(0x2E), V(0x2F), + V(0x30), V(0x31), V(0x32), V(0x33), V(0x34), V(0x35), V(0x36), V(0x37), + V(0x38), V(0x39), V(0x3A), V(0x3B), V(0x3C), V(0x3D), V(0x3E), V(0x3F), + V(0x40), V(0x41), V(0x42), V(0x43), V(0x44), V(0x45), V(0x46), V(0x47), + V(0x48), V(0x49), V(0x4A), V(0x4B), V(0x4C), V(0x4D), 
V(0x4E), V(0x4F), + V(0x50), V(0x51), V(0x52), V(0x53), V(0x54), V(0x55), V(0x56), V(0x57), + V(0x58), V(0x59), V(0x5A), V(0x5B), V(0x5C), V(0x5D), V(0x5E), V(0x5F), + V(0x60), V(0x61), V(0x62), V(0x63), V(0x64), V(0x65), V(0x66), V(0x67), + V(0x68), V(0x69), V(0x6A), V(0x6B), V(0x6C), V(0x6D), V(0x6E), V(0x6F), + V(0x70), V(0x71), V(0x72), V(0x73), V(0x74), V(0x75), V(0x76), V(0x77), + V(0x78), V(0x79), V(0x7A), V(0x7B), V(0x7C), V(0x7D), V(0x7E), V(0x7F), + V(0x80), V(0x81), V(0x82), V(0x83), V(0x84), V(0x85), V(0x86), V(0x87), + V(0x88), V(0x89), V(0x8A), V(0x8B), V(0x8C), V(0x8D), V(0x8E), V(0x8F), + V(0x90), V(0x91), V(0x92), V(0x93), V(0x94), V(0x95), V(0x96), V(0x97), + V(0x98), V(0x99), V(0x9A), V(0x9B), V(0x9C), V(0x9D), V(0x9E), V(0x9F), + V(0xA0), V(0xA1), V(0xA2), V(0xA3), V(0xA4), V(0xA5), V(0xA6), V(0xA7), + V(0xA8), V(0xA9), V(0xAA), V(0xAB), V(0xAC), V(0xAD), V(0xAE), V(0xAF), + V(0xB0), V(0xB1), V(0xB2), V(0xB3), V(0xB4), V(0xB5), V(0xB6), V(0xB7), + V(0xB8), V(0xB9), V(0xBA), V(0xBB), V(0xBC), V(0xBD), V(0xBE), V(0xBF), + V(0xC0), V(0xC1), V(0xC2), V(0xC3), V(0xC4), V(0xC5), V(0xC6), V(0xC7), + V(0xC8), V(0xC9), V(0xCA), V(0xCB), V(0xCC), V(0xCD), V(0xCE), V(0xCF), + V(0xD0), V(0xD1), V(0xD2), V(0xD3), V(0xD4), V(0xD5), V(0xD6), V(0xD7), + V(0xD8), V(0xD9), V(0xDA), V(0xDB), V(0xDC), V(0xDD), V(0xDE), V(0xDF), + V(0xE0), V(0xE1), V(0xE2), V(0xE3), V(0xE4), V(0xE5), V(0xE6), V(0xE7), + V(0xE8), V(0xE9), V(0xEA), V(0xEB), V(0xEC), V(0xED), V(0xEE), V(0xEF), + V(0xF0), V(0xF1), V(0xF2), V(0xF3), V(0xF4), V(0xF5), V(0xF6), V(0xF7), + V(0xF8), V(0xF9), V(0xFA), V(0xFB), V(0xFC), V(0xFD), V(0xFE), +#undef V + }; + + cmsCIEXYZ dst[PL_ARRAY_SIZE(test)] = {0}; + cmsDoTransform(tf, test, dst, PL_ARRAY_SIZE(dst)); + cmsDeleteTransform(tf); + + // Read primaries from transformed RGBW values + prim->red = pl_cie_from_XYZ(dst[RED].X, dst[RED].Y, dst[RED].Z); + prim->green = pl_cie_from_XYZ(dst[GREEN].X, dst[GREEN].Y, dst[GREEN].Z); + prim->blue = pl_cie_from_XYZ(dst[BLUE].X, dst[BLUE].Y, dst[BLUE].Z); + prim->white = pl_cie_from_XYZ(dst[WHITE].X, dst[WHITE].Y, dst[WHITE].Z); + + // Rough estimate of overall gamma and starting point for curve black point + const float y_approx = dst[GRAY].Y ? log(dst[GRAY].Y) / log(0.5) : 1.0f; + const float kb = fmaxf(dst[BLACK].Y, 0.0f); + float b = powf(kb, 1 / y_approx); + + // Estimate mean and stddev of gamma (Welford's method) + float M = 0.0, S = 0.0; + int k = 1; + for (int i = RAMP; i < PL_ARRAY_SIZE(dst); i++) { // exclude primaries + if (dst[i].Y <= 0 || dst[i].Y >= 1) + continue; + float src = (1 - b) * (test[i][0] / 255.0) + b; + float y = log(dst[i].Y) / log(src); + float tmpM = M; + M += (y - tmpM) / k; + S += (y - tmpM) * (y - M); + k++; + + // Update estimate of black point according to current gamma estimate + b = powf(kb, 1 / M); + } + S = sqrt(S / (k - 1)); + + PL_INFO(p, "Detected profile approximation gamma %.3f", M); + if (S > 0.5) { + PL_WARN(p, "Detected profile gamma (%.3f) very far from pure power " + "response (stddev=%.1f), suspected unusual or broken profile. " + "Using anyway, but results may be poor.", M, S); + } else if (!(M > 0)) { + PL_ERR(p, "Arithmetic error in ICC profile gamma estimation? 
" + "Please open an issue"); + return false; + } + + *out_gamma = M; + p->gamma_stddev = S; + return true; +} + +static bool detect_contrast(pl_icc_object icc, struct pl_hdr_metadata *hdr, + struct pl_icc_params *params, float max_luma) +{ + struct icc_priv *p = PL_PRIV(icc); + cmsCIEXYZ *white = cmsReadTag(p->profile, cmsSigLuminanceTag); + enum pl_rendering_intent intent = params->intent; + /* LittleCMS refuses to detect an intent in absolute colorimetric intent, + * so fall back to relative colorimetric since we only care about the + * brightness value here */ + if (intent == PL_INTENT_ABSOLUTE_COLORIMETRIC) + intent = PL_INTENT_RELATIVE_COLORIMETRIC; + if (!cmsDetectDestinationBlackPoint(&p->black, p->profile, intent, 0)) { + /* + * v4 ICC profiles have a black point tag but only for + * perceptual/saturation intents. So we change the rendering intent + * to perceptual if we are provided a v4 ICC profile. + */ + if (cmsGetEncodedICCversion(p->profile) >= 0x4000000 && intent != PL_INTENT_PERCEPTUAL) { + params->intent = PL_INTENT_PERCEPTUAL; + return detect_contrast(icc, hdr, params, max_luma); + } + + PL_ERR(p, "Failed detecting ICC profile black point!"); + return false; + } + + if (white) { + PL_DEBUG(p, "Detected raw white point X=%.2f Y=%.2f Z=%.2f cd/m^2", + white->X, white->Y, white->Z); + } + PL_DEBUG(p, "Detected raw black point X=%.6f%% Y=%.6f%% Z=%.6f%%", + p->black.X * 100, p->black.Y * 100, p->black.Z * 100); + + if (max_luma <= 0) + max_luma = white ? white->Y : PL_COLOR_SDR_WHITE; + + hdr->max_luma = max_luma; + hdr->min_luma = p->black.Y * max_luma; + hdr->min_luma = PL_MAX(hdr->min_luma, 1e-6); // prevent true 0 + PL_INFO(p, "Using ICC contrast %.0f:1", hdr->max_luma / hdr->min_luma); + return true; +} + +static void infer_clut_size(struct pl_icc_object_t *icc) +{ + struct icc_priv *p = PL_PRIV(icc); + struct pl_icc_params *params = &icc->params; + if (params->size_r && params->size_g && params->size_b) { + PL_DEBUG(p, "Using fixed 3DLUT size: %dx%dx%d", + (int) params->size_r, (int) params->size_g, (int) params->size_b); + return; + } + +#define REQUIRE_SIZE(N) \ + params->size_r = PL_MAX(params->size_r, N); \ + params->size_g = PL_MAX(params->size_g, N); \ + params->size_b = PL_MAX(params->size_b, N) + + // Default size for sanity + REQUIRE_SIZE(9); + + // Ensure enough precision to track the (absolute) black point + if (p->black.Y > 1e-4) { + float black_rel = powf(p->black.Y, 1.0f / icc->gamma); + int min_size = 2 * (int) ceilf(1.0f / black_rel); + REQUIRE_SIZE(min_size); + } + + // Ensure enough precision to track the gamma curve + if (p->gamma_stddev > 1e-2) { + REQUIRE_SIZE(65); + } else if (p->gamma_stddev > 1e-3) { + REQUIRE_SIZE(33); + } else if (p->gamma_stddev > 1e-4) { + REQUIRE_SIZE(17); + } + + // Ensure enough precision to track any internal CLUTs + cmsPipeline *pipe = NULL; + switch (icc->params.intent) { + case PL_INTENT_SATURATION: + pipe = cmsReadTag(p->profile, cmsSigBToA2Tag); + if (pipe) + break; + // fall through + case PL_INTENT_RELATIVE_COLORIMETRIC: + case PL_INTENT_ABSOLUTE_COLORIMETRIC: + default: + pipe = cmsReadTag(p->profile, cmsSigBToA1Tag); + if (pipe) + break; + // fall through + case PL_INTENT_PERCEPTUAL: + pipe = cmsReadTag(p->profile, cmsSigBToA0Tag); + break; + } + + if (!pipe) { + switch (icc->params.intent) { + case PL_INTENT_SATURATION: + pipe = cmsReadTag(p->profile, cmsSigAToB2Tag); + if (pipe) + break; + // fall through + case PL_INTENT_RELATIVE_COLORIMETRIC: + case PL_INTENT_ABSOLUTE_COLORIMETRIC: + default: + pipe = 
cmsReadTag(p->profile, cmsSigAToB1Tag); + if (pipe) + break; + // fall through + case PL_INTENT_PERCEPTUAL: + pipe = cmsReadTag(p->profile, cmsSigAToB0Tag); + break; + } + } + + if (pipe) { + for (cmsStage *stage = cmsPipelineGetPtrToFirstStage(pipe); + stage; stage = cmsStageNext(stage)) + { + switch (cmsStageType(stage)) { + case cmsSigCLutElemType: ; + _cmsStageCLutData *data = cmsStageData(stage); + if (data->Params->nInputs != 3) + continue; + params->size_r = PL_MAX(params->size_r, data->Params->nSamples[0]); + params->size_g = PL_MAX(params->size_g, data->Params->nSamples[1]); + params->size_b = PL_MAX(params->size_b, data->Params->nSamples[2]); + break; + + default: + continue; + } + } + } + + // Clamp the output size to make sure profiles are not too large + params->size_r = PL_MIN(params->size_r, 129); + params->size_g = PL_MIN(params->size_g, 129); + params->size_b = PL_MIN(params->size_b, 129); + + // Constrain the total LUT size to roughly 1M entries + const size_t max_size = 1000000; + size_t total_size = params->size_r * params->size_g * params->size_b; + if (total_size > max_size) { + float factor = powf((float) max_size / total_size, 1/3.0f); + params->size_r = ceilf(factor * params->size_r); + params->size_g = ceilf(factor * params->size_g); + params->size_b = ceilf(factor * params->size_b); + } + + PL_INFO(p, "Chosen 3DLUT size: %dx%dx%d", + (int) params->size_r, (int) params->size_g, (int) params->size_b); +} + +static bool icc_init(struct pl_icc_object_t *icc) +{ + struct icc_priv *p = PL_PRIV(icc); + struct pl_icc_params *params = &icc->params; + if (params->intent < 0 || params->intent > PL_INTENT_ABSOLUTE_COLORIMETRIC) + params->intent = cmsGetHeaderRenderingIntent(p->profile); + + struct pl_raw_primaries *out_prim = &icc->csp.hdr.prim; + if (!detect_csp(icc, out_prim, &icc->gamma)) + return false; + if (!detect_contrast(icc, &icc->csp.hdr, params, params->max_luma)) + return false; + infer_clut_size(icc); + + const struct pl_raw_primaries *best = NULL; + for (enum pl_color_primaries prim = 1; prim < PL_COLOR_PRIM_COUNT; prim++) { + const struct pl_raw_primaries *raw = pl_raw_primaries_get(prim); + if (!icc->csp.primaries && pl_raw_primaries_similar(raw, out_prim)) { + icc->containing_primaries = prim; + icc->csp.primaries = prim; + best = raw; + break; + } + + if (pl_primaries_superset(raw, out_prim) && + (!best || pl_primaries_superset(best, raw))) + { + icc->containing_primaries = prim; + best = raw; + } + } + + if (!best) { + PL_WARN(p, "ICC profile too wide to handle, colors may be clipped!"); + icc->containing_primaries = PL_COLOR_PRIM_ACES_AP0; + best = pl_raw_primaries_get(icc->containing_primaries); + } + + // Create approximation profile. Use a tone-curve based on a BT.1886-style + // pure power curve, with an approximation gamma matched to the ICC + // profile. We stretch the luminance range *before* the input to the gamma + // function, to avoid numerical issues near the black point. 
(This removes + // the need for a separate linear section) + // + // Y = scale * (aX + b)^y, where Y = PCS luma and X = encoded value ([0-1]) + p->scale = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_NORM, icc->csp.hdr.max_luma); + p->b = powf(icc->csp.hdr.min_luma / icc->csp.hdr.max_luma, 1.0f / icc->gamma); + p->a = (1 - p->b); + cmsToneCurve *curve = cmsBuildParametricToneCurve(p->cms, 2, + (double[3]) { icc->gamma, p->a, p->b }); + if (!curve) + return false; + + cmsCIExyY wp_xyY = { best->white.x, best->white.y, 1.0 }; + cmsCIExyYTRIPLE prim_xyY = { + .Red = { best->red.x, best->red.y, 1.0 }, + .Green = { best->green.x, best->green.y, 1.0 }, + .Blue = { best->blue.x, best->blue.y, 1.0 }, + }; + + p->approx = cmsCreateRGBProfileTHR(p->cms, &wp_xyY, &prim_xyY, + (cmsToneCurve *[3]){ curve, curve, curve }); + cmsFreeToneCurve(curve); + if (!p->approx) + return false; + + // We need to create an ICC V2 profile because ICC V4 perceptual profiles + // have normalized semantics, but we want colorimetric mapping with BPC + cmsSetHeaderRenderingIntent(p->approx, icc->params.intent); + cmsSetProfileVersion(p->approx, 2.2); + + // Hash all parameters affecting the generated 3DLUT + p->lut_sig = CACHE_KEY_ICC_3DLUT; + pl_hash_merge(&p->lut_sig, icc->signature); + pl_hash_merge(&p->lut_sig, params->intent); + pl_hash_merge(&p->lut_sig, params->size_r); + pl_hash_merge(&p->lut_sig, params->size_g); + pl_hash_merge(&p->lut_sig, params->size_b); + pl_hash_merge(&p->lut_sig, params->force_bpc); + union { double d; uint64_t u; } v = { .d = icc->csp.hdr.max_luma }; + pl_hash_merge(&p->lut_sig, v.u); + // min luma depends only on the max luma and profile + + // Backwards compatibility with old caching API + if ((params->cache_save || params->cache_load) && !params->cache) { + p->cache = pl_cache_create(pl_cache_params( + .log = p->log, + .set = params->cache_save ? set_callback : NULL, + .get = params->cache_load ? get_callback : NULL, + .priv = icc, + )); + } + + return true; +} + +pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile, + const struct pl_icc_params *params) +{ + if (!profile->len) + return NULL; + + struct pl_icc_object_t *icc = pl_zalloc_obj(NULL, icc, struct icc_priv); + struct icc_priv *p = PL_PRIV(icc); + icc->params = params ? 
*params : pl_icc_default_params; + icc->signature = profile->signature; + p->log = log; + p->cms = cmsCreateContext(NULL, (void *) log); + if (!p->cms) { + PL_ERR(p, "Failed creating LittleCMS context!"); + goto error; + } + + cmsSetLogErrorHandlerTHR(p->cms, error_callback); + PL_INFO(p, "Opening ICC profile.."); + p->profile = cmsOpenProfileFromMemTHR(p->cms, profile->data, profile->len); + if (!p->profile) { + PL_ERR(p, "Failed opening ICC profile"); + goto error; + } + + if (cmsGetColorSpace(p->profile) != cmsSigRgbData) { + PL_ERR(p, "Invalid ICC profile: not RGB"); + goto error; + } + + if (!icc_init(icc)) + goto error; + + return icc; + +error: + pl_icc_close((pl_icc_object *) &icc); + return NULL; +} + +static bool icc_reopen(pl_icc_object kicc, const struct pl_icc_params *params) +{ + struct pl_icc_object_t *icc = (struct pl_icc_object_t *) kicc; + struct icc_priv *p = PL_PRIV(icc); + cmsCloseProfile(p->approx); + pl_cache_destroy(&p->cache); + + *icc = (struct pl_icc_object_t) { + .params = *params, + .signature = icc->signature, + }; + + *p = (struct icc_priv) { + .log = p->log, + .cms = p->cms, + .profile = p->profile, + }; + + PL_DEBUG(p, "Reinitializing ICC profile in-place"); + return icc_init(icc); +} + +bool pl_icc_update(pl_log log, pl_icc_object *out_icc, + const struct pl_icc_profile *profile, + const struct pl_icc_params *params) +{ + params = PL_DEF(params, &pl_icc_default_params); + pl_icc_object icc = *out_icc; + if (!icc && !profile) + return false; // nothing to update + + uint64_t sig = profile ? profile->signature : icc->signature; + if (!icc || icc->signature != sig) { + pl_assert(profile); + pl_icc_close(&icc); + *out_icc = icc = pl_icc_open(log, profile, params); + return icc != NULL; + } + + int size_r = PL_DEF(params->size_r, icc->params.size_r); + int size_g = PL_DEF(params->size_g, icc->params.size_g); + int size_b = PL_DEF(params->size_b, icc->params.size_b); + bool compat = params->intent == icc->params.intent && + params->max_luma == icc->params.max_luma && + params->force_bpc == icc->params.force_bpc && + size_r == icc->params.size_r && + size_g == icc->params.size_g && + size_b == icc->params.size_b; + if (compat) + return true; + + // ICC signature is the same but parameters are different, re-open in-place + if (!icc_reopen(icc, params)) { + pl_icc_close(&icc); + *out_icc = NULL; + return false; + } + + return true; +} + +static void fill_lut(void *datap, const struct sh_lut_params *params, bool decode) +{ + pl_icc_object icc = params->priv; + struct icc_priv *p = PL_PRIV(icc); + cmsHPROFILE srcp = decode ? p->profile : p->approx; + cmsHPROFILE dstp = decode ? 
p->approx : p->profile; + int s_r = params->width, s_g = params->height, s_b = params->depth; + + pl_clock_t start = pl_clock_now(); + cmsHTRANSFORM tf = cmsCreateTransformTHR(p->cms, srcp, TYPE_RGB_16, + dstp, TYPE_RGBA_16, + icc->params.intent, + cmsFLAGS_BLACKPOINTCOMPENSATION | + cmsFLAGS_NOCACHE | cmsFLAGS_NOOPTIMIZE); + if (!tf) + return; + + pl_clock_t after_transform = pl_clock_now(); + pl_log_cpu_time(p->log, start, after_transform, "creating ICC transform"); + + uint16_t *tmp = pl_alloc(NULL, s_r * 3 * sizeof(tmp[0])); + for (int b = 0; b < s_b; b++) { + for (int g = 0; g < s_g; g++) { + // Transform a single line of the output buffer + for (int r = 0; r < s_r; r++) { + tmp[r * 3 + 0] = r * 65535 / (s_r - 1); + tmp[r * 3 + 1] = g * 65535 / (s_g - 1); + tmp[r * 3 + 2] = b * 65535 / (s_b - 1); + } + + size_t offset = (b * s_g + g) * s_r * 4; + uint16_t *data = ((uint16_t *) datap) + offset; + cmsDoTransform(tf, tmp, data, s_r); + + if (!icc->params.force_bpc) + continue; + + // Fix the black point manually. Work-around for "improper" + // profiles, as black point compensation should already have + // taken care of this normally. + const uint16_t knee = 16u << 8; + if (tmp[0] >= knee || tmp[1] >= knee) + continue; + for (int r = 0; r < s_r; r++) { + uint16_t s = (2 * tmp[1] + tmp[2] + tmp[r * 3]) >> 2; + if (s >= knee) + break; + for (int c = 0; c < 3; c++) + data[r * 3 + c] = (s * data[r * 3 + c] + (knee - s) * s) >> 12; + } + } + } + + pl_log_cpu_time(p->log, after_transform, pl_clock_now(), "generating ICC 3DLUT"); + cmsDeleteTransform(tf); + pl_free(tmp); +} + +static void fill_decode(void *datap, const struct sh_lut_params *params) +{ + fill_lut(datap, params, true); +} + +static void fill_encode(void *datap, const struct sh_lut_params *params) +{ + fill_lut(datap, params, false); +} + +static pl_cache get_cache(pl_icc_object icc, pl_shader sh) +{ + struct icc_priv *p = PL_PRIV(icc); + return PL_DEF(icc->params.cache, PL_DEF(p->cache, SH_CACHE(sh))); +} + +void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj, + struct pl_color_space *out_csp) +{ + struct icc_priv *p = PL_PRIV(icc); + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR); + if (!fmt) { + SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!"); + return; + } + + ident_t lut = sh_lut(sh, sh_lut_params( + .object = lut_obj, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_TETRAHEDRAL, + .fmt = fmt, + .width = icc->params.size_r, + .height = icc->params.size_g, + .depth = icc->params.size_b, + .comps = 4, + .signature = p->lut_sig, + .fill = fill_decode, + .cache = get_cache(icc, sh), + .priv = (void *) icc, + )); + + if (!lut) { + SH_FAIL(sh, "pl_icc_decode: failed generating LUT object"); + return; + } + + // Y = scale * (aX + b)^y + sh_describe(sh, "ICC 3DLUT"); + GLSL("// pl_icc_decode \n" + "{ \n" + "color.rgb = "$"(color.rgb).rgb; \n" + "color.rgb = "$" * color.rgb + vec3("$"); \n" + "color.rgb = pow(color.rgb, vec3("$")); \n" + "color.rgb = "$" * color.rgb; \n" + "} \n", + lut, + SH_FLOAT(p->a), SH_FLOAT(p->b), + SH_FLOAT(icc->gamma), + SH_FLOAT(p->scale)); + + if (out_csp) { + *out_csp = (struct pl_color_space) { + .primaries = icc->containing_primaries, + .transfer = PL_COLOR_TRC_LINEAR, + .hdr = icc->csp.hdr, + }; + } +} + +void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj) +{ + struct icc_priv *p = PL_PRIV(icc); + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + 
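+ // Descriptive note: the encode path here is the algebraic inverse of
+ // pl_icc_decode. Decoding applies Y = scale * (a*X + b)^gamma after the
+ // profile->approx 3DLUT; solving that relation for X gives
+ //   X = 1/a * (Y/scale)^(1/gamma) - b/a
+ // which is the sequence of GLSL operations emitted below, followed by the
+ // reverse (approx->profile) 3DLUT filled by fill_encode.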
+ pl_fmt fmt = pl_find_fmt(SH_GPU(sh), PL_FMT_UNORM, 4, 16, 16, PL_FMT_CAP_LINEAR); + if (!fmt) { + SH_FAIL(sh, "Failed finding ICC 3DLUT texture format!"); + return; + } + + ident_t lut = sh_lut(sh, sh_lut_params( + .object = lut_obj, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_TETRAHEDRAL, + .fmt = fmt, + .width = icc->params.size_r, + .height = icc->params.size_g, + .depth = icc->params.size_b, + .comps = 4, + .signature = ~p->lut_sig, // avoid confusion with decoding LUTs + .fill = fill_encode, + .cache = get_cache(icc, sh), + .priv = (void *) icc, + )); + + if (!lut) { + SH_FAIL(sh, "pl_icc_encode: failed generating LUT object"); + return; + } + + // X = 1/a * (Y/scale)^(1/y) - b/a + sh_describe(sh, "ICC 3DLUT"); + GLSL("// pl_icc_encode \n" + "{ \n" + "color.rgb = max(color.rgb, 0.0); \n" + "color.rgb = 1.0/"$" * color.rgb; \n" + "color.rgb = pow(color.rgb, vec3("$")); \n" + "color.rgb = 1.0/"$" * color.rgb - "$"; \n" + "color.rgb = "$"(color.rgb).rgb; \n" + "} \n", + SH_FLOAT(p->scale), + SH_FLOAT(1.0f / icc->gamma), + SH_FLOAT(p->a), SH_FLOAT(p->b / p->a), + lut); +} + +#else // !PL_HAVE_LCMS + +void pl_icc_close(pl_icc_object *picc) {}; +pl_icc_object pl_icc_open(pl_log log, const struct pl_icc_profile *profile, + const struct pl_icc_params *pparams) +{ + pl_err(log, "libplacebo compiled without LittleCMS 2 support!"); + return NULL; +} + +bool pl_icc_update(pl_log log, pl_icc_object *obj, + const struct pl_icc_profile *profile, + const struct pl_icc_params *params) +{ + static bool warned; + if (!warned) { + pl_err(log, "libplacebo compiled without LittleCMS 2 support!"); + warned = true; + } + *obj = NULL; + return false; +} + +void pl_icc_decode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj, + struct pl_color_space *out_csp) +{ + pl_unreachable(); // can't get a pl_icc_object +} + +void pl_icc_encode(pl_shader sh, pl_icc_object icc, pl_shader_obj *lut_obj) +{ + pl_unreachable(); +} + +#endif diff --git a/src/shaders/lut.c b/src/shaders/lut.c new file mode 100644 index 0000000..b0124fc --- /dev/null +++ b/src/shaders/lut.c @@ -0,0 +1,820 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <math.h> +#include <ctype.h> + +#include "shaders.h" + +#include <libplacebo/shaders/lut.h> + +static inline bool isnumeric(char c) +{ + return (c >= '0' && c <= '9') || c == '-'; +} + +void pl_lut_free(struct pl_custom_lut **lut) +{ + pl_free_ptr(lut); +} + +struct pl_custom_lut *pl_lut_parse_cube(pl_log log, const char *cstr, size_t cstr_len) +{ + struct pl_custom_lut *lut = pl_zalloc_ptr(NULL, lut); + pl_str str = (pl_str) { (uint8_t *) cstr, cstr_len }; + lut->signature = pl_str_hash(str); + int entries = 0; + + float min[3] = { 0.0, 0.0, 0.0 }; + float max[3] = { 1.0, 1.0, 1.0 }; + + // Parse header + while (str.len && !isnumeric(str.buf[0])) { + pl_str line = pl_str_strip(pl_str_getline(str, &str)); + if (!line.len) + continue; // skip empty line + + if (pl_str_eatstart0(&line, "TITLE")) { + pl_info(log, "Loading LUT: %.*s", PL_STR_FMT(pl_str_strip(line))); + continue; + } + + if (pl_str_eatstart0(&line, "LUT_3D_SIZE")) { + line = pl_str_strip(line); + int size; + if (!pl_str_parse_int(line, &size)) { + pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line)); + goto error; + } + if (size <= 0 || size > 1024) { + pl_err(log, "Invalid 3DLUT size: %dx%d%x", size, size, size); + goto error; + } + + lut->size[0] = lut->size[1] = lut->size[2] = size; + entries = size * size * size; + continue; + } + + if (pl_str_eatstart0(&line, "LUT_1D_SIZE")) { + line = pl_str_strip(line); + int size; + if (!pl_str_parse_int(line, &size)) { + pl_err(log, "Failed parsing dimension '%.*s'", PL_STR_FMT(line)); + goto error; + } + if (size <= 0 || size > 65536) { + pl_err(log, "Invalid 1DLUT size: %d", size); + goto error; + } + + lut->size[0] = size; + lut->size[1] = lut->size[2] = 0; + entries = size; + continue; + } + + if (pl_str_eatstart0(&line, "DOMAIN_MIN")) { + line = pl_str_strip(line); + if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[0]) || + !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &min[1]) || + !pl_str_parse_float(line, &min[2])) + { + pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line)); + goto error; + } + continue; + } + + if (pl_str_eatstart0(&line, "DOMAIN_MAX")) { + line = pl_str_strip(line); + if (!pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[0]) || + !pl_str_parse_float(pl_str_split_char(line, ' ', &line), &max[1]) || + !pl_str_parse_float(line, &max[2])) + { + pl_err(log, "Failed parsing domain: '%.*s'", PL_STR_FMT(line)); + goto error; + } + continue; + } + + if (pl_str_eatstart0(&line, "#")) { + pl_debug(log, "Unhandled .cube comment: %.*s", + PL_STR_FMT(pl_str_strip(line))); + continue; + } + + pl_warn(log, "Unhandled .cube line: %.*s", PL_STR_FMT(pl_str_strip(line))); + } + + if (!entries) { + pl_err(log, "Missing LUT size specification?"); + goto error; + } + + for (int i = 0; i < 3; i++) { + if (max[i] - min[i] < 1e-6) { + pl_err(log, "Invalid domain range: [%f, %f]", min[i], max[i]); + goto error; + } + } + + float *data = pl_alloc(lut, sizeof(float[3]) * entries); + lut->data = data; + + // Parse LUT body + pl_clock_t start = pl_clock_now(); + for (int n = 0; n < entries; n++) { + for (int c = 0; c < 3; c++) { + static const char * const digits = "0123456789.-+e"; + + // Extract valid digit sequence + size_t len = pl_strspn(str, digits); + pl_str entry = (pl_str) { str.buf, len }; + str.buf += len; + str.len -= len; + + if (!entry.len) { + if (!str.len) { + pl_err(log, "Failed parsing LUT: Unexpected EOF, expected " + "%d entries, got %d", entries * 3, n * 3 + c + 1); + } else { + pl_err(log, 
"Failed parsing LUT: Unexpected '%c', expected " + "digit", str.buf[0]); + } + goto error; + } + + float num; + if (!pl_str_parse_float(entry, &num)) { + pl_err(log, "Failed parsing float value '%.*s'", PL_STR_FMT(entry)); + goto error; + } + + // Rescale to range 0.0 - 1.0 + *data++ = (num - min[c]) / (max[c] - min[c]); + + // Skip whitespace between digits + str = pl_str_strip(str); + } + } + + str = pl_str_strip(str); + if (str.len) + pl_warn(log, "Extra data after LUT?... ignoring '%c'", str.buf[0]); + + pl_log_cpu_time(log, start, pl_clock_now(), "parsing .cube LUT"); + return lut; + +error: + pl_free(lut); + return NULL; +} + +static void fill_lut(void *datap, const struct sh_lut_params *params) +{ + const struct pl_custom_lut *lut = params->priv; + + int dim_r = params->width; + int dim_g = PL_DEF(params->height, 1); + int dim_b = PL_DEF(params->depth, 1); + + float *data = datap; + for (int b = 0; b < dim_b; b++) { + for (int g = 0; g < dim_g; g++) { + for (int r = 0; r < dim_r; r++) { + size_t offset = (b * dim_g + g) * dim_r + r; + const float *src = &lut->data[offset * 3]; + float *dst = &data[offset * 4]; + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = 0.0f; + } + } + } +} + +void pl_shader_custom_lut(pl_shader sh, const struct pl_custom_lut *lut, + pl_shader_obj *lut_state) +{ + if (!lut) + return; + + int dims; + if (lut->size[0] > 0 && lut->size[1] > 0 && lut->size[2] > 0) { + dims = 3; + } else if (lut->size[0] > 0 && !lut->size[1] && !lut->size[2]) { + dims = 1; + } else { + SH_FAIL(sh, "Invalid dimensions %dx%dx%d for pl_custom_lut, must be 1D " + "or 3D!", lut->size[0], lut->size[1], lut->size[2]); + return; + } + + if (!sh_require(sh, PL_SHADER_SIG_COLOR, 0, 0)) + return; + + ident_t fun = sh_lut(sh, sh_lut_params( + .object = lut_state, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_TETRAHEDRAL, + .width = lut->size[0], + .height = lut->size[1], + .depth = lut->size[2], + .comps = 4, // for better texel alignment + .signature = lut->signature, + .fill = fill_lut, + .priv = (void *) lut, + )); + + if (!fun) { + SH_FAIL(sh, "pl_shader_custom_lut: failed generating LUT object"); + return; + } + + GLSL("// pl_shader_custom_lut \n"); + + static const pl_matrix3x3 zero = {0}; + if (memcmp(&lut->shaper_in, &zero, sizeof(zero)) != 0) { + GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("shaper_in"), + .data = PL_TRANSPOSE_3X3(lut->shaper_in.m), + })); + } + + switch (dims) { + case 1: + sh_describe(sh, "custom 1DLUT"); + GLSL("color.rgb = vec3("$"(color.r).r, \n" + " "$"(color.g).g, \n" + " "$"(color.b).b); \n", + fun, fun, fun); + break; + case 3: + sh_describe(sh, "custom 3DLUT"); + GLSL("color.rgb = "$"(color.rgb).rgb; \n", fun); + break; + } + + if (memcmp(&lut->shaper_out, &zero, sizeof(zero)) != 0) { + GLSL("color.rgb = "$" * color.rgb; \n", sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat3("shaper_out"), + .data = PL_TRANSPOSE_3X3(lut->shaper_out.m), + })); + } +} + +// Defines a LUT position helper macro. This translates from an absolute texel +// scale (either in texels, or normalized to [0,1]) to the texture coordinate +// scale for the corresponding sample in a texture of dimension `lut_size`. +static ident_t texel_scale(pl_shader sh, int lut_size, bool normalized) +{ + const float base = 0.5f / lut_size; + const float end = 1.0f - 0.5f / lut_size; + const float scale = (end - base) / (normalized ? 
1.0f : (lut_size - 1)); + + ident_t name = sh_fresh(sh, "LUT_SCALE"); + GLSLH("#define "$"(x) ("$" * (x) + "$") \n", + name, SH_FLOAT(scale), SH_FLOAT(base)); + return name; +} + +struct sh_lut_obj { + enum sh_lut_type type; + enum sh_lut_method method; + enum pl_var_type vartype; + pl_fmt fmt; + int width, height, depth, comps; + uint64_t signature; + bool error; // reset if params change + + // weights, depending on the lut type + pl_tex tex; + pl_str str; + void *data; +}; + +static void sh_lut_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_lut_obj *lut = ptr; + pl_tex_destroy(gpu, &lut->tex); + pl_free(lut->str.buf); + pl_free(lut->data); + + *lut = (struct sh_lut_obj) {0}; +} + +// Maximum number of floats to embed as a literal array (when using SH_LUT_AUTO) +#define SH_LUT_MAX_LITERAL_SOFT 64 +#define SH_LUT_MAX_LITERAL_HARD 256 + +ident_t sh_lut(pl_shader sh, const struct sh_lut_params *params) +{ + pl_gpu gpu = SH_GPU(sh); + pl_cache_obj obj = { .key = CACHE_KEY_SH_LUT ^ params->signature }; + + const enum pl_var_type vartype = params->var_type; + pl_assert(vartype != PL_VAR_INVALID); + pl_assert(params->method == SH_LUT_NONE || vartype == PL_VAR_FLOAT); + pl_assert(params->width > 0 && params->height >= 0 && params->depth >= 0); + pl_assert(params->comps > 0); + pl_assert(!params->cache || params->signature); + + int sizes[] = { params->width, params->height, params->depth }; + int size = params->width * PL_DEF(params->height, 1) * PL_DEF(params->depth, 1); + int dims = params->depth ? 3 : params->height ? 2 : 1; + enum sh_lut_method method = params->method; + if (method == SH_LUT_TETRAHEDRAL && dims != 3) + method = SH_LUT_LINEAR; + if (method == SH_LUT_CUBIC && dims != 3) + method = SH_LUT_LINEAR; + + int texdim = 0; + uint32_t max_tex_dim[] = { + gpu ? gpu->limits.max_tex_1d_dim : 0, + gpu ? gpu->limits.max_tex_2d_dim : 0, + (gpu && gpu->glsl.version > 100) ? gpu->limits.max_tex_3d_dim : 0, + }; + + struct sh_lut_obj *lut = SH_OBJ(sh, params->object, PL_SHADER_OBJ_LUT, + struct sh_lut_obj, sh_lut_uninit); + + if (!lut) + return NULL_IDENT; + + bool update = params->update || lut->signature != params->signature || + vartype != lut->vartype || params->fmt != lut->fmt || + params->width != lut->width || params->height != lut->height || + params->depth != lut->depth || params->comps != lut->comps; + + if (lut->error && !update) + return NULL_IDENT; // suppress error spam until something changes + + // Try picking the right number of dimensions for the texture LUT. This + // allows e.g. falling back to 2D textures if 1D textures are unsupported. 
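+ // For example, a 1D LUT whose width exceeds max_tex_1d_dim (or a GPU with
+ // no usable 1D textures at all) falls through to d == 2 and is stored as a
+ // Wx1 2D texture instead; the indexing code further down pads the unused
+ // coordinates.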
+ for (int d = dims; d <= PL_ARRAY_SIZE(max_tex_dim); d++) { + // For a given dimension to be compatible, all coordinates need to be + // within the maximum texture size for that dimension + for (int i = 0; i < d; i++) { + if (sizes[i] > max_tex_dim[d - 1]) + goto next_dim; + } + + // All dimensions are compatible, so pick this texture dimension + texdim = d; + break; + +next_dim: ; // `continue` out of the inner loop + } + + static const enum pl_fmt_type fmt_type[PL_VAR_TYPE_COUNT] = { + [PL_VAR_SINT] = PL_FMT_SINT, + [PL_VAR_UINT] = PL_FMT_UINT, + [PL_VAR_FLOAT] = PL_FMT_FLOAT, + }; + + enum pl_fmt_caps texcaps = PL_FMT_CAP_SAMPLEABLE; + bool is_linear = method == SH_LUT_LINEAR || method == SH_LUT_CUBIC; + if (is_linear) + texcaps |= PL_FMT_CAP_LINEAR; + + pl_fmt texfmt = params->fmt; + if (texfmt) { + bool ok; + switch (texfmt->type) { + case PL_FMT_SINT: ok = vartype == PL_VAR_SINT; break; + case PL_FMT_UINT: ok = vartype == PL_VAR_UINT; break; + default: ok = vartype == PL_VAR_FLOAT; break; + } + + if (!ok) { + PL_ERR(sh, "Specified texture format '%s' does not match LUT " + "data type!", texfmt->name); + goto error; + } + + if (~texfmt->caps & texcaps) { + PL_ERR(sh, "Specified texture format '%s' does not match " + "required capabilities 0x%x!\n", texfmt->name, texcaps); + goto error; + } + } + + if (texdim && !texfmt) { + texfmt = pl_find_fmt(gpu, fmt_type[vartype], params->comps, + vartype == PL_VAR_FLOAT ? 16 : 32, + pl_var_type_size(vartype) * 8, + texcaps); + } + + enum sh_lut_type type = params->lut_type; + + // The linear sampling code currently only supports 1D linear interpolation + if (is_linear && dims > 1) { + if (texfmt) { + type = SH_LUT_TEXTURE; + } else { + PL_ERR(sh, "Can't emulate linear LUTs for 2D/3D LUTs and no " + "texture support available!"); + goto error; + } + } + + bool can_uniform = gpu && gpu->limits.max_variable_comps >= size * params->comps; + bool can_literal = sh_glsl(sh).version > 110; // needed for literal arrays + can_literal &= size <= SH_LUT_MAX_LITERAL_HARD && !params->dynamic; + + // Deselect unsupported methods + if (type == SH_LUT_UNIFORM && !can_uniform) + type = SH_LUT_AUTO; + if (type == SH_LUT_LITERAL && !can_literal) + type = SH_LUT_AUTO; + if (type == SH_LUT_TEXTURE && !texfmt) + type = SH_LUT_AUTO; + + // Sorted by priority + if (!type && can_literal && !method && size <= SH_LUT_MAX_LITERAL_SOFT) + type = SH_LUT_LITERAL; + if (!type && texfmt) + type = SH_LUT_TEXTURE; + if (!type && can_uniform) + type = SH_LUT_UNIFORM; + if (!type && can_literal) + type = SH_LUT_LITERAL; + + if (!type) { + PL_ERR(sh, "Can't generate LUT: no compatible methods!"); + goto error; + } + + // Reinitialize the existing LUT if needed + update |= type != lut->type; + update |= method != lut->method; + + if (update) { + if (params->dynamic) + pl_log_level_cap(sh->log, PL_LOG_TRACE); + + size_t el_size = params->comps * pl_var_type_size(vartype); + if (type == SH_LUT_TEXTURE) + el_size = texfmt->texel_size; + + size_t buf_size = size * el_size; + if (pl_cache_get(params->cache, &obj) && obj.size == buf_size) { + PL_DEBUG(sh, "Re-using cached LUT (0x%"PRIx64") with size %zu", + obj.key, obj.size); + } else { + PL_DEBUG(sh, "LUT invalidated, regenerating.."); + pl_cache_obj_resize(NULL, &obj, buf_size); + pl_clock_t start = pl_clock_now(); + params->fill(obj.data, params); + pl_log_cpu_time(sh->log, start, pl_clock_now(), "generating shader LUT"); + } + + pl_assert(obj.data && obj.size); + if (params->dynamic) + pl_log_level_cap(sh->log, PL_LOG_NONE); + + switch 
(type) { + case SH_LUT_TEXTURE: { + if (!texdim) { + PL_ERR(sh, "Texture LUT exceeds texture dimensions!"); + goto error; + } + + if (!texfmt) { + PL_ERR(sh, "Found no compatible texture format for LUT!"); + goto error; + } + + struct pl_tex_params tex_params = { + .w = params->width, + .h = PL_DEF(params->height, texdim >= 2 ? 1 : 0), + .d = PL_DEF(params->depth, texdim >= 3 ? 1 : 0), + .format = texfmt, + .sampleable = true, + .host_writable = params->dynamic, + .initial_data = params->dynamic ? NULL : obj.data, + .debug_tag = params->debug_tag, + }; + + bool ok; + if (params->dynamic) { + ok = pl_tex_recreate(gpu, &lut->tex, &tex_params); + if (ok) { + ok = pl_tex_upload(gpu, pl_tex_transfer_params( + .tex = lut->tex, + .ptr = obj.data, + )); + } + } else { + // Can't use pl_tex_recreate because of `initial_data` + pl_tex_destroy(gpu, &lut->tex); + lut->tex = pl_tex_create(gpu, &tex_params); + ok = lut->tex; + } + + if (!ok) { + PL_ERR(sh, "Failed creating LUT texture!"); + goto error; + } + break; + } + + case SH_LUT_UNIFORM: + pl_free(lut->data); + lut->data = pl_memdup(NULL, obj.data, obj.size); + break; + + case SH_LUT_LITERAL: { + lut->str.len = 0; + static const char prefix[PL_VAR_TYPE_COUNT] = { + [PL_VAR_SINT] = 'i', + [PL_VAR_UINT] = 'u', + [PL_VAR_FLOAT] = ' ', + }; + + for (int i = 0; i < size * params->comps; i += params->comps) { + if (i > 0) + pl_str_append_asprintf_c(lut, &lut->str, ","); + if (params->comps > 1) { + pl_str_append_asprintf_c(lut, &lut->str, "%cvec%d(", + prefix[vartype], params->comps); + } + for (int c = 0; c < params->comps; c++) { + switch (vartype) { + case PL_VAR_FLOAT: + pl_str_append_asprintf_c(lut, &lut->str, "%s%f", + c > 0 ? "," : "", + ((float *) obj.data)[i+c]); + break; + case PL_VAR_UINT: + pl_str_append_asprintf_c(lut, &lut->str, "%s%u", + c > 0 ? "," : "", + ((unsigned int *) obj.data)[i+c]); + break; + case PL_VAR_SINT: + pl_str_append_asprintf_c(lut, &lut->str, "%s%d", + c > 0 ? "," : "", + ((int *) obj.data)[i+c]); + break; + case PL_VAR_INVALID: + case PL_VAR_TYPE_COUNT: + pl_unreachable(); + } + } + if (params->comps > 1) + pl_str_append_asprintf_c(lut, &lut->str, ")"); + } + break; + } + + case SH_LUT_AUTO: + pl_unreachable(); + } + + lut->type = type; + lut->method = method; + lut->vartype = vartype; + lut->fmt = params->fmt; + lut->width = params->width; + lut->height = params->height; + lut->depth = params->depth; + lut->comps = params->comps; + lut->signature = params->signature; + pl_cache_set(params->cache, &obj); + } + + // Done updating, generate the GLSL + ident_t name = sh_fresh(sh, "lut"); + ident_t arr_name = NULL_IDENT; + + static const char * const swizzles[] = {"x", "xy", "xyz", "xyzw"}; + static const char * const vartypes[PL_VAR_TYPE_COUNT][4] = { + [PL_VAR_SINT] = { "int", "ivec2", "ivec3", "ivec4" }, + [PL_VAR_UINT] = { "uint", "uvec2", "uvec3", "uvec4" }, + [PL_VAR_FLOAT] = { "float", "vec2", "vec3", "vec4" }, + }; + + switch (type) { + case SH_LUT_TEXTURE: { + assert(texdim); + ident_t tex = sh_desc(sh, (struct pl_shader_desc) { + .desc = { + .name = "weights", + .type = PL_DESC_SAMPLED_TEX, + }, + .binding = { + .object = lut->tex, + .sample_mode = is_linear ? 
PL_TEX_SAMPLE_LINEAR + : PL_TEX_SAMPLE_NEAREST, + } + }); + + if (is_linear) { + ident_t pos_macros[PL_ARRAY_SIZE(sizes)] = {0}; + for (int i = 0; i < dims; i++) + pos_macros[i] = texel_scale(sh, sizes[i], true); + + GLSLH("#define "$"(pos) (textureLod("$", %s(\\\n", + name, tex, vartypes[PL_VAR_FLOAT][texdim - 1]); + + for (int i = 0; i < texdim; i++) { + char sep = i == 0 ? ' ' : ','; + if (pos_macros[i]) { + if (dims > 1) { + GLSLH(" %c"$"(%s(pos).%c)\\\n", sep, pos_macros[i], + vartypes[PL_VAR_FLOAT][dims - 1], "xyzw"[i]); + } else { + GLSLH(" %c"$"(float(pos))\\\n", sep, pos_macros[i]); + } + } else { + GLSLH(" %c%f\\\n", sep, 0.5); + } + } + GLSLH(" ), 0.0).%s)\n", swizzles[params->comps - 1]); + } else { + GLSLH("#define "$"(pos) (texelFetch("$", %s(pos", + name, tex, vartypes[PL_VAR_SINT][texdim - 1]); + + // Fill up extra components of the index + for (int i = dims; i < texdim; i++) + GLSLH(", 0"); + + GLSLH("), 0).%s)\n", swizzles[params->comps - 1]); + } + break; + } + + case SH_LUT_UNIFORM: + arr_name = sh_var(sh, (struct pl_shader_var) { + .var = { + .name = "weights", + .type = vartype, + .dim_v = params->comps, + .dim_m = 1, + .dim_a = size, + }, + .data = lut->data, + }); + break; + + case SH_LUT_LITERAL: + arr_name = sh_fresh(sh, "weights"); + GLSLH("const %s "$"[%d] = %s[](\n ", + vartypes[vartype][params->comps - 1], arr_name, size, + vartypes[vartype][params->comps - 1]); + sh_append_str(sh, SH_BUF_HEADER, lut->str); + GLSLH(");\n"); + break; + + case SH_LUT_AUTO: + pl_unreachable(); + } + + if (arr_name) { + GLSLH("#define "$"(pos) ("$"[int((pos)%s)\\\n", + name, arr_name, dims > 1 ? "[0]" : ""); + int shift = params->width; + for (int i = 1; i < dims; i++) { + GLSLH(" + %d * int((pos)[%d])\\\n", shift, i); + shift *= sizes[i]; + } + GLSLH(" ])\n"); + + if (is_linear) { + pl_assert(dims == 1); + pl_assert(vartype == PL_VAR_FLOAT); + ident_t arr_lut = name; + name = sh_fresh(sh, "lut_lin"); + GLSLH("%s "$"(float fpos) { \n" + " fpos = clamp(fpos, 0.0, 1.0) * %d.0; \n" + " float fbase = floor(fpos); \n" + " float fceil = ceil(fpos); \n" + " float fcoord = fpos - fbase; \n" + " return mix("$"(fbase), "$"(fceil), fcoord); \n" + "} \n", + vartypes[PL_VAR_FLOAT][params->comps - 1], name, + size - 1, + arr_lut, arr_lut); + } + } + + if (method == SH_LUT_CUBIC && dims == 3) { + ident_t lin_lut = name; + name = sh_fresh(sh, "lut_tricubic"); + GLSLH("%s "$"(vec3 pos) { \n" + " vec3 scale = vec3(%d.0, %d.0, %d.0); \n" + " vec3 scale_inv = 1.0 / scale; \n" + " pos *= scale; \n" + " vec3 fpos = fract(pos); \n" + " vec3 base = pos - fpos; \n" + " vec3 fpos2 = fpos * fpos; \n" + " vec3 inv = 1.0 - fpos; \n" + " vec3 inv2 = inv * inv; \n" + " vec3 w0 = 1.0/6.0 * inv2 * inv; \n" + " vec3 w1 = 2.0/3.0 - 0.5 * fpos2 * (2.0 - fpos); \n" + " vec3 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \n" + " vec3 w3 = 1.0/6.0 * fpos2 * fpos; \n" + " vec3 g0 = w0 + w1; \n" + " vec3 g1 = w2 + w3; \n" + " vec3 h0 = scale_inv * ((w1 / g0) - 1.0 + base); \n" + " vec3 h1 = scale_inv * ((w3 / g1) + 1.0 + base); \n" + " %s c000, c001, c010, c011, c100, c101, c110, c111; \n" + " c000 = "$"(h0); \n" + " c100 = "$"(vec3(h1.x, h0.y, h0.z)); \n" + " c000 = mix(c100, c000, g0.x); \n" + " c010 = "$"(vec3(h0.x, h1.y, h0.z)); \n" + " c110 = "$"(vec3(h1.x, h1.y, h0.z)); \n" + " c010 = mix(c110, c010, g0.x); \n" + " c000 = mix(c010, c000, g0.y); \n" + " c001 = "$"(vec3(h0.x, h0.y, h1.z)); \n" + " c101 = "$"(vec3(h1.x, h0.y, h1.z)); \n" + " c001 = mix(c101, c001, g0.x); \n" + " c011 = "$"(vec3(h0.x, h1.y, h1.z)); \n" + " 
c111 = "$"(h1); \n" + " c011 = mix(c111, c011, g0.x); \n" + " c001 = mix(c011, c001, g0.y); \n" + " return mix(c001, c000, g0.z); \n" + "} \n", + vartypes[PL_VAR_FLOAT][params->comps - 1], name, + sizes[0] - 1, sizes[1] - 1, sizes[2] - 1, + vartypes[PL_VAR_FLOAT][params->comps - 1], + lin_lut, lin_lut, lin_lut, lin_lut, + lin_lut, lin_lut, lin_lut, lin_lut); + } + + if (method == SH_LUT_TETRAHEDRAL) { + ident_t int_lut = name; + name = sh_fresh(sh, "lut_barycentric"); + GLSLH("%s "$"(vec3 pos) { \n" + // Compute bounding vertices and fractional part + " pos = clamp(pos, 0.0, 1.0) * vec3(%d.0, %d.0, %d.0); \n" + " vec3 base = floor(pos); \n" + " vec3 fpart = pos - base; \n" + // v0 and v3 are always 'black' and 'white', respectively + // v1 and v2 are the closest RGB and CMY vertices, respectively + " ivec3 v0 = ivec3(base), v3 = ivec3(ceil(pos)); \n" + " ivec3 v1 = v0, v2 = v3; \n" + // Table of boolean checks to simplify following math + " bvec3 c = greaterThanEqual(fpart.xyz, fpart.yzx); \n" + " bool c_xy = c.x, c_yx = !c.x, \n" + " c_yz = c.y, c_zy = !c.y, \n" + " c_zx = c.z, c_xz = !c.z; \n" + " vec3 s = fpart.xyz; \n" + " bool cond; \n", + vartypes[PL_VAR_FLOAT][params->comps - 1], name, + sizes[0] - 1, sizes[1] - 1, sizes[2] - 1); + + // Subdivision of the cube into six congruent tetrahedras + // + // For each tetrahedron, test if the point is inside, and if so, update + // the edge vertices. We test all six, even though only one case will + // ever be true, because this avoids branches. + static const char *indices[] = { "xyz", "xzy", "zxy", "zyx", "yzx", "yxz"}; + for (int i = 0; i < PL_ARRAY_SIZE(indices); i++) { + const char x = indices[i][0], y = indices[i][1], z = indices[i][2]; + GLSLH("cond = c_%c%c && c_%c%c; \n" + "s = cond ? fpart.%c%c%c : s; \n" + "v1.%c = cond ? v3.%c : v1.%c; \n" + "v2.%c = cond ? v0.%c : v2.%c; \n", + x, y, y, z, + x, y, z, + x, x, x, + z, z, z); + } + + // Interpolate in barycentric coordinates, with four texel fetches + GLSLH(" return (1.0 - s.x) * "$"(v0) + \n" + " (s.x - s.y) * "$"(v1) + \n" + " (s.y - s.z) * "$"(v2) + \n" + " (s.z) * "$"(v3); \n" + "} \n", + int_lut, int_lut, int_lut, int_lut); + } + + lut->error = false; + pl_cache_obj_free(&obj); + pl_assert(name); + return name; + +error: + lut->error = true; + pl_cache_obj_free(&obj); + return NULL_IDENT; +} diff --git a/src/shaders/meson.build b/src/shaders/meson.build new file mode 100644 index 0000000..746747c --- /dev/null +++ b/src/shaders/meson.build @@ -0,0 +1,23 @@ +shader_sources = [ + 'colorspace.c', + 'custom.c', + 'custom_mpv.c', + 'deinterlacing.c', + 'dithering.c', + 'film_grain.c', + 'film_grain_av1.c', + 'film_grain_h274.c', + 'icc.c', + 'lut.c', + 'sampling.c', +] + +foreach s : shader_sources + sources += custom_target(s, + command: glsl_preproc, + depend_files: glsl_deps, + env: python_env, + input: s, + output: s, + ) +endforeach diff --git a/src/shaders/sampling.c b/src/shaders/sampling.c new file mode 100644 index 0000000..fc10f80 --- /dev/null +++ b/src/shaders/sampling.c @@ -0,0 +1,1198 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> +#include "shaders.h" + +#include <libplacebo/colorspace.h> +#include <libplacebo/shaders/sampling.h> + +const struct pl_deband_params pl_deband_default_params = { PL_DEBAND_DEFAULTS }; + +static inline struct pl_tex_params src_params(const struct pl_sample_src *src) +{ + if (src->tex) + return src->tex->params; + + return (struct pl_tex_params) { + .w = src->tex_w, + .h = src->tex_h, + }; +} + +enum filter { + NEAREST = PL_TEX_SAMPLE_NEAREST, + LINEAR = PL_TEX_SAMPLE_LINEAR, + BEST, + FASTEST, +}; + +// Helper function to compute the src/dst sizes and upscaling ratios +static bool setup_src(pl_shader sh, const struct pl_sample_src *src, + ident_t *src_tex, ident_t *pos, ident_t *pt, + float *ratio_x, float *ratio_y, uint8_t *comp_mask, + float *scale, bool resizeable, + enum filter filter) +{ + enum pl_shader_sig sig; + float src_w, src_h; + enum pl_tex_sample_mode sample_mode; + if (src->tex) { + pl_fmt fmt = src->tex->params.format; + bool can_linear = fmt->caps & PL_FMT_CAP_LINEAR; + pl_assert(pl_tex_params_dimension(src->tex->params) == 2); + sig = PL_SHADER_SIG_NONE; + src_w = pl_rect_w(src->rect); + src_h = pl_rect_h(src->rect); + switch (filter) { + case FASTEST: + case NEAREST: + sample_mode = PL_TEX_SAMPLE_NEAREST; + break; + case LINEAR: + if (!can_linear) { + SH_FAIL(sh, "Trying to use a shader that requires linear " + "sampling with a texture whose format (%s) does not " + "support PL_FMT_CAP_LINEAR", fmt->name); + return false; + } + sample_mode = PL_TEX_SAMPLE_LINEAR; + break; + case BEST: + sample_mode = can_linear ? 
PL_TEX_SAMPLE_LINEAR : PL_TEX_SAMPLE_NEAREST; + break; + } + } else { + pl_assert(src->tex_w && src->tex_h); + sig = PL_SHADER_SIG_SAMPLER; + src_w = src->sampled_w; + src_h = src->sampled_h; + if (filter == BEST || filter == FASTEST) { + sample_mode = src->mode; + } else { + sample_mode = (enum pl_tex_sample_mode) filter; + if (sample_mode != src->mode) { + SH_FAIL(sh, "Trying to use a shader that requires a different " + "filter mode than the external sampler."); + return false; + } + } + } + + src_w = PL_DEF(src_w, src_params(src).w); + src_h = PL_DEF(src_h, src_params(src).h); + pl_assert(src_w && src_h); + + int out_w = PL_DEF(src->new_w, roundf(fabs(src_w))); + int out_h = PL_DEF(src->new_h, roundf(fabs(src_h))); + pl_assert(out_w && out_h); + + if (ratio_x) + *ratio_x = out_w / fabs(src_w); + if (ratio_y) + *ratio_y = out_h / fabs(src_h); + if (scale) + *scale = PL_DEF(src->scale, 1.0); + + if (comp_mask) { + uint8_t tex_mask = 0x0Fu; + if (src->tex) { + // Mask containing only the number of components in the texture + tex_mask = (1 << src->tex->params.format->num_components) - 1; + } + + uint8_t src_mask = src->component_mask; + if (!src_mask) + src_mask = (1 << PL_DEF(src->components, 4)) - 1; + + // Only actually sample components that are both requested and + // available in the texture being sampled + *comp_mask = tex_mask & src_mask; + } + + if (resizeable) + out_w = out_h = 0; + if (!sh_require(sh, sig, out_w, out_h)) + return false; + + if (src->tex) { + pl_rect2df rect = { + .x0 = src->rect.x0, + .y0 = src->rect.y0, + .x1 = src->rect.x0 + src_w, + .y1 = src->rect.y0 + src_h, + }; + + *src_tex = sh_bind(sh, src->tex, src->address_mode, sample_mode, + "src_tex", &rect, pos, pt); + } else { + if (pt) { + float sx = 1.0 / src->tex_w, sy = 1.0 / src->tex_h; + if (src->sampler == PL_SAMPLER_RECT) + sx = sy = 1.0; + + *pt = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tex_pt"), + .data = &(float[2]) { sx, sy }, + }); + } + + sh->sampler_type = src->sampler; + + pl_assert(src->format); + switch (src->format) { + case PL_FMT_UNKNOWN: + case PL_FMT_FLOAT: + case PL_FMT_UNORM: + case PL_FMT_SNORM: sh->sampler_prefix = ' '; break; + case PL_FMT_UINT: sh->sampler_prefix = 'u'; break; + case PL_FMT_SINT: sh->sampler_prefix = 's'; break; + case PL_FMT_TYPE_COUNT: + pl_unreachable(); + } + + *src_tex = sh_fresh(sh, "src_tex"); + *pos = sh_fresh(sh, "pos"); + + GLSLH("#define "$" src_tex \n" + "#define "$" pos \n", + *src_tex, *pos); + } + + return true; +} + +void pl_shader_deband(pl_shader sh, const struct pl_sample_src *src, + const struct pl_deband_params *params) +{ + float scale; + ident_t tex, pos, pt; + uint8_t mask; + if (!setup_src(sh, src, &tex, &pos, &pt, NULL, NULL, &mask, &scale, false, LINEAR)) + return; + + params = PL_DEF(params, &pl_deband_default_params); + sh_describe(sh, "debanding"); + GLSL("vec4 color; \n" + "// pl_shader_deband \n" + "{ \n" + "vec2 pos = "$", pt = "$"; \n" + "color = textureLod("$", pos, 0.0);\n", + pos, pt, tex); + + mask &= ~0x8u; // ignore alpha channel + uint8_t num_comps = sh_num_comps(mask); + const char *swiz = sh_swizzle(mask); + pl_assert(num_comps <= 3); + if (!num_comps) { + GLSL("color *= "$"; \n" + "} \n", + SH_FLOAT(scale)); + return; + } + + GLSL("#define GET(X, Y) \\\n" + " (textureLod("$", pos + pt * vec2(X, Y), 0.0).%s) \n" + "#define T %s \n", + tex, swiz, sh_float_type(mask)); + + ident_t prng = sh_prng(sh, true, NULL); + GLSL("T avg, diff, bound; \n" + "T res = color.%s; \n" + "vec2 d; \n", + swiz); + + if 
(params->iterations > 0) { + ident_t radius = sh_const_float(sh, "radius", params->radius); + ident_t threshold = sh_const_float(sh, "threshold", + params->threshold / (1000 * scale)); + + // For each iteration, compute the average at a given distance and + // pick it instead of the color if the difference is below the threshold. + for (int i = 1; i <= params->iterations; i++) { + GLSL(// Compute a random angle and distance + "d = "$".xy * vec2(%d.0 * "$", %f); \n" + "d = d.x * vec2(cos(d.y), sin(d.y)); \n" + // Sample at quarter-turn intervals around the source pixel + "avg = T(0.0); \n" + "avg += GET(+d.x, +d.y); \n" + "avg += GET(-d.x, +d.y); \n" + "avg += GET(-d.x, -d.y); \n" + "avg += GET(+d.x, -d.y); \n" + "avg *= 0.25; \n" + // Compare the (normalized) average against the pixel + "diff = abs(res - avg); \n" + "bound = T("$" / %d.0); \n", + prng, i, radius, M_PI * 2, + threshold, i); + + if (num_comps > 1) { + GLSL("res = mix(avg, res, greaterThan(diff, bound)); \n"); + } else { + GLSL("res = mix(avg, res, diff > bound); \n"); + } + } + } + + // Add some random noise to smooth out residual differences + if (params->grain > 0) { + // Avoid adding grain near true black + GLSL("bound = T(\n"); + for (int c = 0; c < num_comps; c++) { + GLSL("%c"$, c > 0 ? ',' : ' ', + SH_FLOAT(params->grain_neutral[c] / scale)); + } + GLSL("); \n" + "T strength = min(abs(res - bound), "$"); \n" + "res += strength * (T("$") - T(0.5)); \n", + SH_FLOAT(params->grain / (1000.0 * scale)), prng); + } + + GLSL("color.%s = res; \n" + "color *= "$"; \n" + "#undef T \n" + "#undef GET \n" + "} \n", + swiz, SH_FLOAT(scale)); +} + +bool pl_shader_sample_direct(pl_shader sh, const struct pl_sample_src *src) +{ + float scale; + ident_t tex, pos; + if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, BEST)) + return false; + + GLSL("// pl_shader_sample_direct \n" + "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n", + SH_FLOAT(scale), tex, pos); + return true; +} + +bool pl_shader_sample_nearest(pl_shader sh, const struct pl_sample_src *src) +{ + float scale; + ident_t tex, pos; + if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, NEAREST)) + return false; + + sh_describe(sh, "nearest"); + GLSL("// pl_shader_sample_nearest \n" + "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n", + SH_FLOAT(scale), tex, pos); + return true; +} + +bool pl_shader_sample_bilinear(pl_shader sh, const struct pl_sample_src *src) +{ + float scale; + ident_t tex, pos; + if (!setup_src(sh, src, &tex, &pos, NULL, NULL, NULL, NULL, &scale, true, LINEAR)) + return false; + + sh_describe(sh, "bilinear"); + GLSL("// pl_shader_sample_bilinear \n" + "vec4 color = vec4("$") * textureLod("$", "$", 0.0); \n", + SH_FLOAT(scale), tex, pos); + return true; +} + +bool pl_shader_sample_bicubic(pl_shader sh, const struct pl_sample_src *src) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + if (rx < 1 || ry < 1) { + PL_TRACE(sh, "Using fast bicubic sampling when downscaling. 
This " + "will most likely result in nasty aliasing!"); + } + + // Explanation of how bicubic scaling with only 4 texel fetches is done: + // http://www.mate.tue.nl/mate/pdfs/10318.pdf + // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines' + + sh_describe(sh, "bicubic"); +#pragma GLSL /* pl_shader_sample_bicubic */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 frac = fract(pos * size + vec2(0.5)); \ + vec2 frac2 = frac * frac; \ + vec2 inv = vec2(1.0) - frac; \ + vec2 inv2 = inv * inv; \ + /* compute filter weights directly */ \ + vec2 w0 = 1.0/6.0 * inv2 * inv; \ + vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \ + vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \ + vec2 w3 = 1.0/6.0 * frac2 * frac; \ + vec4 g = vec4(w0 + w1, w2 + w3); \ + vec4 h = vec4(w1, w3) / g + inv.xyxy; \ + h.xy -= vec2(2.0); \ + /* sample four corners, then interpolate */ \ + vec4 p = pos.xyxy + $pt.xyxy * h; \ + vec4 c00 = textureLod($tex, p.xy, 0.0); \ + vec4 c01 = textureLod($tex, p.xw, 0.0); \ + vec4 c0 = mix(c01, c00, g.y); \ + vec4 c10 = textureLod($tex, p.zy, 0.0); \ + vec4 c11 = textureLod($tex, p.zw, 0.0); \ + vec4 c1 = mix(c11, c10, g.y); \ + color = ${float:scale} * mix(c1, c0, g.x); \ + } + + return true; +} + +bool pl_shader_sample_hermite(pl_shader sh, const struct pl_sample_src *src) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + if (rx < 1 || ry < 1) { + PL_TRACE(sh, "Using fast hermite sampling when downscaling. This " + "will most likely result in nasty aliasing!"); + } + + sh_describe(sh, "hermite"); +#pragma GLSL /* pl_shader_sample_hermite */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 frac = fract(pos * size + vec2(0.5)); \ + pos += $pt * (smoothstep(0.0, 1.0, frac) - frac); \ + color = ${float:scale} * textureLod($tex, pos, 0.0); \ + } + + return true; +} + +bool pl_shader_sample_gaussian(pl_shader sh, const struct pl_sample_src *src) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + if (rx < 1 || ry < 1) { + PL_TRACE(sh, "Using fast gaussian sampling when downscaling. 
This " + "will most likely result in nasty aliasing!"); + } + + sh_describe(sh, "gaussian"); +#pragma GLSL /* pl_shader_sample_gaussian */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 off = -fract(pos * size + vec2(0.5)); \ + vec2 off2 = -2.0 * off * off; \ + /* compute gaussian weights */ \ + vec2 w0 = exp(off2 + 4.0 * off - vec2(2.0)); \ + vec2 w1 = exp(off2); \ + vec2 w2 = exp(off2 - 4.0 * off - vec2(2.0)); \ + vec2 w3 = exp(off2 - 8.0 * off - vec2(8.0)); \ + vec4 g = vec4(w0 + w1, w2 + w3); \ + vec4 h = vec4(w1, w3) / g; \ + h.xy -= vec2(1.0); \ + h.zw += vec2(1.0); \ + g.xy /= g.xy + g.zw; /* explicitly normalize */ \ + /* sample four corners, then interpolate */ \ + vec4 p = pos.xyxy + $pt.xyxy * (h + off.xyxy); \ + vec4 c00 = textureLod($tex, p.xy, 0.0); \ + vec4 c01 = textureLod($tex, p.xw, 0.0); \ + vec4 c0 = mix(c01, c00, g.y); \ + vec4 c10 = textureLod($tex, p.zy, 0.0); \ + vec4 c11 = textureLod($tex, p.zw, 0.0); \ + vec4 c1 = mix(c11, c10, g.y); \ + color = ${float:scale} * mix(c1, c0, g.x); \ + } + + return true; +} + +bool pl_shader_sample_oversample(pl_shader sh, const struct pl_sample_src *src, + float threshold) +{ + ident_t tex, pos, pt; + float rx, ry, scale; + if (!setup_src(sh, src, &tex, &pos, &pt, &rx, &ry, NULL, &scale, true, LINEAR)) + return false; + + threshold = PL_CLAMP(threshold, 0.0f, 0.5f); + sh_describe(sh, "oversample"); + #pragma GLSL /* pl_shader_sample_oversample */ \ + vec4 color; \ + { \ + vec2 pos = $pos; \ + vec2 size = vec2(textureSize($tex, 0)); \ + /* Round the position to the nearest pixel */ \ + vec2 fcoord = fract(pos * size - vec2(0.5)); \ + float rx = ${dynamic float:rx}; \ + float ry = ${dynamic float:ry}; \ + vec2 coeff = (fcoord - vec2(0.5)) * vec2(rx, ry); \ + coeff = clamp(coeff + vec2(0.5), 0.0, 1.0); \ + @if (threshold > 0) { \ + float thresh = ${float:threshold}; \ + coeff = mix(coeff, vec2(0.0), \ + lessThan(coeff, vec2(thresh))); \ + coeff = mix(coeff, vec2(1.0), \ + greaterThan(coeff, vec2(1.0 - thresh))); \ + @} \ + \ + /* Compute the right output blend of colors */ \ + pos += (coeff - fcoord) * $pt; \ + color = ${float:scale} * textureLod($tex, pos, 0.0); \ + } + + return true; +} + +static void describe_filter(pl_shader sh, const struct pl_filter_config *cfg, + const char *stage, float rx, float ry) +{ + const char *dir; + if (rx > 1 && ry > 1) { + dir = "up"; + } else if (rx < 1 && ry < 1) { + dir = "down"; + } else if (rx == 1 && ry == 1) { + dir = "noop"; + } else { + dir = "ana"; + } + + if (cfg->name) { + sh_describef(sh, "%s %sscaling (%s)", stage, dir, cfg->name); + } else if (cfg->window) { + sh_describef(sh, "%s %sscaling (%s+%s)", stage, dir, + PL_DEF(cfg->kernel->name, "unknown"), + PL_DEF(cfg->window->name, "unknown")); + } else { + sh_describef(sh, "%s %sscaling (%s)", stage, dir, + PL_DEF(cfg->kernel->name, "unknown")); + } +} + +// Subroutine for computing and adding an individual texel contribution +// If `in` is NULL, samples directly +// If `in` is set, takes the pixel from inX[idx] where X is the component, +// `in` is the given identifier, and `idx` must be defined by the caller +static void polar_sample(pl_shader sh, pl_filter filter, + ident_t tex, ident_t lut, ident_t radius, + int x, int y, uint8_t comp_mask, ident_t in, + bool use_ar, ident_t scale) +{ + // Since we can't know the subpixel position in advance, assume a + // worst case scenario + int yy = y > 0 ? y-1 : y; + int xx = x > 0 ? 
x-1 : x; + float dmin = sqrt(xx*xx + yy*yy); + // Skip samples definitely outside the radius + if (dmin >= filter->radius) + return; + + // Check for samples that might be skippable + bool maybe_skippable = dmin >= filter->radius - M_SQRT2; + + // Check for samples that definitely won't contribute to anti-ringing + const float ar_radius = filter->radius_zero; + use_ar &= dmin < ar_radius; + +#pragma GLSL \ + offset = ivec2(${const int: x}, ${const int: y}); \ + d = length(vec2(offset) - fcoord); \ + @if (maybe_skippable) \ + if (d < $radius) { \ + w = $lut(d * 1.0 / $radius); \ + wsum += w; \ + @if (in != NULL_IDENT) { \ + @for (c : comp_mask) \ + c[@c] = ${in}_@c[idx]; \ + @} else { \ + c = textureLod($tex, base + pt * vec2(offset), 0.0); \ + @} \ + @for (c : comp_mask) \ + color[@c] += w * c[@c]; \ + @if (use_ar) { \ + if (d <= ${const float: ar_radius}) { \ + @for (c : comp_mask) { \ + cc = vec2($scale * c[@c]); \ + cc.x = 1.0 - cc.x; \ + ww = cc + vec2(0.10); \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = ww * ww; \ + ww = w * ww; \ + ar@c += ww * cc; \ + wwsum@c += ww; \ + @} \ + } \ + @} \ + @if (maybe_skippable) \ + } +} + +struct sh_sampler_obj { + pl_filter filter; + pl_shader_obj lut; + pl_shader_obj pass2; // for pl_shader_sample_ortho +}; + +#define SCALER_LUT_SIZE 256 +#define SCALER_LUT_CUTOFF 1e-3f + +static void sh_sampler_uninit(pl_gpu gpu, void *ptr) +{ + struct sh_sampler_obj *obj = ptr; + pl_shader_obj_destroy(&obj->lut); + pl_shader_obj_destroy(&obj->pass2); + pl_filter_free(&obj->filter); + *obj = (struct sh_sampler_obj) {0}; +} + +static void fill_polar_lut(void *data, const struct sh_lut_params *params) +{ + const struct sh_sampler_obj *obj = params->priv; + pl_filter filt = obj->filter; + + pl_assert(params->width == filt->params.lut_entries && params->comps == 1); + memcpy(data, filt->weights, params->width * sizeof(float)); +} + +bool pl_shader_sample_polar(pl_shader sh, const struct pl_sample_src *src, + const struct pl_sample_filter_params *params) +{ + pl_assert(params); + if (!params->filter.polar) { + SH_FAIL(sh, "Trying to use polar sampling with a non-polar filter?"); + return false; + } + + uint8_t cmask; + float rx, ry, scalef; + ident_t src_tex, pos, pt, scale; + if (!setup_src(sh, src, &src_tex, &pos, &pt, &rx, &ry, &cmask, &scalef, false, FASTEST)) + return false; + + struct sh_sampler_obj *obj; + obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, struct sh_sampler_obj, + sh_sampler_uninit); + if (!obj) + return false; + + float inv_scale = 1.0 / PL_MIN(rx, ry); + inv_scale = PL_MAX(inv_scale, 1.0); + if (params->no_widening) + inv_scale = 1.0; + scale = sh_const_float(sh, "scale", scalef); + + struct pl_filter_config cfg = params->filter; + cfg.antiring = PL_DEF(cfg.antiring, params->antiring); + cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale; + bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg); + if (update) { + pl_filter_free(&obj->filter); + obj->filter = pl_filter_generate(sh->log, pl_filter_params( + .config = cfg, + .lut_entries = SCALER_LUT_SIZE, + .cutoff = SCALER_LUT_CUTOFF, + )); + + if (!obj->filter) { + // This should never happen, but just in case .. 
+ SH_FAIL(sh, "Failed initializing polar filter!"); + return false; + } + } + + describe_filter(sh, &cfg, "polar", rx, ry); + GLSL("// pl_shader_sample_polar \n" + "vec4 color = vec4(0.0); \n" + "{ \n" + "vec2 pos = "$", pt = "$"; \n" + "vec2 size = vec2(textureSize("$", 0)); \n" + "vec2 fcoord = fract(pos * size - vec2(0.5)); \n" + "vec2 base = pos - pt * fcoord; \n" + "vec2 center = base + pt * vec2(0.5); \n" + "ivec2 offset; \n" + "float w, d, wsum = 0.0; \n" + "int idx; \n" + "vec4 c; \n", + pos, pt, src_tex); + + bool use_ar = cfg.antiring > 0; + if (use_ar) { +#pragma GLSL \ + vec2 ww, cc; \ + @for (c : cmask) \ + vec2 ar@c = vec2(0.0), wwsum@c = vec2(0.0); + } + + int bound = ceil(obj->filter->radius); + int offset = bound - 1; // padding top/left + int padding = offset + bound; // total padding + + // Determined experimentally on modern AMD and Nvidia hardware. 32 is a + // good tradeoff for the horizontal work group size. Apart from that, + // just use as many threads as possible. + const int bw = 32, bh = sh_glsl(sh).max_group_threads / bw; + + // We need to sample everything from base_min to base_max, so make sure we + // have enough room in shmem. The extra margin on the ceilf guards against + // floating point inaccuracy on near-integer scaling ratios. + const float margin = 1e-5; + int iw = (int) ceilf(bw / rx - margin) + padding + 1, + ih = (int) ceilf(bh / ry - margin) + padding + 1; + int sizew = iw, sizeh = ih; + + pl_gpu gpu = SH_GPU(sh); + bool dynamic_size = SH_PARAMS(sh).dynamic_constants || + !gpu || !gpu->limits.array_size_constants; + if (dynamic_size) { + // Overallocate the array slightly to reduce recompilation overhead + sizew = PL_ALIGN2(sizew, 8); + sizeh = PL_ALIGN2(sizeh, 8); + } + + int num_comps = __builtin_popcount(cmask); + int shmem_req = (sizew * sizeh * num_comps + 2) * sizeof(float); + bool is_compute = !params->no_compute && sh_glsl(sh).compute && + sh_try_compute(sh, bw, bh, false, shmem_req); + + // Note: SH_LUT_LITERAL might be faster in some specific cases, but not by + // much, and it's catastrophically slow on other platforms. 
+ ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->lut, + .lut_type = SH_LUT_TEXTURE, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_LINEAR, + .width = SCALER_LUT_SIZE, + .comps = 1, + .update = update, + .fill = fill_polar_lut, + .priv = obj, + )); + + if (!lut) { + SH_FAIL(sh, "Failed initializing polar LUT!"); + return false; + } + + ident_t radius_c = sh_const_float(sh, "radius", obj->filter->radius); + ident_t in = sh_fresh(sh, "in"); + + if (is_compute) { + + // Compute shader kernel + GLSL("uvec2 base_id = uvec2(0u); \n"); + if (src->rect.x0 > src->rect.x1) + GLSL("base_id.x = gl_WorkGroupSize.x - 1u; \n"); + if (src->rect.y0 > src->rect.y1) + GLSL("base_id.y = gl_WorkGroupSize.y - 1u; \n"); + + GLSLH("shared vec2 "$"_base; \n", in); + GLSL("if (gl_LocalInvocationID.xy == base_id) \n" + " "$"_base = base; \n" + "barrier(); \n" + "ivec2 rel = ivec2(round((base - "$"_base) * size)); \n", + in, in); + + ident_t sizew_c = sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_SINT, + .compile_time = true, + .name = "sizew", + .data = &sizew, + }); + + ident_t sizeh_c = sh_const(sh, (struct pl_shader_const) { + .type = PL_VAR_SINT, + .compile_time = true, + .name = "sizeh", + .data = &sizeh, + }); + + ident_t iw_c = sizew_c, ih_c = sizeh_c; + if (dynamic_size) { + iw_c = sh_const_int(sh, "iw", iw); + ih_c = sh_const_int(sh, "ih", ih); + } + + // Load all relevant texels into shmem + GLSL("for (int y = int(gl_LocalInvocationID.y); y < "$"; y += %d) { \n" + "for (int x = int(gl_LocalInvocationID.x); x < "$"; x += %d) { \n" + "c = textureLod("$", "$"_base + pt * vec2(x - %d, y - %d), 0.0); \n", + ih_c, bh, iw_c, bw, src_tex, in, offset, offset); + + for (uint8_t comps = cmask; comps;) { + uint8_t c = __builtin_ctz(comps); + GLSLH("shared float "$"_%d["$" * "$"]; \n", in, c, sizeh_c, sizew_c); + GLSL(""$"_%d["$" * y + x] = c[%d]; \n", in, c, sizew_c, c); + comps &= ~(1 << c); + } + + GLSL("}} \n" + "barrier(); \n"); + + // Dispatch the actual samples + for (int y = 1 - bound; y <= bound; y++) { + for (int x = 1 - bound; x <= bound; x++) { + GLSL("idx = "$" * rel.y + rel.x + "$" * %d + %d; \n", + sizew_c, sizew_c, y + offset, x + offset); + polar_sample(sh, obj->filter, src_tex, lut, radius_c, + x, y, cmask, in, use_ar, scale); + } + } + } else { + // Fragment shader sampling + for (uint8_t comps = cmask; comps;) { + uint8_t c = __builtin_ctz(comps); + GLSL("vec4 "$"_%d; \n", in, c); + comps &= ~(1 << c); + } + + // For maximum efficiency, we want to use textureGather() if + // possible, rather than direct sampling. Since this is not + // always possible/sensible, we need to possibly intermix gathering + // with regular sampling. This requires keeping track of which + // pixels in the next row were already gathered by the previous + // row. + uint32_t gathered_cur = 0x0, gathered_next = 0x0; + const float radius2 = PL_SQUARE(obj->filter->radius); + const int base = bound - 1; + + if (base + bound >= 8 * sizeof(gathered_cur)) { + SH_FAIL(sh, "Polar radius %f exceeds implementation capacity!", + obj->filter->radius); + return false; + } + + for (int y = 1 - bound; y <= bound; y++) { + for (int x = 1 - bound; x <= bound; x++) { + // Skip already gathered texels + uint32_t bit = 1llu << (base + x); + if (gathered_cur & bit) + continue; + + // Using texture gathering is only more efficient than direct + // sampling in the case where we expect to be able to use all + // four gathered texels, without having to discard any. 
So + // only do it if we suspect it will be a win rather than a + // loss. + int xx = x*x, xx1 = (x+1)*(x+1); + int yy = y*y, yy1 = (y+1)*(y+1); + bool use_gather = PL_MAX(xx, xx1) + PL_MAX(yy, yy1) < radius2; + use_gather &= PL_MAX(x, y) <= sh_glsl(sh).max_gather_offset; + use_gather &= PL_MIN(x, y) >= sh_glsl(sh).min_gather_offset; + use_gather &= !src->tex || src->tex->params.format->gatherable; + + // Gathering from components other than the R channel requires + // support for GLSL 400, which introduces the overload of + // textureGather* that allows specifying the component. + // + // This is also the minimum requirement if we don't know the + // texture format capabilities, for the sampler2D interface + if (cmask != 0x1 || !src->tex) + use_gather &= sh_glsl(sh).version >= 400; + + if (!use_gather) { + // Switch to direct sampling instead + polar_sample(sh, obj->filter, src_tex, lut, radius_c, + x, y, cmask, NULL_IDENT, use_ar, scale); + continue; + } + + // Gather the four surrounding texels simultaneously + for (uint8_t comps = cmask; comps;) { + uint8_t c = __builtin_ctz(comps); + if (x || y) { + if (c) { + GLSL($"_%d = textureGatherOffset("$", " + "center, ivec2(%d, %d), %d); \n", + in, c, src_tex, x, y, c); + } else { + GLSL($"_0 = textureGatherOffset("$", " + "center, ivec2(%d, %d)); \n", + in, src_tex, x, y); + } + } else { + if (c) { + GLSL($"_%d = textureGather("$", center, %d); \n", + in, c, src_tex, c); + } else { + GLSL($"_0 = textureGather("$", center); \n", + in, src_tex); + } + } + comps &= ~(1 << c); + } + + // Mix in all of the points with their weights + for (int p = 0; p < 4; p++) { + // The four texels are gathered counterclockwise starting + // from the bottom left + static const int xo[4] = {0, 1, 1, 0}; + static const int yo[4] = {1, 1, 0, 0}; + if (x+xo[p] > bound || y+yo[p] > bound) + continue; // next subpixel + + GLSL("idx = %d;\n", p); + polar_sample(sh, obj->filter, src_tex, lut, radius_c, + x+xo[p], y+yo[p], cmask, in, use_ar, scale); + } + + // Mark the other next row's pixels as already gathered + gathered_next |= bit | (bit << 1); + x++; // skip adjacent pixel + } + + // Prepare for new row + gathered_cur = gathered_next; + gathered_next = 0; + } + } + +#pragma GLSL \ + color = $scale / wsum * color; \ + @if (use_ar) { \ + @for (c : cmask) { \ + ww = ar@c / wwsum@c; \ + ww.x = 1.0 - ww.x; \ + w = clamp(color[@c], ww.x, ww.y); \ + w = mix(w, dot(ww, vec2(0.5)), ww.x > ww.y); \ + color[@c] = mix(color[@c], w, ${float:cfg.antiring}); \ + @} \ + @} \ + @if (!(cmask & (1 << PL_CHANNEL_A))) \ + color.a = 1.0; \ + } + + return true; +} + +static void fill_ortho_lut(void *data, const struct sh_lut_params *params) +{ + const struct sh_sampler_obj *obj = params->priv; + pl_filter filt = obj->filter; + + if (filt->radius == filt->radius_zero) { + // Main lobe covers entire radius, so all weights are positive, meaning + // we can use the linear resampling trick + for (int n = 0; n < SCALER_LUT_SIZE; n++) { + const float *weights = filt->weights + n * filt->row_stride; + float *row = (float *) data + n * filt->row_stride; + pl_assert(filt->row_size % 2 == 0); + for (int i = 0; i < filt->row_size; i += 2) { + const float w0 = weights[i], w1 = weights[i+1]; + assert(w0 + w1 >= 0.0f); + row[i] = w0 + w1; + row[i+1] = w1 / (w0 + w1); + } + } + } else { + size_t entries = SCALER_LUT_SIZE * filt->row_stride; + pl_assert(params->width * params->height * params->comps == entries); + memcpy(data, filt->weights, entries * sizeof(float)); + } +} + +enum { + SEP_VERT = 0, + 
SEP_HORIZ, + SEP_PASSES +}; + +bool pl_shader_sample_ortho2(pl_shader sh, const struct pl_sample_src *src, + const struct pl_sample_filter_params *params) +{ + pl_assert(params); + if (params->filter.polar) { + SH_FAIL(sh, "Trying to use separated sampling with a polar filter?"); + return false; + } + + pl_gpu gpu = SH_GPU(sh); + pl_assert(gpu); + + uint8_t comps; + float ratio[SEP_PASSES], scale; + ident_t src_tex, pos, pt; + if (!setup_src(sh, src, &src_tex, &pos, &pt, + &ratio[SEP_HORIZ], &ratio[SEP_VERT], + &comps, &scale, false, LINEAR)) + return false; + + + int pass; + if (fabs(ratio[SEP_HORIZ] - 1.0f) < 1e-6f) { + pass = SEP_VERT; + } else if (fabs(ratio[SEP_VERT] - 1.0f) < 1e-6f) { + pass = SEP_HORIZ; + } else { + SH_FAIL(sh, "Trying to use pl_shader_sample_ortho with a " + "pl_sample_src that requires scaling in multiple directions " + "(rx=%f, ry=%f), this is not possible!", + ratio[SEP_HORIZ], ratio[SEP_VERT]); + return false; + } + + // We can store a separate sampler object per dimension, so dispatch the + // right one. This is needed for two reasons: + // 1. Anamorphic content can have a different scaling ratio for each + // dimension. In particular, you could be upscaling in one and + // downscaling in the other. + // 2. After fixing the source for `setup_src`, we lose information about + // the scaling ratio of the other component. (Although this is only a + // minor reason and could easily be changed with some boilerplate) + struct sh_sampler_obj *obj; + obj = SH_OBJ(sh, params->lut, PL_SHADER_OBJ_SAMPLER, + struct sh_sampler_obj, sh_sampler_uninit); + if (!obj) + return false; + + if (pass != 0) { + obj = SH_OBJ(sh, &obj->pass2, PL_SHADER_OBJ_SAMPLER, + struct sh_sampler_obj, sh_sampler_uninit); + assert(obj); + } + + float inv_scale = 1.0 / ratio[pass]; + inv_scale = PL_MAX(inv_scale, 1.0); + if (params->no_widening) + inv_scale = 1.0; + + struct pl_filter_config cfg = params->filter; + cfg.antiring = PL_DEF(cfg.antiring, params->antiring); + cfg.blur = PL_DEF(cfg.blur, 1.0f) * inv_scale; + bool update = !obj->filter || !pl_filter_config_eq(&obj->filter->params.config, &cfg); + + if (update) { + pl_filter_free(&obj->filter); + obj->filter = pl_filter_generate(sh->log, pl_filter_params( + .config = cfg, + .lut_entries = SCALER_LUT_SIZE, + .max_row_size = gpu->limits.max_tex_2d_dim / 4, + .row_stride_align = 4, + )); + + if (!obj->filter) { + // This should never happen, but just in case .. 
+ SH_FAIL(sh, "Failed initializing separated filter!"); + return false; + } + } + + int N = obj->filter->row_size; // number of samples to convolve + int width = obj->filter->row_stride / 4; // width of the LUT texture + ident_t lut = sh_lut(sh, sh_lut_params( + .object = &obj->lut, + .var_type = PL_VAR_FLOAT, + .method = SH_LUT_LINEAR, + .width = width, + .height = SCALER_LUT_SIZE, + .comps = 4, + .update = update, + .fill = fill_ortho_lut, + .priv = obj, + )); + if (!lut) { + SH_FAIL(sh, "Failed initializing separated LUT!"); + return false; + } + + const int dir[SEP_PASSES][2] = { + [SEP_HORIZ] = {1, 0}, + [SEP_VERT] = {0, 1}, + }; + + static const char *names[SEP_PASSES] = { + [SEP_HORIZ] = "ortho (horiz)", + [SEP_VERT] = "ortho (vert)", + }; + + describe_filter(sh, &cfg, names[pass], ratio[pass], ratio[pass]); + + float denom = PL_MAX(1, width - 1); // avoid division by zero + bool use_ar = cfg.antiring > 0 && ratio[pass] > 1.0; + bool use_linear = obj->filter->radius == obj->filter->radius_zero; + use_ar &= !use_linear; // filter has no negative weights + +#pragma GLSL /* pl_shader_sample_ortho */ \ + vec4 color = vec4(0.0, 0.0, 0.0, 1.0); \ + { \ + vec2 pos = $pos, pt = $pt; \ + vec2 size = vec2(textureSize($src_tex, 0)); \ + vec2 dir = vec2(${const float:dir[pass][0]}, ${const float: dir[pass][1]}); \ + pt *= dir; \ + vec2 fcoord2 = fract(pos * size - vec2(0.5)); \ + float fcoord = dot(fcoord2, dir); \ + vec2 base = pos - fcoord * pt - pt * vec2(${const float: N / 2 - 1}); \ + vec4 ws; \ + float off; \ + ${vecType: comps} c, ca = ${vecType: comps}(0.0); \ + @if (use_ar) { \ + ${vecType: comps} hi = ${vecType: comps}(0.0); \ + ${vecType: comps} lo = ${vecType: comps}(1e9); \ + @} \ + @for (n < N) { \ + @if @(n % 4 == 0) \ + ws = $lut(vec2(float(@n / 4) / ${const float: denom}, fcoord)); \ + @if @(vars.use_ar && (n == vars.n / 2 - 1 || n == vars.n / 2)) { \ + c = textureLod($src_tex, base + pt * @n.0, 0.0).${swizzle: comps}; \ + ca += ws[@n % 4] * c; \ + lo = min(lo, c); \ + hi = max(hi, c); \ + @} else { \ + @if (use_linear) { \ + @if @(n % 2 == 0) { \ + off = @n.0 + ws[@n % 4 + 1]; \ + ca += ws[@n % 4] * textureLod($src_tex, base + pt * off, \ + 0.0).${swizzle: comps}; \ + @} \ + @} else { \ + ca += ws[@n % 4] * textureLod($src_tex, base + pt * @n.0, \ + 0.0).${swizzle: comps}; \ + @} \ + @} \ + @} \ + @if (use_ar) \ + ca = mix(ca, clamp(ca, lo, hi), ${float: cfg.antiring}); \ + color.${swizzle: comps} = ${float: scale} * ca; \ + } + + return true; +} + +const struct pl_distort_params pl_distort_default_params = { PL_DISTORT_DEFAULTS }; + +void pl_shader_distort(pl_shader sh, pl_tex src_tex, int out_w, int out_h, + const struct pl_distort_params *params) +{ + pl_assert(params); + if (!sh_require(sh, PL_SHADER_SIG_NONE, out_w, out_h)) + return; + + const int src_w = src_tex->params.w, src_h = src_tex->params.h; + float rx = 1.0f, ry = 1.0f; + if (src_w > src_h) { + ry = (float) src_h / src_w; + } else { + rx = (float) src_w / src_h; + } + + // Map from texel coordinates [0,1]² to aspect-normalized representation + const pl_transform2x2 tex2norm = { + .mat.m = { + { 2 * rx, 0 }, + { 0, -2 * ry }, + }, + .c = { -rx, ry }, + }; + + // Map from aspect-normalized representation to canvas coords [-1,1]² + const float sx = params->unscaled ? (float) src_w / out_w : 1.0f; + const float sy = params->unscaled ? 
(float) src_h / out_h : 1.0f; + const pl_transform2x2 norm2canvas = { + .mat.m = { + { sx / rx, 0 }, + { 0, sy / ry }, + }, + }; + + struct pl_transform2x2 transform = params->transform; + pl_transform2x2_mul(&transform, &tex2norm); + pl_transform2x2_rmul(&norm2canvas, &transform); + + if (params->constrain) { + pl_rect2df bb = pl_transform2x2_bounds(&transform, &(pl_rect2df) { + .x1 = 1, .y1 = 1, + }); + const float k = fmaxf(fmaxf(pl_rect_w(bb), pl_rect_h(bb)), 2.0f); + pl_transform2x2_scale(&transform, 2.0f / k); + }; + + // Bind the canvas coordinates as [-1,1]², flipped vertically to correspond + // to normal mathematical axis conventions + static const pl_rect2df canvas = { + .x0 = -1.0f, .x1 = 1.0f, + .y0 = 1.0f, .y1 = -1.0f, + }; + + ident_t pos = sh_attr_vec2(sh, "pos", &canvas); + ident_t pt, tex = sh_bind(sh, src_tex, params->address_mode, + PL_TEX_SAMPLE_LINEAR, "tex", NULL, NULL, &pt); + + // Bind the inverse of the tex2canvas transform (i.e. canvas2tex) + pl_transform2x2_invert(&transform); + ident_t tf = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_mat2("tf"), + .data = PL_TRANSPOSE_2X2(transform.mat.m), + }); + + ident_t tf_c = sh_var(sh, (struct pl_shader_var) { + .var = pl_var_vec2("tf_c"), + .data = transform.c, + }); + + // See pl_shader_sample_bicubic + sh_describe(sh, "distortion"); +#pragma GLSL /* pl_shader_sample_distort */ \ + vec4 color; \ + { \ + vec2 pos = $tf * $pos + $tf_c; \ + vec2 pt = $pt; \ + @if (params->bicubic) { \ + vec2 size = vec2(textureSize($tex, 0)); \ + vec2 frac = fract(pos * size + vec2(0.5)); \ + vec2 frac2 = frac * frac; \ + vec2 inv = vec2(1.0) - frac; \ + vec2 inv2 = inv * inv; \ + vec2 w0 = 1.0/6.0 * inv2 * inv; \ + vec2 w1 = 2.0/3.0 - 0.5 * frac2 * (2.0 - frac); \ + vec2 w2 = 2.0/3.0 - 0.5 * inv2 * (2.0 - inv); \ + vec2 w3 = 1.0/6.0 * frac2 * frac; \ + vec4 g = vec4(w0 + w1, w2 + w3); \ + vec4 h = vec4(w1, w3) / g + inv.xyxy; \ + h.xy -= vec2(2.0); \ + vec4 p = pos.xyxy + pt.xyxy * h; \ + vec4 c00 = textureLod($tex, p.xy, 0.0); \ + vec4 c01 = textureLod($tex, p.xw, 0.0); \ + vec4 c0 = mix(c01, c00, g.y); \ + vec4 c10 = textureLod($tex, p.zy, 0.0); \ + vec4 c11 = textureLod($tex, p.zw, 0.0); \ + vec4 c1 = mix(c11, c10, g.y); \ + color = mix(c1, c0, g.x); \ + @} else { \ + color = texture($tex, pos); \ + @} \ + @if (params->alpha_mode) { \ + vec2 border = min(pos, vec2(1.0) - pos); \ + border = smoothstep(vec2(0.0), pt, border); \ + @if (params->alpha_mode == PL_ALPHA_PREMULTIPLIED) \ + color.rgba *= border.x * border.y; \ + @else \ + color.a *= border.x * border.y; \ + @} \ + } + +} diff --git a/src/swapchain.c b/src/swapchain.c new file mode 100644 index 0000000..2b9ed90 --- /dev/null +++ b/src/swapchain.c @@ -0,0 +1,92 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */
+
+#include "common.h"
+#include "log.h"
+#include "swapchain.h"
+
+void pl_swapchain_destroy(pl_swapchain *ptr)
+{
+    pl_swapchain sw = *ptr;
+    if (!sw)
+        return;
+
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    impl->destroy(sw);
+    *ptr = NULL;
+}
+
+int pl_swapchain_latency(pl_swapchain sw)
+{
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    if (!impl->latency)
+        return 0;
+
+    return impl->latency(sw);
+}
+
+bool pl_swapchain_resize(pl_swapchain sw, int *width, int *height)
+{
+    int dummy[2] = {0};
+    width = PL_DEF(width, &dummy[0]);
+    height = PL_DEF(height, &dummy[1]);
+
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    if (!impl->resize) {
+        *width = *height = 0;
+        return true;
+    }
+
+    return impl->resize(sw, width, height);
+}
+
+void pl_swapchain_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp)
+{
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    if (!impl->colorspace_hint)
+        return;
+
+    struct pl_color_space fix = {0};
+    if (csp) {
+        fix = *csp;
+        // Ensure we have valid values set for all the fields
+        pl_color_space_infer(&fix);
+    }
+
+    impl->colorspace_hint(sw, &fix);
+}
+
+bool pl_swapchain_start_frame(pl_swapchain sw,
+                              struct pl_swapchain_frame *out_frame)
+{
+    *out_frame = (struct pl_swapchain_frame) {0}; // sanity
+
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    return impl->start_frame(sw, out_frame);
+}
+
+bool pl_swapchain_submit_frame(pl_swapchain sw)
+{
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    return impl->submit_frame(sw);
+}
+
+void pl_swapchain_swap_buffers(pl_swapchain sw)
+{
+    const struct pl_sw_fns *impl = PL_PRIV(sw);
+    impl->swap_buffers(sw);
+}
diff --git a/src/swapchain.h b/src/swapchain.h
new file mode 100644
index 0000000..934a2b9
--- /dev/null
+++ b/src/swapchain.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+#include <libplacebo/swapchain.h>
+
+// This struct must be the first member of the swapchain's priv struct. The
+// `pl_swapchain` helpers will cast the priv struct to this struct!
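/*
 * Illustrative usage sketch (not part of this patch): how a frontend is
 * expected to drive the public wrappers defined above, once per output frame.
 * The render_to_fbo callback is a placeholder, and frame.fbo assumes the
 * public struct pl_swapchain_frame layout from <libplacebo/swapchain.h>.
 */
static inline void present_one_frame(pl_swapchain sw, int w, int h,
                                     void (*render_to_fbo)(pl_tex fbo))
{
    // Negotiate the surface size; a failure here typically means the
    // underlying surface is gone and the swapchain must be recreated
    if (!pl_swapchain_resize(sw, &w, &h))
        return;

    struct pl_swapchain_frame frame;
    if (!pl_swapchain_start_frame(sw, &frame))
        return; // e.g. window minimized; skip this frame

    render_to_fbo(frame.fbo);

    // Only swap if submission succeeded; swap_buffers may block for pacing
    if (pl_swapchain_submit_frame(sw))
        pl_swapchain_swap_buffers(sw);
}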
+ +#define SW_PFN(name) __typeof__(pl_swapchain_##name) *name +struct pl_sw_fns { + // This destructor follows the same rules as `pl_gpu_fns` + void (*destroy)(pl_swapchain sw); + + SW_PFN(latency); // optional + SW_PFN(resize); // optional + SW_PFN(colorspace_hint); // optional + SW_PFN(start_frame); + SW_PFN(submit_frame); + SW_PFN(swap_buffers); +}; +#undef SW_PFN diff --git a/src/tests/bench.c b/src/tests/bench.c new file mode 100644 index 0000000..22638d8 --- /dev/null +++ b/src/tests/bench.c @@ -0,0 +1,550 @@ +#include "tests.h" + +#include <libplacebo/dispatch.h> +#include <libplacebo/vulkan.h> +#include <libplacebo/shaders/colorspace.h> +#include <libplacebo/shaders/deinterlacing.h> +#include <libplacebo/shaders/sampling.h> + +enum { + // Image configuration + NUM_TEX = 16, + WIDTH = 2048, + HEIGHT = 2048, + DEPTH = 16, + COMPS = 4, + + // Queue configuration + NUM_QUEUES = NUM_TEX, + ASYNC_TX = 1, + ASYNC_COMP = 1, + + // Test configuration + TEST_MS = 1000, + WARMUP_MS = 500, +}; + +static pl_tex create_test_img(pl_gpu gpu) +{ + pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, COMPS, DEPTH, 32, PL_FMT_CAP_LINEAR); + REQUIRE(fmt); + + const float xc = (WIDTH - 1) / 2.0f; + const float yc = (HEIGHT - 1) / 2.0f; + const float kf = 0.5f / sqrtf(xc * xc + yc * yc); + const float invphi = 0.61803398874989; + const float freqR = kf * M_PI * 0.2f; + const float freqG = freqR * invphi; + const float freqB = freqG * invphi; + float *data = malloc(WIDTH * HEIGHT * COMPS * sizeof(float)); + for (int y = 0; y < HEIGHT; y++) { + for (int x = 0; x < WIDTH; x++) { + float *color = &data[(y * WIDTH + x) * COMPS]; + float xx = x - xc, yy = y - yc; + float r2 = xx * xx + yy * yy; + switch (COMPS) { + case 4: color[3] = 1.0; + case 3: color[2] = 0.5f * sinf(freqB * r2) + 0.5f;; + case 2: color[1] = 0.5f * sinf(freqG * r2) + 0.5f;; + case 1: color[0] = 0.5f * sinf(freqR * r2) + 0.5f;; + } + } + } + + pl_tex tex = pl_tex_create(gpu, pl_tex_params( + .format = fmt, + .w = WIDTH, + .h = HEIGHT, + .sampleable = true, + .initial_data = data, + )); + + free(data); + REQUIRE(tex); + return tex; +} + +struct bench { + void (*run_sh)(pl_shader sh, pl_shader_obj *state, + pl_tex src); + + void (*run_tex)(pl_gpu gpu, pl_tex tex); +}; + +static void run_bench(pl_gpu gpu, pl_dispatch dp, + pl_shader_obj *state, pl_tex src, + pl_tex fbo, pl_timer timer, + const struct bench *bench) +{ + REQUIRE(bench); + REQUIRE(bench->run_sh || bench->run_tex); + if (bench->run_sh) { + pl_shader sh = pl_dispatch_begin(dp); + bench->run_sh(sh, state, src); + + pl_dispatch_finish(dp, pl_dispatch_params( + .shader = &sh, + .target = fbo, + .timer = timer, + )); + } else { + bench->run_tex(gpu, fbo); + } +} + +static void benchmark(pl_gpu gpu, const char *name, + const struct bench *bench) +{ + pl_dispatch dp = pl_dispatch_create(gpu->log, gpu); + REQUIRE(dp); + pl_shader_obj state = NULL; + pl_tex src = create_test_img(gpu); + + // Create the FBOs + pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, COMPS, DEPTH, 32, + PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE); + REQUIRE(fmt); + + pl_tex fbos[NUM_TEX] = {0}; + for (int i = 0; i < NUM_TEX; i++) { + fbos[i] = pl_tex_create(gpu, pl_tex_params( + .format = fmt, + .w = WIDTH, + .h = HEIGHT, + .renderable = true, + .blit_dst = true, + .host_writable = true, + .host_readable = true, + .storable = !!(fmt->caps & PL_FMT_CAP_STORABLE), + )); + REQUIRE(fbos[i]); + + pl_tex_clear(gpu, fbos[i], (float[4]){ 0.0 }); + } + + // Run the benchmark and flush+block once to force shader compilation etc. 
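    // (Illustrative aside, not part of this patch: one pixel of the pattern
    // that create_test_img() above fills in. Each channel is a sine of the
    // squared distance from the image centre, with the frequency scaled by
    // 1/phi per channel, giving dense concentric rings that stress
    // interpolation. The example pixel coordinates are arbitrary.)
    {
        const float xc = (WIDTH - 1) / 2.0f, yc = (HEIGHT - 1) / 2.0f;
        const float kf = 0.5f / sqrtf(xc * xc + yc * yc);
        const float invphi = 0.61803398874989f;
        const float freq_r = kf * M_PI * 0.2f;
        const float xx = 100 - xc, yy = 100 - yc;   // example pixel (100, 100)
        const float r2 = xx * xx + yy * yy;
        const float red   = 0.5f * sinf(freq_r * r2) + 0.5f;
        const float green = 0.5f * sinf(freq_r * invphi * r2) + 0.5f;
        const float blue  = 0.5f * sinf(freq_r * invphi * invphi * r2) + 0.5f;
        (void) red; (void) green; (void) blue;
    }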
+ run_bench(gpu, dp, &state, src, fbos[0], NULL, bench); + pl_gpu_finish(gpu); + + // Perform the actual benchmark + pl_clock_t start_warmup = 0, start_test = 0; + unsigned long frames = 0, frames_warmup = 0; + + pl_timer timer = pl_timer_create(gpu); + uint64_t gputime_total = 0; + unsigned long gputime_count = 0; + uint64_t gputime; + + start_warmup = pl_clock_now(); + do { + const int idx = frames % NUM_TEX; + while (pl_tex_poll(gpu, fbos[idx], UINT64_MAX)) + ; // do nothing + run_bench(gpu, dp, &state, src, fbos[idx], start_test ? timer : NULL, bench); + pl_gpu_flush(gpu); + frames++; + + if (start_test) { + while ((gputime = pl_timer_query(gpu, timer))) { + gputime_total += gputime; + gputime_count++; + } + } + + pl_clock_t now = pl_clock_now(); + if (start_test) { + if (pl_clock_diff(now, start_test) > TEST_MS * 1e-3) + break; + } else if (pl_clock_diff(now, start_warmup) > WARMUP_MS * 1e-3) { + start_test = now; + frames_warmup = frames; + } + } while (true); + + // Force the GPU to finish execution and re-measure the final stop time + pl_gpu_finish(gpu); + + pl_clock_t stop = pl_clock_now(); + while ((gputime = pl_timer_query(gpu, timer))) { + gputime_total += gputime; + gputime_count++; + } + + frames -= frames_warmup; + double secs = pl_clock_diff(stop, start_test); + printf("'%s':\t%4lu frames in %1.6f seconds => %2.6f ms/frame (%5.2f FPS)", + name, frames, secs, 1000 * secs / frames, frames / secs); + if (gputime_count) + printf(", gpu time: %2.6f ms", 1e-6 * gputime_total / gputime_count); + printf("\n"); + + pl_timer_destroy(gpu, &timer); + pl_shader_obj_destroy(&state); + pl_dispatch_destroy(&dp); + pl_tex_destroy(gpu, &src); + for (int i = 0; i < NUM_TEX; i++) + pl_tex_destroy(gpu, &fbos[i]); +} + +// List of benchmarks +static void bench_deband(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + pl_shader_deband(sh, pl_sample_src( .tex = src ), NULL); +} + +static void bench_deband_heavy(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + pl_shader_deband(sh, pl_sample_src( .tex = src ), pl_deband_params( + .iterations = 4, + .threshold = 4.0, + .radius = 4.0, + .grain = 16.0, + )); +} + +static void bench_bilinear(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_bilinear(sh, pl_sample_src( .tex = src ))); +} + +static void bench_bicubic(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_bicubic(sh, pl_sample_src( .tex = src ))); +} + +static void bench_hermite(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_hermite(sh, pl_sample_src( .tex = src ))); +} + +static void bench_gaussian(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_gaussian(sh, pl_sample_src( .tex = src ))); +} + +static void bench_dither_blue(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + pl_shader_dither(sh, 8, state, pl_dither_params( + .method = PL_DITHER_BLUE_NOISE, + )); +} + +static void bench_dither_white(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + pl_shader_dither(sh, 8, state, pl_dither_params( + .method = PL_DITHER_WHITE_NOISE, + )); +} + +static void bench_dither_ordered_fix(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + pl_shader_dither(sh, 8, state, pl_dither_params( + .method = PL_DITHER_ORDERED_FIXED, + )); +} + +static void bench_polar(pl_shader sh, 
pl_shader_obj *state, pl_tex src) +{ + struct pl_sample_filter_params params = { + .filter = pl_filter_ewa_lanczos, + .lut = state, + }; + + REQUIRE(pl_shader_sample_polar(sh, pl_sample_src( .tex = src ), ¶ms)); +} + +static void bench_polar_nocompute(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + struct pl_sample_filter_params params = { + .filter = pl_filter_ewa_lanczos, + .no_compute = true, + .lut = state, + }; + + REQUIRE(pl_shader_sample_polar(sh, pl_sample_src( .tex = src ), ¶ms)); +} + +static void bench_hdr_peak(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + REQUIRE(pl_shader_detect_peak(sh, pl_color_space_hdr10, state, &pl_peak_detect_default_params)); +} + +static void bench_hdr_peak_hq(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + REQUIRE(pl_shader_detect_peak(sh, pl_color_space_hdr10, state, &pl_peak_detect_high_quality_params)); +} + +static void bench_hdr_lut(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + struct pl_color_map_params params = { + PL_COLOR_MAP_DEFAULTS + .tone_mapping_function = &pl_tone_map_bt2390, + .tone_mapping_mode = PL_TONE_MAP_RGB, + }; + + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + pl_shader_color_map_ex(sh, ¶ms, pl_color_map_args( + .src = pl_color_space_hdr10, + .dst = pl_color_space_monitor, + .state = state, + )); +} + +static void bench_hdr_clip(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + struct pl_color_map_params params = { + PL_COLOR_MAP_DEFAULTS + .tone_mapping_function = &pl_tone_map_clip, + .tone_mapping_mode = PL_TONE_MAP_RGB, + }; + + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + pl_shader_color_map_ex(sh, ¶ms, pl_color_map_args( + .src = pl_color_space_hdr10, + .dst = pl_color_space_monitor, + .state = state, + )); +} + +static void bench_weave(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + struct pl_deinterlace_source dsrc = { + .cur = pl_field_pair(src), + .field = PL_FIELD_TOP, + }; + + pl_shader_deinterlace(sh, &dsrc, pl_deinterlace_params( + .algo = PL_DEINTERLACE_WEAVE, + )); +} + +static void bench_bob(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + struct pl_deinterlace_source dsrc = { + .cur = pl_field_pair(src), + .field = PL_FIELD_TOP, + }; + + pl_shader_deinterlace(sh, &dsrc, pl_deinterlace_params( + .algo = PL_DEINTERLACE_BOB, + )); +} + +static void bench_yadif(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + struct pl_deinterlace_source dsrc = { + .prev = pl_field_pair(src), + .cur = pl_field_pair(src), + .next = pl_field_pair(src), + .field = PL_FIELD_TOP, + }; + + pl_shader_deinterlace(sh, &dsrc, pl_deinterlace_params( + .algo = PL_DEINTERLACE_YADIF, + )); +} + +static void bench_av1_grain(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + struct pl_film_grain_params params = { + .data = { + .type = PL_FILM_GRAIN_AV1, + .params.av1 = av1_grain_data, + .seed = rand(), + }, + .tex = src, + .components = 3, + .component_mapping = {0, 1, 2}, + .repr = &(struct pl_color_repr) {0}, + }; + + REQUIRE(pl_shader_film_grain(sh, state, ¶ms)); +} + +static void bench_av1_grain_lap(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + struct pl_film_grain_params params = { + .data = { + .type = PL_FILM_GRAIN_AV1, + .params.av1 = av1_grain_data, + .seed = rand(), + }, + .tex = src, + .components = 3, + .component_mapping = {0, 1, 2}, + .repr = &(struct pl_color_repr) {0}, + }; + + 
params.data.params.av1.overlap = true; + REQUIRE(pl_shader_film_grain(sh, state, ¶ms)); +} + +static void bench_h274_grain(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + struct pl_film_grain_params params = { + .data = { + .type = PL_FILM_GRAIN_H274, + .params.h274 = h274_grain_data, + .seed = rand(), + }, + .tex = src, + .components = 3, + .component_mapping = {0, 1, 2}, + .repr = &(struct pl_color_repr) {0}, + }; + + REQUIRE(pl_shader_film_grain(sh, state, ¶ms)); +} + +static void bench_reshape_poly(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + pl_shader_dovi_reshape(sh, &(struct pl_dovi_metadata) { .comp = { + { + .num_pivots = 8, + .pivots = {0.0, 0.00488758553, 0.0420332365, 0.177908108, + 0.428152502, 0.678396881, 0.92864126, 1.0}, + .method = {0, 0, 0, 0, 0, 0, 0}, + .poly_coeffs = { + {0.00290930271, 2.30019712, 50.1446037}, + {0.00725257397, 1.88119054, -4.49443769}, + {0.0150123835, 1.61106598, -1.64833081}, + {0.0498571396, 1.2059114, -0.430627108}, + {0.0878019333, 1.01845241, -0.19669354}, + {0.120447636, 0.920134187, -0.122338772}, + {2.12430835, -3.30913281, 2.10893941}, + }, + }, { + .num_pivots = 2, + .pivots = {0.0, 1.0}, + .method = {0}, + .poly_coeffs = {{-0.397901177, 1.85908031, 0}}, + }, { + .num_pivots = 2, + .pivots = {0.0, 1.0}, + .method = {0}, + .poly_coeffs = {{-0.399355531, 1.85591626, 0}}, + }, + }}); +} + +static void bench_reshape_mmr(pl_shader sh, pl_shader_obj *state, pl_tex src) +{ + REQUIRE(pl_shader_sample_direct(sh, pl_sample_src( .tex = src ))); + pl_shader_dovi_reshape(sh, &dovi_meta); // this includes MMR +} + +static float data[WIDTH * HEIGHT * COMPS + 8192]; + +static void bench_download(pl_gpu gpu, pl_tex tex) +{ + REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params( + .tex = tex, + .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096), + ))); +} + +static void bench_upload(pl_gpu gpu, pl_tex tex) +{ + REQUIRE(pl_tex_upload(gpu, pl_tex_transfer_params( + .tex = tex, + .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096), + ))); +} + +static void dummy_cb(void *arg) {} + +static void bench_download_async(pl_gpu gpu, pl_tex tex) +{ + REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params( + .tex = tex, + .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096), + .callback = dummy_cb, + ))); +} + +static void bench_upload_async(pl_gpu gpu, pl_tex tex) +{ + REQUIRE(pl_tex_upload(gpu, pl_tex_transfer_params( + .tex = tex, + .ptr = (uint8_t *) PL_ALIGN((uintptr_t) data, 4096), + .callback = dummy_cb, + ))); +} + +int main() +{ + setbuf(stdout, NULL); + setbuf(stderr, NULL); + + pl_log log = pl_log_create(PL_API_VER, pl_log_params( + .log_cb = isatty(fileno(stdout)) ? 
pl_log_color : pl_log_simple, + .log_level = PL_LOG_WARN, + )); + + pl_vulkan vk = pl_vulkan_create(log, pl_vulkan_params( + .allow_software = true, + .async_transfer = ASYNC_TX, + .async_compute = ASYNC_COMP, + .queue_count = NUM_QUEUES, + )); + + if (!vk) + return SKIP; + +#define BENCH_SH(fn) &(struct bench) { .run_sh = fn } +#define BENCH_TEX(fn) &(struct bench) { .run_tex = fn } + + printf("= Running benchmarks =\n"); + benchmark(vk->gpu, "tex_download ptr", BENCH_TEX(bench_download)); + benchmark(vk->gpu, "tex_download ptr async", BENCH_TEX(bench_download_async)); + benchmark(vk->gpu, "tex_upload ptr", BENCH_TEX(bench_upload)); + benchmark(vk->gpu, "tex_upload ptr async", BENCH_TEX(bench_upload_async)); + benchmark(vk->gpu, "bilinear", BENCH_SH(bench_bilinear)); + benchmark(vk->gpu, "bicubic", BENCH_SH(bench_bicubic)); + benchmark(vk->gpu, "hermite", BENCH_SH(bench_hermite)); + benchmark(vk->gpu, "gaussian", BENCH_SH(bench_gaussian)); + benchmark(vk->gpu, "deband", BENCH_SH(bench_deband)); + benchmark(vk->gpu, "deband_heavy", BENCH_SH(bench_deband_heavy)); + + // Deinterlacing + benchmark(vk->gpu, "weave", BENCH_SH(bench_weave)); + benchmark(vk->gpu, "bob", BENCH_SH(bench_bob)); + benchmark(vk->gpu, "yadif", BENCH_SH(bench_yadif)); + + // Polar sampling + benchmark(vk->gpu, "polar", BENCH_SH(bench_polar)); + if (vk->gpu->glsl.compute) + benchmark(vk->gpu, "polar_nocompute", BENCH_SH(bench_polar_nocompute)); + + // Dithering algorithms + benchmark(vk->gpu, "dither_blue", BENCH_SH(bench_dither_blue)); + benchmark(vk->gpu, "dither_white", BENCH_SH(bench_dither_white)); + benchmark(vk->gpu, "dither_ordered_fixed", BENCH_SH(bench_dither_ordered_fix)); + + // HDR peak detection + if (vk->gpu->glsl.compute) { + benchmark(vk->gpu, "hdr_peakdetect", BENCH_SH(bench_hdr_peak)); + benchmark(vk->gpu, "hdr_peakdetect_hq", BENCH_SH(bench_hdr_peak_hq)); + } + + // Tone mapping + benchmark(vk->gpu, "hdr_lut", BENCH_SH(bench_hdr_lut)); + benchmark(vk->gpu, "hdr_clip", BENCH_SH(bench_hdr_clip)); + + // Misc stuff + benchmark(vk->gpu, "av1_grain", BENCH_SH(bench_av1_grain)); + benchmark(vk->gpu, "av1_grain_lap", BENCH_SH(bench_av1_grain_lap)); + benchmark(vk->gpu, "h274_grain", BENCH_SH(bench_h274_grain)); + benchmark(vk->gpu, "reshape_poly", BENCH_SH(bench_reshape_poly)); + benchmark(vk->gpu, "reshape_mmr", BENCH_SH(bench_reshape_mmr)); + + pl_vulkan_destroy(&vk); + pl_log_destroy(&log); + return 0; +} diff --git a/src/tests/cache.c b/src/tests/cache.c new file mode 100644 index 0000000..667435d --- /dev/null +++ b/src/tests/cache.c @@ -0,0 +1,215 @@ +#include "tests.h" + +#include <libplacebo/cache.h> + +// Returns "foo" for even keys, "bar" for odd +static pl_cache_obj lookup_foobar(void *priv, uint64_t key) +{ + return (pl_cache_obj) { + .key = 0xFFFF, // test key sanity + .data = (key & 1) ? "bar" : "foo", + .size = 3, + }; +} + +static void update_count(void *priv, pl_cache_obj obj) +{ + int *count = priv; + *count += obj.size ? 
1 : -1; +} + +enum { + KEY1 = 0x9c65575f419288f5, + KEY2 = 0x92da969be9b88086, + KEY3 = 0x7fcb62540b00bc8b, + KEY4 = 0x46c60ec11af9dde3, + KEY5 = 0xcb6760b98ece2477, + KEY6 = 0xf37dc72b7f9e5c88, + KEY7 = 0x30c18c962d82e5f5, +}; + +int main() +{ + pl_log log = pl_test_logger(); + pl_cache test = pl_cache_create(pl_cache_params( + .log = log, + .max_object_size = 16, + .max_total_size = 32, + )); + + pl_cache_obj obj1 = { .key = KEY1, .data = "abc", .size = 3 }; + pl_cache_obj obj2 = { .key = KEY2, .data = "de", .size = 2 }; + pl_cache_obj obj3 = { .key = KEY3, .data = "xyzw", .size = 4 }; + + REQUIRE(pl_cache_try_set(test, &obj1)); + REQUIRE(pl_cache_try_set(test, &obj2)); + REQUIRE(pl_cache_try_set(test, &obj3)); + REQUIRE_CMP(pl_cache_size(test), ==, 9, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + REQUIRE(pl_cache_try_set(test, &obj2)); // delete KEY2 + REQUIRE_CMP(pl_cache_size(test), ==, 7, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 2, "d"); + + REQUIRE(pl_cache_get(test, &obj1)); + REQUIRE(!pl_cache_get(test, &obj2)); + REQUIRE(pl_cache_get(test, &obj3)); + REQUIRE_CMP(pl_cache_size(test), ==, 0, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 0, "d"); + REQUIRE_MEMEQ(obj1.data, "abc", 3); + REQUIRE_MEMEQ(obj3.data, "xyzw", 4); + + // Re-insert removed objects (in reversed order) + REQUIRE(pl_cache_try_set(test, &obj3)); + REQUIRE(pl_cache_try_set(test, &obj1)); + REQUIRE_CMP(pl_cache_size(test), ==, 7, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 2, "d"); + + uint8_t ref[72]; + memset(ref, 0xbe, sizeof(ref)); + uint8_t *refp = ref; + +#define PAD_ALIGN(x) PL_ALIGN2(x, sizeof(uint32_t)) +#define W(type, ...) \ + do { \ + size_t sz = sizeof((type){__VA_ARGS__}); \ + pl_assert(ref + sizeof(ref) - refp >= sz); \ + memcpy(refp, &(type){__VA_ARGS__}, sz); \ + refp += sz; \ + size_t pad_sz = PAD_ALIGN(sz) - sz; \ + pl_assert(ref + sizeof(ref) - refp >= pad_sz); \ + memcpy(refp, &(char[PAD_ALIGN(1)]){0}, pad_sz); \ + refp += pad_sz; \ + } while (0) + + W(char[], 'p', 'l', '_', 'c', 'a', 'c', 'h', 'e'); // cache magic + W(uint32_t, 1); // cache version + W(uint32_t, 2); // number of objects + + // object 3 + W(uint64_t, KEY3); // key + W(uint64_t, 4); // size +#ifdef PL_HAVE_XXHASH + W(uint64_t, 0xd43612ef3fbee8be); // hash +#else + W(uint64_t, 0xec18884e5e471117); // hash +#endif + W(char[], 'x', 'y', 'z', 'w'); // data + + // object 1 + W(uint64_t, KEY1); // key + W(uint64_t, 3); // size +#ifdef PL_HAVE_XXHASH + W(uint64_t, 0x78af5f94892f3950); // hash +#else + W(uint64_t, 0x3a204d408a2e2d77); // hash +#endif + W(char[], 'a', 'b', 'c'); // data + +#undef W +#undef PAD_ALIGN + + uint8_t data[100]; + pl_static_assert(sizeof(data) >= sizeof(ref)); + REQUIRE_CMP(pl_cache_save(test, data, sizeof(data)), ==, sizeof(ref), "zu"); + REQUIRE_MEMEQ(data, ref, sizeof(ref)); + + pl_cache test2 = pl_cache_create(pl_cache_params( .log = log )); + REQUIRE_CMP(pl_cache_load(test2, data, sizeof(data)), ==, 2, "d"); + REQUIRE_CMP(pl_cache_size(test2), ==, 7, "zu"); + REQUIRE_CMP(pl_cache_save(test2, NULL, 0), ==, sizeof(ref), "zu"); + REQUIRE_CMP(pl_cache_save(test2, data, sizeof(data)), ==, sizeof(ref), "zu"); + REQUIRE_MEMEQ(data, ref, sizeof(ref)); + + // Test loading invalid data + REQUIRE_CMP(pl_cache_load(test2, ref, 0), <, 0, "d"); // empty file + REQUIRE_CMP(pl_cache_load(test2, ref, 5), <, 0, "d"); // truncated header + REQUIRE_CMP(pl_cache_load(test2, ref, 64), ==, 1, "d"); // truncated object data + data[sizeof(ref) - 2] = 'X'; // corrupt data + 
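    // (Illustrative aside, not part of this patch: why sizeof(ref) is exactly
    // 72 bytes. The W()/PAD_ALIGN() construction above lays out an 8-byte
    // magic, a 4-byte version, a 4-byte object count, then per object an
    // 8-byte key, 8-byte size, 8-byte hash and the payload padded to 4 bytes.)
    {
        const size_t header  = 8 + 4 + 4;      // magic + version + count
        const size_t obj3_sz = 8 + 8 + 8 + 4;  // "xyzw", already aligned
        const size_t obj1_sz = 8 + 8 + 8 + 4;  // "abc" padded to 4 bytes
        pl_assert(header + obj3_sz + obj1_sz == sizeof(ref));
    }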
REQUIRE_CMP(pl_cache_load(test2, data, sizeof(ref)), ==, 1, "d"); // bad checksum + pl_cache_destroy(&test2); + + // Inserting too large object should fail + uint8_t zero[32] = {0}; + pl_cache_obj obj4 = { .key = KEY4, .data = zero, .size = 32 }; + REQUIRE(!pl_cache_try_set(test, &obj4)); + REQUIRE(!pl_cache_get(test, &obj4)); + REQUIRE_CMP(pl_cache_size(test), ==, 7, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 2, "d"); + + // Inserting 16-byte object should succeed, and not purge old entries + obj4 = (pl_cache_obj) { .key = KEY4, .data = zero, .size = 16 }; + REQUIRE(pl_cache_try_set(test, &obj4)); + REQUIRE_CMP(pl_cache_size(test), ==, 23, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + REQUIRE(pl_cache_get(test, &obj1)); + REQUIRE(pl_cache_get(test, &obj3)); + REQUIRE(pl_cache_get(test, &obj4)); + pl_cache_set(test, &obj1); + pl_cache_set(test, &obj3); + pl_cache_set(test, &obj4); + REQUIRE_CMP(pl_cache_size(test), ==, 23, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + + // Inserting another 10-byte object should purge entry KEY1 + pl_cache_obj obj5 = { .key = KEY5, .data = zero, .size = 10 }; + REQUIRE(pl_cache_try_set(test, &obj5)); + REQUIRE_CMP(pl_cache_size(test), ==, 30, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + REQUIRE(!pl_cache_get(test, &obj1)); + REQUIRE(pl_cache_get(test, &obj3)); + REQUIRE(pl_cache_get(test, &obj4)); + REQUIRE(pl_cache_get(test, &obj5)); + pl_cache_set(test, &obj3); + pl_cache_set(test, &obj4); + pl_cache_set(test, &obj5); + REQUIRE_CMP(pl_cache_size(test), ==, 30, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + + // Inserting final 6-byte object should purge entry KEY3 + pl_cache_obj obj6 = { .key = KEY6, .data = zero, .size = 6 }; + REQUIRE(pl_cache_try_set(test, &obj6)); + REQUIRE_CMP(pl_cache_size(test), ==, 32, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 3, "d"); + REQUIRE(!pl_cache_get(test, &obj3)); + REQUIRE(pl_cache_get(test, &obj4)); + REQUIRE(pl_cache_get(test, &obj5)); + REQUIRE(pl_cache_get(test, &obj6)); + REQUIRE_CMP(pl_cache_size(test), ==, 0, "zu"); + REQUIRE_CMP(pl_cache_objects(test), ==, 0, "d"); + pl_cache_obj_free(&obj4); + pl_cache_obj_free(&obj5); + pl_cache_obj_free(&obj6); + + // Test callback API + int num_objects = 0; + test2 = pl_cache_create(pl_cache_params( + .get = lookup_foobar, + .set = update_count, + .priv = &num_objects, + )); + + REQUIRE(pl_cache_get(test2, &obj1)); + REQUIRE_CMP(obj1.key, ==, KEY1, PRIu64); + REQUIRE_CMP(obj1.size, ==, 3, "zu"); + REQUIRE_MEMEQ(obj1.data, "bar", 3); + REQUIRE(pl_cache_get(test2, &obj2)); + REQUIRE_CMP(obj2.key, ==, KEY2, PRIu64); + REQUIRE_CMP(obj2.size, ==, 3, "zu"); + REQUIRE_MEMEQ(obj2.data, "foo", 3); + REQUIRE_CMP(pl_cache_objects(test2), ==, 0, "d"); + REQUIRE_CMP(num_objects, ==, 0, "d"); + REQUIRE(pl_cache_try_set(test2, &obj1)); + REQUIRE(pl_cache_try_set(test2, &obj2)); + REQUIRE(pl_cache_try_set(test2, &(pl_cache_obj) { .key = KEY7, .data = "abcde", .size = 5 })); + REQUIRE_CMP(pl_cache_objects(test2), ==, 3, "d"); + REQUIRE_CMP(num_objects, ==, 3, "d"); + REQUIRE(pl_cache_try_set(test2, &obj1)); + REQUIRE(pl_cache_try_set(test2, &obj2)); + REQUIRE_CMP(pl_cache_objects(test2), ==, 1, "d"); + REQUIRE_CMP(num_objects, ==, 1, "d"); + pl_cache_destroy(&test2); + + pl_cache_destroy(&test); + pl_log_destroy(&log); + return 0; +} diff --git a/src/tests/colorspace.c b/src/tests/colorspace.c new file mode 100644 index 0000000..4b0662b --- /dev/null +++ b/src/tests/colorspace.c @@ -0,0 +1,488 @@ +#include "tests.h" + +int 
main() +{ + for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) { + bool ycbcr = sys >= PL_COLOR_SYSTEM_BT_601 && sys <= PL_COLOR_SYSTEM_YCGCO; + REQUIRE_CMP(ycbcr, ==, pl_color_system_is_ycbcr_like(sys), "d"); + } + + for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) { + bool hdr = trc >= PL_COLOR_TRC_PQ && trc <= PL_COLOR_TRC_S_LOG2; + REQUIRE_CMP(hdr, ==, pl_color_transfer_is_hdr(trc), "d"); + REQUIRE_CMP(pl_color_transfer_nominal_peak(trc), >=, 1.0, "f"); + } + + float pq_peak = pl_color_transfer_nominal_peak(PL_COLOR_TRC_PQ); + REQUIRE_FEQ(PL_COLOR_SDR_WHITE * pq_peak, 10000, 1e-7); + + struct pl_color_repr tv_repr = { + .sys = PL_COLOR_SYSTEM_BT_709, + .levels = PL_COLOR_LEVELS_LIMITED, + }; + + struct pl_color_repr pc_repr = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, + }; + + // Ensure this is a no-op for bits == bits + for (int bits = 1; bits <= 16; bits++) { + tv_repr.bits.color_depth = tv_repr.bits.sample_depth = bits; + pc_repr.bits.color_depth = pc_repr.bits.sample_depth = bits; + REQUIRE_FEQ(pl_color_repr_normalize(&tv_repr), 1.0, 1e-7); + REQUIRE_FEQ(pl_color_repr_normalize(&pc_repr), 1.0, 1e-7); + } + + tv_repr.bits.color_depth = 8; + tv_repr.bits.sample_depth = 10; + float tv8to10 = pl_color_repr_normalize(&tv_repr); + + tv_repr.bits.color_depth = 8; + tv_repr.bits.sample_depth = 12; + float tv8to12 = pl_color_repr_normalize(&tv_repr); + + // Simulate the effect of GPU texture sampling on UNORM texture + REQUIRE_FEQ(tv8to10 * 16 /1023., 64/1023., 1e-7); // black + REQUIRE_FEQ(tv8to10 * 235/1023., 940/1023., 1e-7); // nominal white + REQUIRE_FEQ(tv8to10 * 128/1023., 512/1023., 1e-7); // achromatic + REQUIRE_FEQ(tv8to10 * 240/1023., 960/1023., 1e-7); // nominal chroma peak + + REQUIRE_FEQ(tv8to12 * 16 /4095., 256 /4095., 1e-7); // black + REQUIRE_FEQ(tv8to12 * 235/4095., 3760/4095., 1e-7); // nominal white + REQUIRE_FEQ(tv8to12 * 128/4095., 2048/4095., 1e-7); // achromatic + REQUIRE_FEQ(tv8to12 * 240/4095., 3840/4095., 1e-7); // nominal chroma peak + + // Ensure lavc's xyz12 is handled correctly + struct pl_color_repr xyz12 = { + .sys = PL_COLOR_SYSTEM_XYZ, + .levels = PL_COLOR_LEVELS_UNKNOWN, + .bits = { + .sample_depth = 16, + .color_depth = 12, + .bit_shift = 4, + }, + }; + + float xyz = pl_color_repr_normalize(&xyz12); + REQUIRE_FEQ(xyz * (4095 << 4), 65535, 1e-7); + + // Assume we uploaded a 10-bit source directly (unshifted) as a 16-bit + // texture. This texture multiplication factor should make it behave as if + // it was uploaded as a 10-bit texture instead. 
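    // (Illustrative aside, not part of this patch: in the full-range case a
    // 10-bit code word stored unshifted in a 16-bit texel samples as
    // value/65535, so the expected scale factor is 65535/1023, which maps
    // e.g. code 1000 back to 1000/1023. This is the value checked against
    // pl_color_repr_normalize() just below.)
    {
        const double expected_scale = 65535.0 / 1023.0;
        REQUIRE_FEQ(expected_scale * 1000/65535., 1000/1023., 1e-7);
    }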
+ pc_repr.bits.color_depth = 10; + pc_repr.bits.sample_depth = 16; + float pc10to16 = pl_color_repr_normalize(&pc_repr); + REQUIRE_FEQ(pc10to16 * 1000/65535., 1000/1023., 1e-7); + + const struct pl_raw_primaries *bt709, *bt2020, *dcip3; + bt709 = pl_raw_primaries_get(PL_COLOR_PRIM_BT_709); + bt2020 = pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020); + dcip3 = pl_raw_primaries_get(PL_COLOR_PRIM_DCI_P3); + REQUIRE(pl_primaries_superset(bt2020, bt709)); + REQUIRE(!pl_primaries_superset(bt2020, dcip3)); // small region doesn't overlap + REQUIRE(pl_primaries_superset(dcip3, bt709)); + REQUIRE(!pl_primaries_superset(bt709, bt2020)); + REQUIRE(pl_primaries_compatible(bt2020, bt2020)); + REQUIRE(pl_primaries_compatible(bt2020, bt709)); + REQUIRE(pl_primaries_compatible(bt709, bt2020)); + REQUIRE(pl_primaries_compatible(bt2020, dcip3)); + REQUIRE(pl_primaries_compatible(bt709, dcip3)); + + struct pl_raw_primaries bt709_2020 = pl_primaries_clip(bt709, bt2020); + struct pl_raw_primaries bt2020_709 = pl_primaries_clip(bt2020, bt709); + REQUIRE(pl_raw_primaries_similar(&bt709_2020, bt709)); + REQUIRE(pl_raw_primaries_similar(&bt2020_709, bt709)); + + struct pl_raw_primaries dcip3_bt2020 = pl_primaries_clip(dcip3, bt2020); + struct pl_raw_primaries dcip3_bt709 = pl_primaries_clip(dcip3, bt709); + REQUIRE(pl_primaries_superset(dcip3, &dcip3_bt2020)); + REQUIRE(pl_primaries_superset(dcip3, &dcip3_bt709)); + REQUIRE(pl_primaries_superset(bt2020, &dcip3_bt2020)); + REQUIRE(pl_primaries_superset(bt709, &dcip3_bt709)); + + pl_matrix3x3 rgb2xyz, rgb2xyz_; + rgb2xyz = rgb2xyz_ = pl_get_rgb2xyz_matrix(bt709); + pl_matrix3x3_invert(&rgb2xyz_); + pl_matrix3x3_invert(&rgb2xyz_); + + // Make sure the double-inversion round trips + for (int y = 0; y < 3; y++) { + for (int x = 0; x < 3; x++) + REQUIRE_FEQ(rgb2xyz.m[y][x], rgb2xyz_.m[y][x], 1e-6); + } + + // Make sure mapping the spectral RGB colors (i.e. 
the matrix rows) matches + // our original primaries + float Y = rgb2xyz.m[1][0]; + REQUIRE_FEQ(rgb2xyz.m[0][0], pl_cie_X(bt709->red) * Y, 1e-7); + REQUIRE_FEQ(rgb2xyz.m[2][0], pl_cie_Z(bt709->red) * Y, 1e-7); + Y = rgb2xyz.m[1][1]; + REQUIRE_FEQ(rgb2xyz.m[0][1], pl_cie_X(bt709->green) * Y, 1e-7); + REQUIRE_FEQ(rgb2xyz.m[2][1], pl_cie_Z(bt709->green) * Y, 1e-7); + Y = rgb2xyz.m[1][2]; + REQUIRE_FEQ(rgb2xyz.m[0][2], pl_cie_X(bt709->blue) * Y, 1e-7); + REQUIRE_FEQ(rgb2xyz.m[2][2], pl_cie_Z(bt709->blue) * Y, 1e-7); + + // Make sure the gamut mapping round-trips + pl_matrix3x3 bt709_bt2020, bt2020_bt709; + bt709_bt2020 = pl_get_color_mapping_matrix(bt709, bt2020, PL_INTENT_RELATIVE_COLORIMETRIC); + bt2020_bt709 = pl_get_color_mapping_matrix(bt2020, bt709, PL_INTENT_RELATIVE_COLORIMETRIC); + for (int n = 0; n < 10; n++) { + float vec[3] = { RANDOM, RANDOM, RANDOM }; + float dst[3] = { vec[0], vec[1], vec[2] }; + pl_matrix3x3_apply(&bt709_bt2020, dst); + pl_matrix3x3_apply(&bt2020_bt709, dst); + for (int i = 0; i < 3; i++) + REQUIRE_FEQ(dst[i], vec[i], 1e-6); + } + + // Ensure the decoding matrix round-trips to white/black + for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) { + if (!pl_color_system_is_linear(sys)) + continue; + + printf("testing color system %u\n", (unsigned) sys); + struct pl_color_repr repr = { + .levels = PL_COLOR_LEVELS_LIMITED, + .sys = sys, + .bits = { + // synthetic test + .color_depth = 8, + .sample_depth = 10, + }, + }; + + float scale = pl_color_repr_normalize(&repr); + pl_transform3x3 yuv2rgb = pl_color_repr_decode(&repr, NULL); + pl_matrix3x3_scale(&yuv2rgb.mat, scale); + + static const float white_ycbcr[3] = { 235/1023., 128/1023., 128/1023. }; + static const float black_ycbcr[3] = { 16/1023., 128/1023., 128/1023. }; + static const float white_other[3] = { 235/1023., 235/1023., 235/1023. }; + static const float black_other[3] = { 16/1023., 16/1023., 16/1023. }; + + float white[3], black[3]; + for (int i = 0; i < 3; i++) { + if (pl_color_system_is_ycbcr_like(sys)) { + white[i] = white_ycbcr[i]; + black[i] = black_ycbcr[i]; + } else { + white[i] = white_other[i]; + black[i] = black_other[i]; + } + } + + pl_transform3x3_apply(&yuv2rgb, white); + REQUIRE_FEQ(white[0], 1.0, 1e-6); + REQUIRE_FEQ(white[1], 1.0, 1e-6); + REQUIRE_FEQ(white[2], 1.0, 1e-6); + + pl_transform3x3_apply(&yuv2rgb, black); + REQUIRE_FEQ(black[0], 0.0, 1e-6); + REQUIRE_FEQ(black[1], 0.0, 1e-6); + REQUIRE_FEQ(black[2], 0.0, 1e-6); + } + + // Make sure chromatic adaptation works + struct pl_raw_primaries bt709_d50; + bt709_d50 = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709); + bt709_d50.white = (struct pl_cie_xy) { 0.34567, 0.35850 }; + + pl_matrix3x3 d50_d65; + d50_d65 = pl_get_color_mapping_matrix(&bt709_d50, bt709, PL_INTENT_RELATIVE_COLORIMETRIC); + + float white[3] = { 1.0, 1.0, 1.0 }; + pl_matrix3x3_apply(&d50_d65, white); + REQUIRE_FEQ(white[0], 1.0, 1e-6); + REQUIRE_FEQ(white[1], 1.0, 1e-6); + REQUIRE_FEQ(white[2], 1.0, 1e-6); + + // Simulate a typical 10-bit YCbCr -> 16 bit texture conversion + tv_repr.bits.color_depth = 10; + tv_repr.bits.sample_depth = 16; + pl_transform3x3 yuv2rgb; + yuv2rgb = pl_color_repr_decode(&tv_repr, NULL); + float test[3] = { 575/65535., 336/65535., 640/65535. 
}; + pl_transform3x3_apply(&yuv2rgb, test); + REQUIRE_FEQ(test[0], 0.808305, 1e-6); + REQUIRE_FEQ(test[1], 0.553254, 1e-6); + REQUIRE_FEQ(test[2], 0.218841, 1e-6); + + // DVD + REQUIRE_CMP(pl_color_system_guess_ycbcr(720, 480), ==, PL_COLOR_SYSTEM_BT_601, "u"); + REQUIRE_CMP(pl_color_system_guess_ycbcr(720, 576), ==, PL_COLOR_SYSTEM_BT_601, "u"); + REQUIRE_CMP(pl_color_primaries_guess(720, 576), ==, PL_COLOR_PRIM_BT_601_625, "u"); + REQUIRE_CMP(pl_color_primaries_guess(720, 480), ==, PL_COLOR_PRIM_BT_601_525, "u"); + // PAL 16:9 + REQUIRE_CMP(pl_color_system_guess_ycbcr(1024, 576), ==, PL_COLOR_SYSTEM_BT_601, "u"); + REQUIRE_CMP(pl_color_primaries_guess(1024, 576), ==, PL_COLOR_PRIM_BT_601_625, "u"); + // HD + REQUIRE_CMP(pl_color_system_guess_ycbcr(1280, 720), ==, PL_COLOR_SYSTEM_BT_709, "u"); + REQUIRE_CMP(pl_color_system_guess_ycbcr(1920, 1080), ==, PL_COLOR_SYSTEM_BT_709, "u"); + REQUIRE_CMP(pl_color_primaries_guess(1280, 720), ==, PL_COLOR_PRIM_BT_709, "u"); + REQUIRE_CMP(pl_color_primaries_guess(1920, 1080), ==, PL_COLOR_PRIM_BT_709, "u"); + + // Odd/weird videos + REQUIRE_CMP(pl_color_primaries_guess(2000, 576), ==, PL_COLOR_PRIM_BT_709, "u"); + REQUIRE_CMP(pl_color_primaries_guess(200, 200), ==, PL_COLOR_PRIM_BT_709, "u"); + + REQUIRE(pl_color_repr_equal(&pl_color_repr_sdtv, &pl_color_repr_sdtv)); + REQUIRE(!pl_color_repr_equal(&pl_color_repr_sdtv, &pl_color_repr_hdtv)); + + struct pl_color_repr repr = pl_color_repr_unknown; + pl_color_repr_merge(&repr, &pl_color_repr_uhdtv); + REQUIRE(pl_color_repr_equal(&repr, &pl_color_repr_uhdtv)); + + REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_UNKNOWN)); + REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_601_525)); + REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_601_625)); + REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_709)); + REQUIRE(!pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_470M)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_BT_2020)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_APPLE)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_ADOBE)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_PRO_PHOTO)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_CIE_1931)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_DCI_P3)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_DISPLAY_P3)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_V_GAMUT)); + REQUIRE(pl_color_primaries_is_wide_gamut(PL_COLOR_PRIM_S_GAMUT)); + + struct pl_color_space space = pl_color_space_unknown; + pl_color_space_merge(&space, &pl_color_space_bt709); + REQUIRE(pl_color_space_equal(&space, &pl_color_space_bt709)); + + // Infer some color spaces + struct pl_color_space hlg = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, + }; + + pl_color_space_infer(&hlg); + REQUIRE_CMP(hlg.hdr.max_luma, ==, PL_COLOR_HLG_PEAK, "f"); + + struct pl_color_space unknown = {0}; + struct pl_color_space display = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_BT_1886, + }; + + pl_color_space_infer(&unknown); + pl_color_space_infer(&display); + REQUIRE(pl_color_space_equal(&unknown, &display)); + + float x, y; + pl_chroma_location_offset(PL_CHROMA_LEFT, &x, &y); + REQUIRE_CMP(x, ==, -0.5f, "f"); + REQUIRE_CMP(y, ==, 0.0f, "f"); + pl_chroma_location_offset(PL_CHROMA_TOP_LEFT, &x, &y); + REQUIRE_CMP(x, ==, -0.5f, "f"); + REQUIRE_CMP(y, ==, -0.5f, "f"); + pl_chroma_location_offset(PL_CHROMA_CENTER, &x, &y); + REQUIRE_CMP(x, 
==, 0.0f, "f"); + REQUIRE_CMP(y, ==, 0.0f, "f"); + pl_chroma_location_offset(PL_CHROMA_BOTTOM_CENTER, &x, &y); + REQUIRE_CMP(x, ==, 0.0f, "f"); + REQUIRE_CMP(y, ==, 0.5f, "f"); + + REQUIRE_CMP(pl_raw_primaries_get(PL_COLOR_PRIM_UNKNOWN), ==, + pl_raw_primaries_get(PL_COLOR_PRIM_BT_709), "p"); + + // Color blindness tests + float red[3] = { 1.0, 0.0, 0.0 }; + float green[3] = { 0.0, 1.0, 0.0 }; + float blue[3] = { 0.0, 0.0, 1.0 }; + +#define TEST_CONE(model, color) \ + do { \ + float tmp[3] = { (color)[0], (color)[1], (color)[2] }; \ + pl_matrix3x3 mat = pl_get_cone_matrix(&(model), bt709); \ + pl_matrix3x3_apply(&mat, tmp); \ + printf("%s + %s = %f %f %f\n", #model, #color, tmp[0], tmp[1], tmp[2]); \ + for (int i = 0; i < 3; i++) \ + REQUIRE_FEQ((color)[i], tmp[i], 1e-5f); \ + } while(0) + + struct pl_cone_params red_only = { .cones = PL_CONE_MS }; + struct pl_cone_params green_only = { .cones = PL_CONE_LS }; + struct pl_cone_params blue_only = pl_vision_monochromacy; + + // These models should all round-trip white + TEST_CONE(pl_vision_normal, white); + TEST_CONE(pl_vision_protanopia, white); + TEST_CONE(pl_vision_protanomaly, white); + TEST_CONE(pl_vision_deuteranomaly, white); + TEST_CONE(pl_vision_tritanomaly, white); + TEST_CONE(pl_vision_achromatopsia, white); + TEST_CONE(red_only, white); + TEST_CONE(green_only, white); + TEST_CONE(blue_only, white); + + // These models should round-trip blue + TEST_CONE(pl_vision_normal, blue); + TEST_CONE(pl_vision_protanomaly, blue); + TEST_CONE(pl_vision_deuteranomaly, blue); + + // These models should round-trip red + TEST_CONE(pl_vision_normal, red); + TEST_CONE(pl_vision_tritanomaly, red); + TEST_CONE(pl_vision_tritanopia, red); + + // These models should round-trip green + TEST_CONE(pl_vision_normal, green); + + // Color adaptation tests + struct pl_cie_xy d65 = pl_white_from_temp(6504); + REQUIRE_FEQ(d65.x, 0.31271, 1e-3); + REQUIRE_FEQ(d65.y, 0.32902, 1e-3); + struct pl_cie_xy d55 = pl_white_from_temp(5503); + REQUIRE_FEQ(d55.x, 0.33242, 1e-3); + REQUIRE_FEQ(d55.y, 0.34743, 1e-3); + + // Make sure we infer the correct set of metadata parameters +#define TEST_METADATA(CSP, TYPE, MIN, MAX, AVG) \ + do { \ + float _min, _max, _avg; \ + pl_color_space_nominal_luma_ex(pl_nominal_luma_params( \ + .color = &(CSP), \ + .metadata = TYPE, \ + .scaling = PL_HDR_PQ, \ + .out_min = &_min, \ + .out_max = &_max, \ + .out_avg = &_avg, \ + )); \ + const float _min_ref = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, MIN); \ + const float _max_ref = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, MAX); \ + const float _avg_ref = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, AVG); \ + REQUIRE_FEQ(_min, _min_ref, 1e-5); \ + REQUIRE_FEQ(_max, _max_ref, 1e-5); \ + REQUIRE_FEQ(_avg, _avg_ref, 1e-5); \ + } while (0) + + const struct pl_color_space hdr10plus = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_PQ, + .hdr = { + .min_luma = 0.005, + .max_luma = 4000, + .scene_max = {596.69, 1200, 500}, + .scene_avg = 300, + }, + }; + + REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_ANY)); + REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_NONE)); + REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_HDR10)); + REQUIRE(pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_HDR10PLUS)); + REQUIRE(!pl_hdr_metadata_contains(&hdr10plus.hdr, PL_HDR_METADATA_CIE_Y)); + + TEST_METADATA(hdr10plus, PL_HDR_METADATA_NONE, PL_COLOR_HDR_BLACK, 10000, 0); + TEST_METADATA(hdr10plus, PL_HDR_METADATA_CIE_Y, PL_COLOR_HDR_BLACK, 4000, 0); + 
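    // (Illustrative aside, not part of this patch: the 1000 nit peak expected
    // from the HDR10+ metadata below appears to be the BT.2020 luminance of
    // the scene_max values above: 0.2627*596.69 + 0.6780*1200 + 0.0593*500.)
    {
        const float scene_peak = 0.2627f * hdr10plus.hdr.scene_max[0]
                               + 0.6780f * hdr10plus.hdr.scene_max[1]
                               + 0.0593f * hdr10plus.hdr.scene_max[2];
        REQUIRE_FEQ(scene_peak, 1000.0f, 1e-3);
    }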
TEST_METADATA(hdr10plus, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 4000, 0); + TEST_METADATA(hdr10plus, PL_HDR_METADATA_HDR10PLUS, PL_COLOR_HDR_BLACK, 1000, 250); + TEST_METADATA(hdr10plus, PL_HDR_METADATA_ANY, PL_COLOR_HDR_BLACK, 1000, 250); + + const struct pl_color_space dovi = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_PQ, + .hdr = { + .min_luma = 0.005, + .max_luma = 4000, + .max_pq_y = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, 1000), + .avg_pq_y = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, 250), + }, + }; + + REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_ANY)); + REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_NONE)); + REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_HDR10)); + REQUIRE(pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_CIE_Y)); + REQUIRE(!pl_hdr_metadata_contains(&dovi.hdr, PL_HDR_METADATA_HDR10PLUS)); + + TEST_METADATA(dovi, PL_HDR_METADATA_NONE, PL_COLOR_HDR_BLACK, 10000, 0); + TEST_METADATA(dovi, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 4000, 0); + TEST_METADATA(dovi, PL_HDR_METADATA_HDR10PLUS, PL_COLOR_HDR_BLACK, 4000, 0); + TEST_METADATA(dovi, PL_HDR_METADATA_CIE_Y, PL_COLOR_HDR_BLACK, 1000, 250); + TEST_METADATA(dovi, PL_HDR_METADATA_ANY, PL_COLOR_HDR_BLACK, 1000, 250); + + const struct pl_color_space hlg4000 = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, + .hdr.max_luma = 4000, + .hdr.min_luma = 0.005, + }; + + TEST_METADATA(hlg4000, PL_HDR_METADATA_NONE, PL_COLOR_HDR_BLACK, PL_COLOR_HLG_PEAK, 0); + TEST_METADATA(hlg4000, PL_HDR_METADATA_HDR10, 0.005, 4000, 0); + TEST_METADATA(hlg4000, PL_HDR_METADATA_ANY, 0.005, 4000, 0); + + const struct pl_color_space untagged = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_BT_1886, + }; + + REQUIRE(pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_NONE)); + REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_ANY)); + REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_HDR10)); + REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_CIE_Y)); + REQUIRE(!pl_hdr_metadata_contains(&untagged.hdr, PL_HDR_METADATA_HDR10PLUS)); + + const float sdr_black = PL_COLOR_SDR_WHITE / PL_COLOR_SDR_CONTRAST; + TEST_METADATA(untagged, PL_HDR_METADATA_NONE, sdr_black, PL_COLOR_SDR_WHITE, 0); + TEST_METADATA(untagged, PL_HDR_METADATA_ANY, sdr_black, PL_COLOR_SDR_WHITE, 0); + + const struct pl_color_space sdr50 = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_BT_1886, + .hdr.max_luma = 50, + }; + + REQUIRE(pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_NONE)); + REQUIRE(pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_ANY)); + REQUIRE(pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_HDR10)); + REQUIRE(!pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_CIE_Y)); + REQUIRE(!pl_hdr_metadata_contains(&sdr50.hdr, PL_HDR_METADATA_HDR10PLUS)); + + TEST_METADATA(sdr50, PL_HDR_METADATA_NONE, sdr_black, PL_COLOR_SDR_WHITE, 0); + TEST_METADATA(sdr50, PL_HDR_METADATA_HDR10, 50 / PL_COLOR_SDR_CONTRAST, 50, 0); + TEST_METADATA(sdr50, PL_HDR_METADATA_ANY, 50 / PL_COLOR_SDR_CONTRAST, 50, 0); + + const struct pl_color_space sdr10k = { + .primaries = PL_COLOR_PRIM_BT_709, + .transfer = PL_COLOR_TRC_BT_1886, + .hdr.min_luma = PL_COLOR_SDR_WHITE / 10000, + }; + + REQUIRE(pl_hdr_metadata_contains(&sdr10k.hdr, PL_HDR_METADATA_NONE)); + REQUIRE(!pl_hdr_metadata_contains(&sdr10k.hdr, PL_HDR_METADATA_ANY)); + REQUIRE(!pl_hdr_metadata_contains(&sdr10k.hdr, PL_HDR_METADATA_HDR10)); + 
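    // (Illustrative aside, not part of this patch: the nits -> PQ mapping
    // that pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, ...) performs inside the
    // TEST_METADATA macro, written out with the SMPTE ST 2084 constants; the
    // 10000 nit reference level maps to a PQ value of exactly 1.0.)
    {
        const double m1 = 2610.0 / 16384, m2 = 2523.0 / 4096 * 128;
        const double c1 = 3424.0 / 4096;
        const double c2 = 2413.0 / 4096 * 32, c3 = 2392.0 / 4096 * 32;
        const double y  = pow(10000 / 10000.0, m1); // 10000 nits, normalized
        const double pq = pow((c1 + c2 * y) / (1 + c3 * y), m2);
        REQUIRE_FEQ(pq, 1.0, 1e-6);
    }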
TEST_METADATA(sdr10k, PL_HDR_METADATA_NONE, sdr_black, PL_COLOR_SDR_WHITE, 0); + TEST_METADATA(sdr10k, PL_HDR_METADATA_HDR10, PL_COLOR_SDR_WHITE / 10000, PL_COLOR_SDR_WHITE, 0); + TEST_METADATA(sdr10k, PL_HDR_METADATA_ANY, PL_COLOR_SDR_WHITE / 10000, PL_COLOR_SDR_WHITE, 0); + + const struct pl_color_space bogus_vals = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, + .hdr.min_luma = 1e-9, + .hdr.max_luma = 1000000, + }; + + const struct pl_color_space bogus_flip = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_PQ, + .hdr.min_luma = 4000, + .hdr.max_luma = 0.05, + }; + + const struct pl_color_space bogus_sign = { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, + .hdr.min_luma = -0.5, + .hdr.max_luma = -4000, + }; + + TEST_METADATA(bogus_vals, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 10000, 0); + TEST_METADATA(bogus_flip, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, 10000, 0); + TEST_METADATA(bogus_sign, PL_HDR_METADATA_HDR10, PL_COLOR_HDR_BLACK, PL_COLOR_HLG_PEAK, 0); +} diff --git a/src/tests/common.c b/src/tests/common.c new file mode 100644 index 0000000..849971e --- /dev/null +++ b/src/tests/common.c @@ -0,0 +1,136 @@ +#include "tests.h" + +static int irand() +{ + return rand() - RAND_MAX / 2; +} + +int main() +{ + pl_log log = pl_test_logger(); + pl_log_update(log, NULL); + pl_log_destroy(&log); + + // Test some misc helper functions + pl_rect2d rc2 = { + irand(), irand(), + irand(), irand(), + }; + + pl_rect3d rc3 = { + irand(), irand(), irand(), + irand(), irand(), irand(), + }; + + pl_rect2d_normalize(&rc2); + REQUIRE_CMP(rc2.x1, >=, rc2.x0, "d"); + REQUIRE_CMP(rc2.y1, >=, rc2.y0, "d"); + + pl_rect3d_normalize(&rc3); + REQUIRE_CMP(rc3.x1, >=, rc3.x0, "d"); + REQUIRE_CMP(rc3.y1, >=, rc3.y0, "d"); + REQUIRE_CMP(rc3.z1, >=, rc3.z0, "d"); + + pl_rect2df rc2f = { + RANDOM, RANDOM, + RANDOM, RANDOM, + }; + + pl_rect3df rc3f = { + RANDOM, RANDOM, RANDOM, + RANDOM, RANDOM, RANDOM, + }; + + pl_rect2df_normalize(&rc2f); + REQUIRE_CMP(rc2f.x1, >=, rc2f.x0, "f"); + REQUIRE_CMP(rc2f.y1, >=, rc2f.y0, "f"); + + pl_rect3df_normalize(&rc3f); + REQUIRE_CMP(rc3f.x1, >=, rc3f.x0, "f"); + REQUIRE_CMP(rc3f.y1, >=, rc3f.y0, "f"); + REQUIRE_CMP(rc3f.z1, >=, rc3f.z0, "f"); + + pl_rect2d rc2r = pl_rect2df_round(&rc2f); + pl_rect3d rc3r = pl_rect3df_round(&rc3f); + + REQUIRE_CMP(fabs(rc2r.x0 - rc2f.x0), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc2r.x1 - rc2f.x1), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc2r.y0 - rc2f.y0), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc2r.y1 - rc2f.y1), <=, 0.5, "f"); + + REQUIRE_CMP(fabs(rc3r.x0 - rc3f.x0), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc3r.x1 - rc3f.x1), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc3r.y0 - rc3f.y0), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc3r.y1 - rc3f.y1), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc3r.z0 - rc3f.z0), <=, 0.5, "f"); + REQUIRE_CMP(fabs(rc3r.z1 - rc3f.z1), <=, 0.5, "f"); + + pl_transform3x3 tr = { + .mat = {{ + { RANDOM, RANDOM, RANDOM }, + { RANDOM, RANDOM, RANDOM }, + { RANDOM, RANDOM, RANDOM }, + }}, + .c = { RANDOM, RANDOM, RANDOM }, + }; + + pl_transform3x3 tr2 = tr; + float scale = 1.0 + RANDOM; + pl_transform3x3_scale(&tr2, scale); + pl_transform3x3_invert(&tr2); + pl_transform3x3_invert(&tr2); + pl_transform3x3_scale(&tr2, 1.0 / scale); + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + printf("%f %f\n", tr.mat.m[i][j], tr2.mat.m[i][j]); + REQUIRE_FEQ(tr.mat.m[i][j], tr2.mat.m[i][j], 1e-4); + } + REQUIRE_FEQ(tr.c[i], tr2.c[i], 1e-4); + } + + // Test aspect ratio code + const pl_rect2df rc1080p 
= {0, 0, 1920, 1080}; + const pl_rect2df rc43 = {0, 0, 1024, 768}; + pl_rect2df rc; + + REQUIRE_FEQ(pl_rect2df_aspect(&rc1080p), 16.0/9.0, 1e-8); + REQUIRE_FEQ(pl_rect2df_aspect(&rc43), 4.0/3.0, 1e-8); + +#define pl_rect2df_midx(rc) (((rc).x0 + (rc).x1) / 2.0) +#define pl_rect2df_midy(rc) (((rc).y0 + (rc).y1) / 2.0) + + for (float aspect = 0.2; aspect < 3.0; aspect += 0.4) { + for (float scan = 0.0; scan <= 1.0; scan += 0.5) { + rc = rc1080p; + pl_rect2df_aspect_set(&rc, aspect, scan); + printf("aspect %.2f, panscan %.1f: {%f %f} -> {%f %f}\n", + aspect, scan, rc.x0, rc.y0, rc.x1, rc.y1); + REQUIRE_FEQ(pl_rect2df_aspect(&rc), aspect, 1e-6); + REQUIRE_FEQ(pl_rect2df_midx(rc), pl_rect2df_midx(rc1080p), 1e-6); + REQUIRE_FEQ(pl_rect2df_midy(rc), pl_rect2df_midy(rc1080p), 1e-6); + } + } + + rc = rc1080p; + pl_rect2df_aspect_fit(&rc, &rc43, 0.0); + REQUIRE_FEQ(pl_rect2df_aspect(&rc), pl_rect2df_aspect(&rc43), 1e-6); + REQUIRE_FEQ(pl_rect2df_midx(rc), pl_rect2df_midx(rc1080p), 1e-6); + REQUIRE_FEQ(pl_rect2df_midy(rc), pl_rect2df_midy(rc1080p), 1e-6); + REQUIRE_FEQ(pl_rect_w(rc), pl_rect_w(rc43), 1e-6); + REQUIRE_FEQ(pl_rect_h(rc), pl_rect_h(rc43), 1e-6); + + rc = rc43; + pl_rect2df_aspect_fit(&rc, &rc1080p, 0.0); + REQUIRE_FEQ(pl_rect2df_aspect(&rc), pl_rect2df_aspect(&rc1080p), 1e-6); + REQUIRE_FEQ(pl_rect2df_midx(rc), pl_rect2df_midx(rc43), 1e-6); + REQUIRE_FEQ(pl_rect2df_midy(rc), pl_rect2df_midy(rc43), 1e-6); + REQUIRE_FEQ(pl_rect_w(rc), pl_rect_w(rc43), 1e-6); + + rc = (pl_rect2df) { 1920, 1080, 0, 0 }; + pl_rect2df_offset(&rc, 50, 100); + REQUIRE_FEQ(rc.x0, 1870, 1e-6); + REQUIRE_FEQ(rc.x1, -50, 1e-6); + REQUIRE_FEQ(rc.y0, 980, 1e-6); + REQUIRE_FEQ(rc.y1, -100, 1e-6); +} diff --git a/src/tests/d3d11.c b/src/tests/d3d11.c new file mode 100644 index 0000000..256af35 --- /dev/null +++ b/src/tests/d3d11.c @@ -0,0 +1,59 @@ +#include "gpu_tests.h" +#include "d3d11/gpu.h" +#include <dxgi1_2.h> + +#include <libplacebo/d3d11.h> + +int main() +{ + pl_log log = pl_test_logger(); + IDXGIFactory1 *factory = NULL; + IDXGIAdapter1 *adapter1 = NULL; + HRESULT hr; + + HMODULE dxgi = LoadLibraryW(L"dxgi.dll"); + if (!dxgi) + return SKIP; + + __typeof__(&CreateDXGIFactory1) pCreateDXGIFactory1 = + (void *) GetProcAddress(dxgi, "CreateDXGIFactory1"); + if (!pCreateDXGIFactory1) + return SKIP; + + hr = pCreateDXGIFactory1(&IID_IDXGIFactory1, (void **) &factory); + if (FAILED(hr)) { + printf("Failed to create DXGI factory\n"); + return SKIP; + } + + // Test all attached devices + for (int i = 0;; i++) { + hr = IDXGIFactory1_EnumAdapters1(factory, i, &adapter1); + if (hr == DXGI_ERROR_NOT_FOUND) + break; + if (FAILED(hr)) { + printf("Failed to enumerate adapters\n"); + return SKIP; + } + + DXGI_ADAPTER_DESC1 desc; + hr = IDXGIAdapter1_GetDesc1(adapter1, &desc); + if (FAILED(hr)) { + printf("Failed to enumerate adapters\n"); + return SKIP; + } + SAFE_RELEASE(adapter1); + + const struct pl_d3d11_t *d3d11 = pl_d3d11_create(log, pl_d3d11_params( + .debug = true, + .adapter_luid = desc.AdapterLuid, + )); + REQUIRE(d3d11); + + gpu_shader_tests(d3d11->gpu); + + pl_d3d11_destroy(&d3d11); + } + + SAFE_RELEASE(factory); +} diff --git a/src/tests/dav1d.c b/src/tests/dav1d.c new file mode 100644 index 0000000..7e2439f --- /dev/null +++ b/src/tests/dav1d.c @@ -0,0 +1,45 @@ +#include "tests.h" +#include "libplacebo/utils/dav1d.h" + +int main() +{ + // Test enum functions + for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) { + // Exceptions to the rule, due to different handling in dav1d + if (sys == 
PL_COLOR_SYSTEM_BT_2100_HLG || sys == PL_COLOR_SYSTEM_XYZ) + continue; + + enum Dav1dMatrixCoefficients mc = pl_system_to_dav1d(sys); + enum pl_color_system sys2 = pl_system_from_dav1d(mc); + if (sys2) + REQUIRE_CMP(sys, ==, sys2, "u"); + } + + for (enum pl_color_levels lev = 0; lev < PL_COLOR_LEVELS_COUNT; lev++) { + int range = pl_levels_to_dav1d(lev); + enum pl_color_levels lev2 = pl_levels_from_dav1d(range); + if (lev != PL_COLOR_LEVELS_UNKNOWN) + REQUIRE_CMP(lev, ==, lev2, "u"); + } + + for (enum pl_color_primaries prim = 0; prim < PL_COLOR_PRIM_COUNT; prim++) { + enum Dav1dColorPrimaries dpri = pl_primaries_to_dav1d(prim); + enum pl_color_primaries prim2 = pl_primaries_from_dav1d(dpri); + if (prim2) + REQUIRE_CMP(prim, ==, prim2, "u"); + } + + for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) { + enum Dav1dTransferCharacteristics dtrc = pl_transfer_to_dav1d(trc); + enum pl_color_transfer trc2 = pl_transfer_from_dav1d(dtrc); + if (trc2) + REQUIRE_CMP(trc, ==, trc2, "u"); + } + + for (enum pl_chroma_location loc = 0; loc < PL_CHROMA_COUNT; loc++) { + enum Dav1dChromaSamplePosition dloc = pl_chroma_to_dav1d(loc); + enum pl_chroma_location loc2 = pl_chroma_from_dav1d(dloc); + if (loc2) + REQUIRE_CMP(loc, ==, loc2, "u"); + } +} diff --git a/src/tests/dither.c b/src/tests/dither.c new file mode 100644 index 0000000..c9f639c --- /dev/null +++ b/src/tests/dither.c @@ -0,0 +1,41 @@ +#include "tests.h" + +#include <libplacebo/dither.h> +#include <libplacebo/shaders/dithering.h> + +#define SHIFT 4 +#define SIZE (1 << SHIFT) +float data[SIZE][SIZE]; + +int main() +{ + printf("Ordered dither matrix:\n"); + pl_generate_bayer_matrix(&data[0][0], SIZE); + for (int y = 0; y < SIZE; y++) { + for (int x = 0; x < SIZE; x++) + printf(" %3d", (int)(data[y][x] * SIZE * SIZE)); + printf("\n"); + } + + printf("Blue noise dither matrix:\n"); + pl_generate_blue_noise(&data[0][0], SHIFT); + for (int y = 0; y < SIZE; y++) { + for (int x = 0; x < SIZE; x++) + printf(" %3d", (int)(data[y][x] * SIZE * SIZE)); + printf("\n"); + } + + // Generate an example of a dither shader + pl_log log = pl_test_logger(); + pl_shader sh = pl_shader_alloc(log, NULL); + pl_shader_obj obj = NULL; + + pl_shader_dither(sh, 8, &obj, NULL); + const struct pl_shader_res *res = pl_shader_finalize(sh); + REQUIRE(res); + printf("Generated dither shader:\n%s\n", res->glsl); + + pl_shader_obj_destroy(&obj); + pl_shader_free(&sh); + pl_log_destroy(&log); +} diff --git a/src/tests/dummy.c b/src/tests/dummy.c new file mode 100644 index 0000000..0e87a2c --- /dev/null +++ b/src/tests/dummy.c @@ -0,0 +1,70 @@ +#include "gpu_tests.h" + +#include <libplacebo/dummy.h> + +int main() +{ + pl_log log = pl_test_logger(); + pl_gpu gpu = pl_gpu_dummy_create(log, NULL); + pl_buffer_tests(gpu); + pl_texture_tests(gpu); + + // Attempt creating a shader and accessing the resulting LUT + pl_tex dummy = pl_tex_dummy_create(gpu, pl_tex_dummy_params( + .w = 100, + .h = 100, + .format = pl_find_named_fmt(gpu, "rgba8"), + )); + + struct pl_sample_src src = { + .tex = dummy, + .new_w = 1000, + .new_h = 1000, + }; + + pl_shader_obj lut = NULL; + struct pl_sample_filter_params filter_params = { + .filter = pl_filter_ewa_lanczos, + .lut = &lut, + }; + + pl_shader sh = pl_shader_alloc(log, pl_shader_params( .gpu = gpu )); + REQUIRE(pl_shader_sample_polar(sh, &src, &filter_params)); + const struct pl_shader_res *res = pl_shader_finalize(sh); + REQUIRE(res); + + for (int n = 0; n < res->num_descriptors; n++) { + const struct pl_shader_desc *sd = 
&res->descriptors[n]; + if (sd->desc.type != PL_DESC_SAMPLED_TEX) + continue; + + pl_tex tex = sd->binding.object; + const float *data = (float *) pl_tex_dummy_data(tex); + if (!data) + continue; // means this was the `dummy` texture + +#ifdef PRINT_LUTS + for (int i = 0; i < tex->params.w; i++) + printf("lut[%d] = %f\n", i, data[i]); +#endif + } + + // Try out generation of the sampler2D interface + src.tex = NULL; + src.tex_w = 100; + src.tex_h = 100; + src.format = PL_FMT_UNORM; + src.sampler = PL_SAMPLER_NORMAL; + src.mode = PL_TEX_SAMPLE_LINEAR; + + pl_shader_reset(sh, pl_shader_params( .gpu = gpu )); + REQUIRE(pl_shader_sample_polar(sh, &src, &filter_params)); + REQUIRE((res = pl_shader_finalize(sh))); + REQUIRE_CMP(res->input, ==, PL_SHADER_SIG_SAMPLER, "u"); + + pl_shader_free(&sh); + pl_shader_obj_destroy(&lut); + pl_tex_destroy(gpu, &dummy); + pl_gpu_dummy_destroy(&gpu); + pl_log_destroy(&log); +} diff --git a/src/tests/filters.c b/src/tests/filters.c new file mode 100644 index 0000000..b6b323c --- /dev/null +++ b/src/tests/filters.c @@ -0,0 +1,81 @@ +#include "tests.h" + +#include <libplacebo/filters.h> + +int main() +{ + pl_log log = pl_test_logger(); + + for (int i = 0; i < pl_num_filter_functions; i++) { + const struct pl_filter_function *fun = pl_filter_functions[i]; + if (fun->opaque) + continue; + + printf("Testing filter function '%s'\n", fun->name); + + struct pl_filter_ctx ctx = { .radius = fun->radius }; + memcpy(ctx.params, fun->params, sizeof(ctx.params)); + + // Ensure the kernel is correctly scaled + REQUIRE_FEQ(fun->weight(&ctx, 0.0), 1.0, 1e-7); + + // Only box filters are radius 1, these are unwindowed by design. + // Gaussian technically never reaches 0 even at its preconfigured radius. + if (fun->radius > 1.0 && fun != &pl_filter_function_gaussian) + REQUIRE_FEQ(fun->weight(&ctx, fun->radius), 0.0, 1e-7); + } + + for (int c = 0; c < pl_num_filter_configs; c++) { + const struct pl_filter_config *conf = pl_filter_configs[c]; + if (conf->kernel->opaque) + continue; + + printf("Testing filter config '%s'\n", conf->name); + pl_filter flt = pl_filter_generate(log, pl_filter_params( + .config = *conf, + .lut_entries = 256, + .cutoff = 1e-3, + )); + REQUIRE(flt); + const float radius = PL_DEF(conf->radius, conf->kernel->radius); + REQUIRE_CMP(flt->radius, <=, radius, "f"); + REQUIRE_CMP(flt->radius_zero, >, 0.0, "f"); + REQUIRE_CMP(flt->radius_zero, <=, flt->radius, "f"); + + if (conf->polar) { + + // Test LUT accuracy + const int range = flt->params.lut_entries - 1; + double scale = flt->weights[0] / pl_filter_sample(conf, 0.0); + double err = 0.0; + for (float k = 0.0; k <= 1.0; k += 1e-3f) { + double ref = scale * pl_filter_sample(conf, k * flt->radius); + double idx = k * range; + int base = floorf(idx); + double fpart = idx - base; + int next = PL_MIN(base + 1, range); + double interp = PL_MIX(flt->weights[base], flt->weights[next], fpart); + err = fmaxf(err, fabs(interp - ref)); + } + REQUIRE_CMP(err, <=, 1e-4, "g"); + + } else { + + // Ensure the weights for each row add up to unity + for (int i = 0; i < flt->params.lut_entries; i++) { + const float *row = flt->weights + i * flt->row_stride; + float sum = 0.0; + REQUIRE(flt->row_size); + REQUIRE_CMP(flt->row_stride, >=, flt->row_size, "d"); + for (int n = 0; n < flt->row_size; n++) + sum += row[n]; + REQUIRE_FEQ(sum, 1.0, 1e-6); + } + + } + + pl_filter_free(&flt); + } + + pl_log_destroy(&log); +} diff --git a/src/tests/fuzz/lut.c b/src/tests/fuzz/lut.c new file mode 100644 index 0000000..24e5f89 --- /dev/null 
+++ b/src/tests/fuzz/lut.c @@ -0,0 +1,24 @@ +#include "../tests.h" + +#include <libplacebo/shaders/lut.h> + +__AFL_FUZZ_INIT(); + +#pragma clang optimize off + +int main() +{ + struct pl_custom_lut *lut; + +#ifdef __AFL_HAVE_MANUAL_CONTROL + __AFL_INIT(); +#endif + + unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF; + + while (__AFL_LOOP(100000)) { + size_t len = __AFL_FUZZ_TESTCASE_LEN; + lut = pl_lut_parse_cube(NULL, (char *) buf, len); + pl_lut_free(&lut); + } +} diff --git a/src/tests/fuzz/options.c b/src/tests/fuzz/options.c new file mode 100644 index 0000000..c88e462 --- /dev/null +++ b/src/tests/fuzz/options.c @@ -0,0 +1,26 @@ +#include "../tests.h" + +#include <libplacebo/options.h> + +__AFL_FUZZ_INIT(); + +#pragma clang optimize off + +int main() +{ + pl_options opts = pl_options_alloc(NULL); + +#ifdef __AFL_HAVE_MANUAL_CONTROL + __AFL_INIT(); +#endif + + unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF; + + while (__AFL_LOOP(100000)) { + size_t len = __AFL_FUZZ_TESTCASE_LEN; + buf[len - 1] = '\0'; // ensure proper null termination + pl_options_load(opts, (const char *) buf); + pl_options_save(opts); + pl_options_reset(opts, NULL); + } +} diff --git a/src/tests/fuzz/shaders.c b/src/tests/fuzz/shaders.c new file mode 100644 index 0000000..2e3e92c --- /dev/null +++ b/src/tests/fuzz/shaders.c @@ -0,0 +1,166 @@ +#include "../tests.h" +#include "shaders.h" + +#include <libplacebo/dummy.h> +#include <libplacebo/shaders/colorspace.h> +#include <libplacebo/shaders/custom.h> +#include <libplacebo/shaders/sampling.h> + +__AFL_FUZZ_INIT(); + +#pragma clang optimize off + +int main() +{ + pl_gpu gpu = pl_gpu_dummy_create(NULL, NULL); + +#define WIDTH 64 +#define HEIGHT 64 +#define COMPS 4 + + static const float empty[HEIGHT][WIDTH][COMPS] = {0}; + + struct pl_sample_src src = { + .tex = pl_tex_create(gpu, pl_tex_params( + .format = pl_find_fmt(gpu, PL_FMT_FLOAT, COMPS, 0, 32, PL_FMT_CAP_SAMPLEABLE), + .initial_data = empty, + .sampleable = true, + .w = WIDTH, + .h = HEIGHT, + )), + .new_w = WIDTH * 2, + .new_h = HEIGHT * 2, + }; + + if (!src.tex) + return 1; + +#ifdef __AFL_HAVE_MANUAL_CONTROL + __AFL_INIT(); +#endif + + unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF; + while (__AFL_LOOP(10000)) { + +#define STACK_SIZE 16 + pl_shader stack[STACK_SIZE] = {0}; + int idx = 0; + + stack[0] = pl_shader_alloc(NULL, pl_shader_params( + .gpu = gpu, + )); + + pl_shader sh = stack[idx]; + pl_shader_obj polar = NULL, ortho = NULL, peak = NULL, dither = NULL; + + size_t len = __AFL_FUZZ_TESTCASE_LEN; + for (size_t pos = 0; pos < len; pos++) { + switch (buf[pos]) { + // Sampling steps + case 'S': + pl_shader_sample_direct(sh, &src); + break; + case 'D': + pl_shader_deband(sh, &src, NULL); + break; + case 'P': + pl_shader_sample_polar(sh, &src, pl_sample_filter_params( + .filter = pl_filter_ewa_lanczos, + .lut = &polar, + )); + break; + case 'O': ; + struct pl_sample_src srcfix = src; + srcfix.new_w = WIDTH; + pl_shader_sample_ortho2(sh, &srcfix, pl_sample_filter_params( + .filter = pl_filter_spline36, + .lut = &ortho, + )); + break; + case 'X': + pl_shader_custom(sh, &(struct pl_custom_shader) { + .input = PL_SHADER_SIG_NONE, + .output = PL_SHADER_SIG_COLOR, + .body = "// merge subpasses", + }); + break; + + // Colorspace transformation steps + case 'y': { + struct pl_color_repr repr = pl_color_repr_jpeg; + pl_shader_decode_color(sh, &repr, NULL); + break; + } + case 'p': + pl_shader_detect_peak(sh, pl_color_space_hdr10, &peak, NULL); + break; + case 'm': + pl_shader_color_map(sh, NULL, pl_color_space_bt709, + 
pl_color_space_monitor, NULL, false); + break; + case 't': + pl_shader_color_map(sh, NULL, pl_color_space_hdr10, + pl_color_space_monitor, &peak, false); + break; + case 'd': + pl_shader_dither(sh, 8, &dither, pl_dither_params( + // Picked to speed up calculation + .method = PL_DITHER_ORDERED_LUT, + .lut_size = 2, + )); + break; + + // Push and pop subshader commands + case '(': + if (idx+1 == STACK_SIZE) + goto invalid; + + idx++; + if (!stack[idx]) { + stack[idx] = pl_shader_alloc(NULL, pl_shader_params( + .gpu = gpu, + .id = idx, + )); + } + sh = stack[idx]; + break; + + case ')': + if (idx == 0) + goto invalid; + + idx--; + sh_subpass(stack[idx], stack[idx + 1]); + pl_shader_reset(stack[idx + 1], pl_shader_params( + .gpu = gpu, + .id = idx + 1, + )); + sh = stack[idx]; + break; + + default: + goto invalid; + } + } + + // Merge remaining shaders + while (idx > 0) { + sh_subpass(stack[idx - 1], stack[idx]); + idx--; + } + + pl_shader_finalize(stack[0]); + +invalid: + for (int i = 0; i < STACK_SIZE; i++) + pl_shader_free(&stack[i]); + + pl_shader_obj_destroy(&polar); + pl_shader_obj_destroy(&ortho); + pl_shader_obj_destroy(&peak); + pl_shader_obj_destroy(&dither); + } + + pl_tex_destroy(gpu, &src.tex); + pl_gpu_dummy_destroy(&gpu); +} diff --git a/src/tests/fuzz/user_shaders.c b/src/tests/fuzz/user_shaders.c new file mode 100644 index 0000000..bbb98c8 --- /dev/null +++ b/src/tests/fuzz/user_shaders.c @@ -0,0 +1,28 @@ +#include "../tests.h" + +#include <libplacebo/dummy.h> +#include <libplacebo/shaders/custom.h> + +__AFL_FUZZ_INIT(); + +#pragma clang optimize off + +int main() +{ + pl_gpu gpu = pl_gpu_dummy_create(NULL, NULL); + const struct pl_hook *hook; + +#ifdef __AFL_HAVE_MANUAL_CONTROL + __AFL_INIT(); +#endif + + unsigned char *buf = __AFL_FUZZ_TESTCASE_BUF; + + while (__AFL_LOOP(100000)) { + size_t len = __AFL_FUZZ_TESTCASE_LEN; + hook = pl_mpv_user_shader_parse(gpu, (char *) buf, len); + pl_mpv_user_shader_destroy(&hook); + } + + pl_gpu_dummy_destroy(&gpu); +} diff --git a/src/tests/gpu_tests.h b/src/tests/gpu_tests.h new file mode 100644 index 0000000..f14f260 --- /dev/null +++ b/src/tests/gpu_tests.h @@ -0,0 +1,1741 @@ +#include "tests.h" +#include "shaders.h" + +#include <libplacebo/renderer.h> +#include <libplacebo/utils/frame_queue.h> +#include <libplacebo/utils/upload.h> + +//#define PRINT_OUTPUT + +static void pl_buffer_tests(pl_gpu gpu) +{ + const size_t buf_size = 1024; + if (buf_size > gpu->limits.max_buf_size) + return; + + uint8_t *test_src = malloc(buf_size * 2); + uint8_t *test_dst = test_src + buf_size; + assert(test_src && test_dst); + memset(test_dst, 0, buf_size); + for (int i = 0; i < buf_size; i++) + test_src[i] = RANDOM_U8; + + pl_buf buf = NULL, tbuf = NULL; + + printf("test buffer static creation and readback\n"); + buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .host_readable = true, + .initial_data = test_src, + )); + + REQUIRE(buf); + REQUIRE(pl_buf_read(gpu, buf, 0, test_dst, buf_size)); + REQUIRE_MEMEQ(test_src, test_dst, buf_size); + pl_buf_destroy(gpu, &buf); + + printf("test buffer empty creation, update and readback\n"); + memset(test_dst, 0, buf_size); + buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .host_writable = true, + .host_readable = true, + )); + + REQUIRE(buf); + pl_buf_write(gpu, buf, 0, test_src, buf_size); + REQUIRE(pl_buf_read(gpu, buf, 0, test_dst, buf_size)); + REQUIRE_MEMEQ(test_src, test_dst, buf_size); + pl_buf_destroy(gpu, &buf); + + printf("test buffer-buffer copy and readback\n"); + 
memset(test_dst, 0, buf_size); + buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .initial_data = test_src, + )); + + tbuf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .host_readable = true, + )); + + REQUIRE(buf && tbuf); + pl_buf_copy(gpu, tbuf, 0, buf, 0, buf_size); + REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size)); + REQUIRE_MEMEQ(test_src, test_dst, buf_size); + pl_buf_destroy(gpu, &buf); + pl_buf_destroy(gpu, &tbuf); + + if (buf_size <= gpu->limits.max_mapped_size) { + printf("test host mapped buffer readback\n"); + buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .host_mapped = true, + .initial_data = test_src, + )); + + REQUIRE(buf); + REQUIRE(!pl_buf_poll(gpu, buf, 0)); + REQUIRE_MEMEQ(test_src, buf->data, buf_size); + pl_buf_destroy(gpu, &buf); + } + + // `compute_queues` check is to exclude dummy GPUs here + if (buf_size <= gpu->limits.max_ssbo_size && gpu->limits.compute_queues) + { + printf("test endian swapping\n"); + buf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .storable = true, + .initial_data = test_src, + )); + + tbuf = pl_buf_create(gpu, pl_buf_params( + .size = buf_size, + .storable = true, + .host_readable = true, + )); + + REQUIRE(buf && tbuf); + REQUIRE(pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) { + .src = buf, + .dst = tbuf, + .size = buf_size, + .wordsize = 2, + })); + REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size)); + for (int i = 0; i < buf_size / 2; i++) { + REQUIRE_CMP(test_src[2 * i + 0], ==, test_dst[2 * i + 1], PRIu8); + REQUIRE_CMP(test_src[2 * i + 1], ==, test_dst[2 * i + 0], PRIu8); + } + // test endian swap in-place + REQUIRE(pl_buf_copy_swap(gpu, &(struct pl_buf_copy_swap_params) { + .src = tbuf, + .dst = tbuf, + .size = buf_size, + .wordsize = 4, + })); + REQUIRE(pl_buf_read(gpu, tbuf, 0, test_dst, buf_size)); + for (int i = 0; i < buf_size / 4; i++) { + REQUIRE_CMP(test_src[4 * i + 0], ==, test_dst[4 * i + 2], PRIu8); + REQUIRE_CMP(test_src[4 * i + 1], ==, test_dst[4 * i + 3], PRIu8); + REQUIRE_CMP(test_src[4 * i + 2], ==, test_dst[4 * i + 0], PRIu8); + REQUIRE_CMP(test_src[4 * i + 3], ==, test_dst[4 * i + 1], PRIu8); + } + pl_buf_destroy(gpu, &buf); + pl_buf_destroy(gpu, &tbuf); + } + + free(test_src); +} + +static void test_cb(void *priv) +{ + bool *flag = priv; + *flag = true; +} + +static void pl_test_roundtrip(pl_gpu gpu, pl_tex tex[2], + uint8_t *src, uint8_t *dst) +{ + if (!tex[0] || !tex[1]) { + printf("failed creating test textures... skipping this test\n"); + return; + } + + int texels = tex[0]->params.w; + texels *= tex[0]->params.h ? tex[0]->params.h : 1; + texels *= tex[0]->params.d ? tex[0]->params.d : 1; + + pl_fmt fmt = tex[0]->params.format; + size_t bytes = texels * fmt->texel_size; + memset(src, 0, bytes); + memset(dst, 0, bytes); + + for (size_t i = 0; i < bytes; i++) + src[i] = RANDOM_U8; + + pl_timer ul, dl; + ul = pl_timer_create(gpu); + dl = pl_timer_create(gpu); + + bool ran_ul = false, ran_dl = false; + + REQUIRE(pl_tex_upload(gpu, &(struct pl_tex_transfer_params){ + .tex = tex[0], + .ptr = src, + .timer = ul, + .callback = gpu->limits.callbacks ? 
test_cb : NULL, + .priv = &ran_ul, + })); + + // Test blitting, if possible for this format + pl_tex dst_tex = tex[0]; + if (tex[0]->params.blit_src && tex[1]->params.blit_dst) { + pl_tex_clear_ex(gpu, tex[1], (union pl_clear_color){0}); // for testing + pl_tex_blit(gpu, &(struct pl_tex_blit_params) { + .src = tex[0], + .dst = tex[1], + }); + dst_tex = tex[1]; + } + + REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params){ + .tex = dst_tex, + .ptr = dst, + .timer = dl, + .callback = gpu->limits.callbacks ? test_cb : NULL, + .priv = &ran_dl, + })); + + pl_gpu_finish(gpu); + if (gpu->limits.callbacks) + REQUIRE(ran_ul && ran_dl); + + if (fmt->emulated && fmt->type == PL_FMT_FLOAT) { + // TODO: can't memcmp here because bits might be lost due to the + // emulated 16/32 bit upload paths, figure out a better way to + // generate data and verify the roundtrip! + } else { + REQUIRE_MEMEQ(src, dst, bytes); + } + + // Report timer results + printf("upload time: %"PRIu64", download time: %"PRIu64"\n", + pl_timer_query(gpu, ul), pl_timer_query(gpu, dl)); + + pl_timer_destroy(gpu, &ul); + pl_timer_destroy(gpu, &dl); +} + +static void pl_texture_tests(pl_gpu gpu) +{ + const size_t max_size = 16*16*16 * 4 *sizeof(double); + uint8_t *test_src = malloc(max_size * 2); + uint8_t *test_dst = test_src + max_size; + + for (int f = 0; f < gpu->num_formats; f++) { + pl_fmt fmt = gpu->formats[f]; + if (fmt->opaque || !(fmt->caps & PL_FMT_CAP_HOST_READABLE)) + continue; + + printf("testing texture roundtrip for format %s\n", fmt->name); + assert(fmt->texel_size <= 4 * sizeof(double)); + + struct pl_tex_params ref_params = { + .format = fmt, + .blit_src = (fmt->caps & PL_FMT_CAP_BLITTABLE), + .blit_dst = (fmt->caps & PL_FMT_CAP_BLITTABLE), + .host_writable = true, + .host_readable = true, + .debug_tag = PL_DEBUG_TAG, + }; + + pl_tex tex[2]; + + if (gpu->limits.max_tex_1d_dim >= 16) { + printf("... 1D\n"); + struct pl_tex_params params = ref_params; + params.w = 16; + if (!gpu->limits.blittable_1d_3d) + params.blit_src = params.blit_dst = false; + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + tex[i] = pl_tex_create(gpu, ¶ms); + pl_test_roundtrip(gpu, tex, test_src, test_dst); + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + pl_tex_destroy(gpu, &tex[i]); + } + + if (gpu->limits.max_tex_2d_dim >= 16) { + printf("... 2D\n"); + struct pl_tex_params params = ref_params; + params.w = params.h = 16; + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + tex[i] = pl_tex_create(gpu, ¶ms); + pl_test_roundtrip(gpu, tex, test_src, test_dst); + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + pl_tex_destroy(gpu, &tex[i]); + } + + if (gpu->limits.max_tex_3d_dim >= 16) { + printf("... 
3D\n"); + struct pl_tex_params params = ref_params; + params.w = params.h = params.d = 16; + if (!gpu->limits.blittable_1d_3d) + params.blit_src = params.blit_dst = false; + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + tex[i] = pl_tex_create(gpu, ¶ms); + pl_test_roundtrip(gpu, tex, test_src, test_dst); + for (int i = 0; i < PL_ARRAY_SIZE(tex); i++) + pl_tex_destroy(gpu, &tex[i]); + } + } + + free(test_src); +} + +static void pl_planar_tests(pl_gpu gpu) +{ + pl_fmt fmt = pl_find_named_fmt(gpu, "g8_b8_r8_420"); + if (!fmt) + return; + REQUIRE_CMP(fmt->num_planes, ==, 3, "d"); + + const int width = 64, height = 32; + pl_tex tex = pl_tex_create(gpu, pl_tex_params( + .w = width, + .h = height, + .format = fmt, + .blit_dst = true, + .host_readable = true, + )); + if (!tex) + return; + for (int i = 0; i < fmt->num_planes; i++) + REQUIRE(tex->planes[i]); + + pl_tex plane = tex->planes[1]; + uint8_t data[(width * height) >> 2]; + REQUIRE_CMP(plane->params.w * plane->params.h, ==, PL_ARRAY_SIZE(data), "d"); + + pl_tex_clear(gpu, plane, (float[]){ (float) 0x80 / 0xFF, 0.0, 0.0, 1.0 }); + REQUIRE(pl_tex_download(gpu, pl_tex_transfer_params( + .tex = plane, + .ptr = data, + ))); + + uint8_t ref[PL_ARRAY_SIZE(data)]; + memset(ref, 0x80, sizeof(ref)); + REQUIRE_MEMEQ(data, ref, PL_ARRAY_SIZE(data)); + + pl_tex_destroy(gpu, &tex); +} + +static void pl_shader_tests(pl_gpu gpu) +{ + if (gpu->glsl.version < 410) + return; + + const char *vert_shader = + "#version 410 \n" + "layout(location=0) in vec2 vertex_pos; \n" + "layout(location=1) in vec3 vertex_color; \n" + "layout(location=0) out vec3 frag_color; \n" + "void main() { \n" + " gl_Position = vec4(vertex_pos, 0, 1); \n" + " frag_color = vertex_color; \n" + "}"; + + const char *frag_shader = + "#version 410 \n" + "layout(location=0) in vec3 frag_color; \n" + "layout(location=0) out vec4 out_color; \n" + "void main() { \n" + " out_color = vec4(frag_color, 1.0); \n" + "}"; + + pl_fmt fbo_fmt; + enum pl_fmt_caps caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_BLITTABLE | + PL_FMT_CAP_LINEAR; + + fbo_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 4, 16, 32, caps); + if (!fbo_fmt) + return; + +#define FBO_W 16 +#define FBO_H 16 + + pl_tex fbo; + fbo = pl_tex_create(gpu, &(struct pl_tex_params) { + .format = fbo_fmt, + .w = FBO_W, + .h = FBO_H, + .renderable = true, + .storable = !!(fbo_fmt->caps & PL_FMT_CAP_STORABLE), + .host_readable = true, + .blit_dst = true, + }); + REQUIRE(fbo); + + pl_tex_clear_ex(gpu, fbo, (union pl_clear_color){0}); + + pl_fmt vert_fmt; + vert_fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3); + REQUIRE(vert_fmt); + + static const struct vertex { float pos[2]; float color[3]; } vertices[] = { + {{-1.0, -1.0}, {0, 0, 0}}, + {{ 1.0, -1.0}, {1, 0, 0}}, + {{-1.0, 1.0}, {0, 1, 0}}, + {{ 1.0, 1.0}, {1, 1, 0}}, + }; + + pl_pass pass; + pass = pl_pass_create(gpu, &(struct pl_pass_params) { + .type = PL_PASS_RASTER, + .target_format = fbo_fmt, + .vertex_shader = vert_shader, + .glsl_shader = frag_shader, + + .vertex_type = PL_PRIM_TRIANGLE_STRIP, + .vertex_stride = sizeof(struct vertex), + .num_vertex_attribs = 2, + .vertex_attribs = (struct pl_vertex_attrib[]) {{ + .name = "vertex_pos", + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + .location = 0, + .offset = offsetof(struct vertex, pos), + }, { + .name = "vertex_color", + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3), + .location = 1, + .offset = offsetof(struct vertex, color), + }}, + }); + REQUIRE(pass); + if (pass->params.cached_program || pass->params.cached_program_len) { + // Ensure both are 
set if either one is set + REQUIRE(pass->params.cached_program); + REQUIRE(pass->params.cached_program_len); + } + + pl_timer timer = pl_timer_create(gpu); + pl_pass_run(gpu, &(struct pl_pass_run_params) { + .pass = pass, + .target = fbo, + .vertex_count = PL_ARRAY_SIZE(vertices), + .vertex_data = vertices, + .timer = timer, + }); + + // Wait until this pass is complete and report the timer result + pl_gpu_finish(gpu); + printf("timer query result: %"PRIu64"\n", pl_timer_query(gpu, timer)); + pl_timer_destroy(gpu, &timer); + + static float test_data[FBO_H * FBO_W * 4] = {0}; + + // Test against the known pattern of `src`, only useful for roundtrip tests +#define TEST_FBO_PATTERN(eps, fmt, ...) \ + do { \ + printf("testing pattern of " fmt "\n", __VA_ARGS__); \ + REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) { \ + .tex = fbo, \ + .ptr = test_data, \ + })); \ + \ + for (int y = 0; y < FBO_H; y++) { \ + for (int x = 0; x < FBO_W; x++) { \ + float *color = &test_data[(y * FBO_W + x) * 4]; \ + REQUIRE_FEQ(color[0], (x + 0.5) / FBO_W, eps); \ + REQUIRE_FEQ(color[1], (y + 0.5) / FBO_H, eps); \ + REQUIRE_FEQ(color[2], 0.0, eps); \ + REQUIRE_FEQ(color[3], 1.0, eps); \ + } \ + } \ + } while (0) + + TEST_FBO_PATTERN(1e-6, "%s", "initial rendering"); + + if (sizeof(vertices) <= gpu->limits.max_vbo_size) { + // Test the use of an explicit vertex buffer + pl_buf vert = pl_buf_create(gpu, &(struct pl_buf_params) { + .size = sizeof(vertices), + .initial_data = vertices, + .drawable = true, + }); + + REQUIRE(vert); + pl_pass_run(gpu, &(struct pl_pass_run_params) { + .pass = pass, + .target = fbo, + .vertex_count = sizeof(vertices) / sizeof(struct vertex), + .vertex_buf = vert, + .buf_offset = 0, + }); + + pl_buf_destroy(gpu, &vert); + TEST_FBO_PATTERN(1e-6, "%s", "using vertex buffer"); + } + + // Test the use of index buffers + static const uint16_t indices[] = { 3, 2, 1, 0 }; + pl_pass_run(gpu, &(struct pl_pass_run_params) { + .pass = pass, + .target = fbo, + .vertex_count = PL_ARRAY_SIZE(indices), + .vertex_data = vertices, + .index_data = indices, + }); + + pl_pass_destroy(gpu, &pass); + TEST_FBO_PATTERN(1e-6, "%s", "using indexed rendering"); + + // Test the use of pl_dispatch + pl_dispatch dp = pl_dispatch_create(gpu->log, gpu); + pl_shader sh = pl_dispatch_begin(dp); + REQUIRE(pl_shader_custom(sh, &(struct pl_custom_shader) { + .body = "color = vec4(col, 1.0);", + .input = PL_SHADER_SIG_NONE, + .output = PL_SHADER_SIG_COLOR, + })); + + REQUIRE(pl_dispatch_vertex(dp, &(struct pl_dispatch_vertex_params) { + .shader = &sh, + .target = fbo, + .vertex_stride = sizeof(struct vertex), + .vertex_position_idx = 0, + .num_vertex_attribs = 2, + .vertex_attribs = (struct pl_vertex_attrib[]) {{ + .name = "pos", + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 2), + .offset = offsetof(struct vertex, pos), + }, { + .name = "col", + .fmt = pl_find_vertex_fmt(gpu, PL_FMT_FLOAT, 3), + .offset = offsetof(struct vertex, color), + }}, + + .vertex_type = PL_PRIM_TRIANGLE_STRIP, + .vertex_coords = PL_COORDS_NORMALIZED, + .vertex_count = PL_ARRAY_SIZE(vertices), + .vertex_data = vertices, + })); + + TEST_FBO_PATTERN(1e-6, "%s", "using custom vertices"); + + static float src_data[FBO_H * FBO_W * 4] = {0}; + memcpy(src_data, test_data, sizeof(src_data)); + + pl_tex src; + src = pl_tex_create(gpu, &(struct pl_tex_params) { + .format = fbo_fmt, + .w = FBO_W, + .h = FBO_H, + .storable = fbo->params.storable, + .sampleable = true, + .initial_data = src_data, + }); + + if (fbo->params.storable) { + // Test 1x1 blit, 
to make sure the scaling code runs + REQUIRE(pl_tex_blit_compute(gpu, &(struct pl_tex_blit_params) { + .src = src, + .dst = fbo, + .src_rc = {0, 0, 0, 1, 1, 1}, + .dst_rc = {0, 0, 0, FBO_W, FBO_H, 1}, + .sample_mode = PL_TEX_SAMPLE_NEAREST, + })); + + // Test non-resizing blit, which uses the efficient imageLoad path + REQUIRE(pl_tex_blit_compute(gpu, &(struct pl_tex_blit_params) { + .src = src, + .dst = fbo, + .src_rc = {0, 0, 0, FBO_W, FBO_H, 1}, + .dst_rc = {0, 0, 0, FBO_W, FBO_H, 1}, + .sample_mode = PL_TEX_SAMPLE_NEAREST, + })); + + TEST_FBO_PATTERN(1e-6, "%s", "pl_tex_blit_compute"); + } + + // Test encoding/decoding of all gamma functions, color spaces, etc. + for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) { + struct pl_color_space test_csp = { + .transfer = trc, + .hdr.min_luma = PL_COLOR_HDR_BLACK, + }; + sh = pl_dispatch_begin(dp); + pl_shader_sample_nearest(sh, pl_sample_src( .tex = src )); + pl_shader_delinearize(sh, &test_csp); + pl_shader_linearize(sh, &test_csp); + REQUIRE(pl_dispatch_finish(dp, pl_dispatch_params( + .shader = &sh, + .target = fbo, + ))); + + float epsilon = pl_color_transfer_is_hdr(trc) ? 1e-4 : 1e-6; + TEST_FBO_PATTERN(epsilon, "transfer function %d", (int) trc); + } + + for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) { + if (sys == PL_COLOR_SYSTEM_DOLBYVISION) + continue; // requires metadata + sh = pl_dispatch_begin(dp); + pl_shader_sample_nearest(sh, pl_sample_src( .tex = src )); + pl_shader_encode_color(sh, &(struct pl_color_repr) { .sys = sys }); + pl_shader_decode_color(sh, &(struct pl_color_repr) { .sys = sys }, NULL); + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + + float epsilon; + switch (sys) { + case PL_COLOR_SYSTEM_BT_2020_C: + case PL_COLOR_SYSTEM_XYZ: + epsilon = 1e-5; + break; + + case PL_COLOR_SYSTEM_BT_2100_PQ: + case PL_COLOR_SYSTEM_BT_2100_HLG: + // These seem to be horrifically noisy and prone to breaking on + // edge cases for some reason + // TODO: figure out why! 
+ continue; + + default: epsilon = 1e-6; break; + } + + TEST_FBO_PATTERN(epsilon, "color system %d", (int) sys); + } + + // Repeat this a few times to test the caching + pl_cache cache = pl_cache_create(pl_cache_params( .log = gpu->log )); + pl_gpu_set_cache(gpu, cache); + for (int i = 0; i < 10; i++) { + if (i == 5) { + printf("Recreating pl_dispatch to test the caching\n"); + size_t size = pl_dispatch_save(dp, NULL); + REQUIRE(size); + uint8_t *cache_data = malloc(size); + REQUIRE(cache_data); + REQUIRE_CMP(pl_dispatch_save(dp, cache_data), ==, size, "zu"); + + pl_dispatch_destroy(&dp); + dp = pl_dispatch_create(gpu->log, gpu); + pl_dispatch_load(dp, cache_data); + + // Test to make sure the pass regenerates the same cache + uint64_t hash = pl_str_hash((pl_str) { cache_data, size }); + REQUIRE_CMP(pl_dispatch_save(dp, NULL), ==, size, "zu"); + REQUIRE_CMP(pl_dispatch_save(dp, cache_data), ==, size, "zu"); + REQUIRE_CMP(pl_str_hash((pl_str) { cache_data, size }), ==, hash, PRIu64); + free(cache_data); + } + + sh = pl_dispatch_begin(dp); + + // For testing, force the use of CS if possible + if (gpu->glsl.compute) { + sh->type = SH_COMPUTE; + sh->group_size[0] = 8; + sh->group_size[1] = 8; + } + + pl_shader_deband(sh, pl_sample_src( .tex = src ), pl_deband_params( + .iterations = 0, + .grain = 0.0, + )); + + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + TEST_FBO_PATTERN(1e-6, "deband iter %d", i); + } + + pl_gpu_set_cache(gpu, NULL); + pl_cache_destroy(&cache); + + // Test peak detection and readback if possible + sh = pl_dispatch_begin(dp); + pl_shader_sample_nearest(sh, pl_sample_src( .tex = src )); + + pl_shader_obj peak_state = NULL; + struct pl_color_space csp_gamma22 = { .transfer = PL_COLOR_TRC_GAMMA22 }; + struct pl_peak_detect_params peak_params = { .minimum_peak = 0.01 }; + if (pl_shader_detect_peak(sh, csp_gamma22, &peak_state, &peak_params)) { + REQUIRE(pl_dispatch_compute(dp, &(struct pl_dispatch_compute_params) { + .shader = &sh, + .width = fbo->params.w, + .height = fbo->params.h, + })); + + float peak, avg; + REQUIRE(pl_get_detected_peak(peak_state, &peak, &avg)); + + float real_peak = 0, real_avg = 0; + for (int y = 0; y < FBO_H; y++) { + for (int x = 0; x < FBO_W; x++) { + float *color = &src_data[(y * FBO_W + x) * 4]; + float luma = 0.212639f * powf(color[0], 2.2f) + + 0.715169f * powf(color[1], 2.2f) + + 0.072192f * powf(color[2], 2.2f); + luma = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, luma); + real_peak = PL_MAX(real_peak, luma); + real_avg += luma; + } + } + real_avg = real_avg / (FBO_W * FBO_H); + + real_avg = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, real_avg); + real_peak = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, real_peak); + REQUIRE_FEQ(peak, real_peak, 1e-3); + REQUIRE_FEQ(avg, real_avg, 1e-2); + } + + pl_dispatch_abort(dp, &sh); + pl_shader_obj_destroy(&peak_state); + + // Test film grain synthesis + pl_shader_obj grain = NULL; + struct pl_film_grain_params grain_params = { + .tex = src, + .components = 3, + .component_mapping = { 0, 1, 2}, + .repr = &(struct pl_color_repr) { + .sys = PL_COLOR_SYSTEM_BT_709, + .levels = PL_COLOR_LEVELS_LIMITED, + .bits = { .color_depth = 10, .sample_depth = 10 }, + }, + }; + + for (int i = 0; i < 2; i++) { + grain_params.data.type = PL_FILM_GRAIN_AV1; + grain_params.data.params.av1 = av1_grain_data; + grain_params.data.params.av1.overlap = !!i; + grain_params.data.seed = rand(); + + sh = pl_dispatch_begin(dp); + pl_shader_film_grain(sh, &grain, &grain_params); + 
REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + } + + if (gpu->glsl.compute) { + grain_params.data.type = PL_FILM_GRAIN_H274; + grain_params.data.params.h274 = h274_grain_data; + grain_params.data.seed = rand(); + + sh = pl_dispatch_begin(dp); + pl_shader_film_grain(sh, &grain, &grain_params); + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + } + pl_shader_obj_destroy(&grain); + + // Test custom shaders + struct pl_custom_shader custom = { + .header = + "vec3 invert(vec3 color) \n" + "{ \n" + " return vec3(1.0) - color; \n" + "} \n", + + .body = + "color = vec4(gl_FragCoord.xy, 0.0, 1.0); \n" + "color.rgb = invert(color.rgb) + offset; \n", + + .input = PL_SHADER_SIG_NONE, + .output = PL_SHADER_SIG_COLOR, + + .num_variables = 1, + .variables = &(struct pl_shader_var) { + .var = pl_var_float("offset"), + .data = &(float) { 0.1 }, + }, + }; + + sh = pl_dispatch_begin(dp); + REQUIRE(pl_shader_custom(sh, &custom)); + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + + // Test dolbyvision + struct pl_color_repr repr = { + .sys = PL_COLOR_SYSTEM_DOLBYVISION, + .dovi = &dovi_meta, + }; + + sh = pl_dispatch_begin(dp); + pl_shader_sample_direct(sh, pl_sample_src( .tex = src )); + pl_shader_decode_color(sh, &repr, NULL); + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + + // Test deinterlacing + sh = pl_dispatch_begin(dp); + pl_shader_deinterlace(sh, pl_deinterlace_source( .cur = pl_field_pair(src) ), NULL); + REQUIRE(pl_dispatch_finish(dp, pl_dispatch_params( + .shader = &sh, + .target = fbo, + ))); + + // Test error diffusion + if (fbo->params.storable) { + for (int i = 0; i < pl_num_error_diffusion_kernels; i++) { + const struct pl_error_diffusion_kernel *k = pl_error_diffusion_kernels[i]; + printf("testing error diffusion kernel '%s'\n", k->name); + sh = pl_dispatch_begin(dp); + bool ok = pl_shader_error_diffusion(sh, pl_error_diffusion_params( + .input_tex = src, + .output_tex = fbo, + .new_depth = 8, + .kernel = k, + )); + + if (!ok) { + fprintf(stderr, "kernel '%s' exceeds GPU limits, skipping...\n", k->name); + continue; + } + + REQUIRE(pl_dispatch_compute(dp, pl_dispatch_compute_params( + .shader = &sh, + .dispatch_size = {1, 1, 1}, + ))); + } + } + + pl_dispatch_destroy(&dp); + pl_tex_destroy(gpu, &src); + pl_tex_destroy(gpu, &fbo); +} + +static void pl_scaler_tests(pl_gpu gpu) +{ + pl_fmt src_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 1, 16, 32, PL_FMT_CAP_LINEAR); + pl_fmt fbo_fmt = pl_find_fmt(gpu, PL_FMT_FLOAT, 1, 16, 32, PL_FMT_CAP_RENDERABLE); + if (!src_fmt || !fbo_fmt) + return; + + float *fbo_data = NULL; + pl_shader_obj lut = NULL; + + static float data_5x5[5][5] = { + { 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0 }, + { 0, 0, 1, 0, 0 }, + { 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0 }, + }; + + pl_tex dot5x5 = pl_tex_create(gpu, &(struct pl_tex_params) { + .w = 5, + .h = 5, + .format = src_fmt, + .sampleable = true, + .initial_data = &data_5x5[0][0], + }); + + struct pl_tex_params fbo_params = { + .w = 100, + .h = 100, + .format = fbo_fmt, + .renderable = true, + .storable = fbo_fmt->caps & PL_FMT_CAP_STORABLE, + .host_readable = fbo_fmt->caps & PL_FMT_CAP_HOST_READABLE, + }; + + pl_tex fbo = pl_tex_create(gpu, &fbo_params); + pl_dispatch dp = pl_dispatch_create(gpu->log, gpu); + if (!dot5x5 || !fbo || !dp) + goto error; + + pl_shader sh = pl_dispatch_begin(dp); + 
REQUIRE(pl_shader_sample_polar(sh, + pl_sample_src( + .tex = dot5x5, + .new_w = fbo->params.w, + .new_h = fbo->params.h, + ), + pl_sample_filter_params( + .filter = pl_filter_ewa_lanczos, + .lut = &lut, + .no_compute = !fbo->params.storable, + ) + )); + REQUIRE(pl_dispatch_finish(dp, &(struct pl_dispatch_params) { + .shader = &sh, + .target = fbo, + })); + + if (fbo->params.host_readable) { + fbo_data = malloc(fbo->params.w * fbo->params.h * sizeof(float)); + REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) { + .tex = fbo, + .ptr = fbo_data, + })); + +#ifdef PRINT_OUTPUT + int max = 255; + printf("P2\n%d %d\n%d\n", fbo->params.w, fbo->params.h, max); + for (int y = 0; y < fbo->params.h; y++) { + for (int x = 0; x < fbo->params.w; x++) { + float v = fbo_data[y * fbo->params.h + x]; + printf("%d ", (int) round(fmin(fmax(v, 0.0), 1.0) * max)); + } + printf("\n"); + } +#endif + } + +error: + free(fbo_data); + pl_shader_obj_destroy(&lut); + pl_dispatch_destroy(&dp); + pl_tex_destroy(gpu, &dot5x5); + pl_tex_destroy(gpu, &fbo); +} + +static const char *user_shader_tests[] = { + // Test hooking, saving and loading + "// Example of a comment at the beginning \n" + " \n" + "//!HOOK NATIVE \n" + "//!DESC upscale image \n" + "//!BIND HOOKED \n" + "//!WIDTH HOOKED.w 10 * \n" + "//!HEIGHT HOOKED.h 10 * \n" + "//!SAVE NATIVEBIG \n" + "//!WHEN NATIVE.w 500 < \n" + " \n" + "vec4 hook() \n" + "{ \n" + " return HOOKED_texOff(0); \n" + "} \n" + " \n" + "//!HOOK MAIN \n" + "//!DESC downscale bigger image \n" + "//!WHEN NATIVE.w 500 < \n" + "//!BIND NATIVEBIG \n" + " \n" + "vec4 hook() \n" + "{ \n" + " return NATIVEBIG_texOff(0); \n" + "} \n", + + // Test use of textures + "//!HOOK MAIN \n" + "//!DESC turn everything into colorful pixels \n" + "//!BIND HOOKED \n" + "//!BIND DISCO \n" + "//!COMPONENTS 3 \n" + " \n" + "vec4 hook() \n" + "{ \n" + " return vec4(DISCO_tex(HOOKED_pos * 10.0).rgb, 1); \n" + "} \n" + " \n" + "//!TEXTURE DISCO \n" + "//!SIZE 3 3 \n" + "//!FORMAT rgba8 \n" + "//!FILTER NEAREST \n" + "//!BORDER REPEAT \n" + "ff0000ff00ff00ff0000ffff00ffffffff00ffffffff00ff4c4c4cff999999ffffffffff\n" + + // Test custom parameters + "//!PARAM test \n" + "//!DESC test parameter \n" + "//!TYPE DYNAMIC float \n" + "//!MINIMUM 0.0 \n" + "//!MAXIMUM 100.0 \n" + "1.0 \n" + " \n" + "//!PARAM testconst \n" + "//!TYPE CONSTANT uint \n" + "//!MAXIMUM 16 \n" + "3 \n" + " \n" + "//!PARAM testdefine \n" + "//!TYPE DEFINE \n" + "100 \n" + " \n" + "//!PARAM testenum \n" + "//!TYPE ENUM DEFINE \n" + "FOO \n" + "BAR \n" + " \n" + "//!HOOK MAIN \n" + "//!WHEN testconst 30 > \n" + "#error should not be run \n" + " \n" + "//!HOOK MAIN \n" + "//!WHEN testenum FOO = \n" + "#if testenum == BAR \n" + " #error bad \n" + "#endif \n" + "vec4 hook() { return vec4(0.0); } \n" +}; + +static const char *compute_shader_tests[] = { + // Test use of storage/buffer resources + "//!HOOK MAIN \n" + "//!DESC attach some storage objects \n" + "//!BIND tex_storage \n" + "//!BIND buf_uniform \n" + "//!BIND buf_storage \n" + "//!COMPONENTS 4 \n" + " \n" + "vec4 hook() \n" + "{ \n" + " return vec4(foo, bar, bat); \n" + "} \n" + " \n" + "//!TEXTURE tex_storage \n" + "//!SIZE 100 100 \n" + "//!FORMAT r32f \n" + "//!STORAGE \n" + " \n" + "//!BUFFER buf_uniform \n" + "//!VAR float foo \n" + "//!VAR float bar \n" + "0000000000000000 \n" + " \n" + "//!BUFFER buf_storage \n" + "//!VAR vec2 bat \n" + "//!VAR int big[32]; \n" + "//!STORAGE \n", + +}; + +static const char *test_luts[] = { + + "TITLE \"1D identity\" \n" + "LUT_1D_SIZE 2 \n" + "0.0 
0.0 0.0 \n" + "1.0 1.0 1.0 \n", + + "TITLE \"3D identity\" \n" + "LUT_3D_SIZE 2 \n" + "0.0 0.0 0.0 \n" + "1.0 0.0 0.0 \n" + "0.0 1.0 0.0 \n" + "1.0 1.0 0.0 \n" + "0.0 0.0 1.0 \n" + "1.0 0.0 1.0 \n" + "0.0 1.0 1.0 \n" + "1.0 1.0 1.0 \n" + +}; + +static bool frame_passthrough(pl_gpu gpu, pl_tex *tex, + const struct pl_source_frame *src, struct pl_frame *out_frame) +{ + const struct pl_frame *frame = src->frame_data; + *out_frame = *frame; + return true; +} + +static enum pl_queue_status get_frame_ptr(struct pl_source_frame *out_frame, + const struct pl_queue_params *qparams) +{ + const struct pl_source_frame **pframe = qparams->priv; + if (!(*pframe)->frame_data) + return PL_QUEUE_EOF; + + *out_frame = *(*pframe)++; + return PL_QUEUE_OK; +} + +static void render_info_cb(void *priv, const struct pl_render_info *info) +{ + printf("{%d} Executed shader: %s\n", info->index, + info->pass->shader->description); +} + +static void pl_render_tests(pl_gpu gpu) +{ + pl_tex img_tex = NULL, fbo = NULL; + pl_renderer rr = NULL; + + enum { width = 50, height = 50 }; + static float data[width][height]; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) + data[y][x] = RANDOM; + } + + struct pl_plane img_plane = {0}; + struct pl_plane_data plane_data = { + .type = PL_FMT_FLOAT, + .width = width, + .height = height, + .component_size = { 8 * sizeof(float) }, + .component_map = { 0 }, + .pixel_stride = sizeof(float), + .pixels = data, + }; + + if (!pl_recreate_plane(gpu, NULL, &fbo, &plane_data)) + return; + + if (!pl_upload_plane(gpu, &img_plane, &img_tex, &plane_data)) + goto error; + + rr = pl_renderer_create(gpu->log, gpu); + pl_tex_clear_ex(gpu, fbo, (union pl_clear_color){0}); + + struct pl_frame image = { + .num_planes = 1, + .planes = { img_plane }, + .repr = { + .sys = PL_COLOR_SYSTEM_BT_709, + .levels = PL_COLOR_LEVELS_FULL, + }, + .color = pl_color_space_srgb, + }; + + struct pl_frame target = { + .num_planes = 1, + .planes = {{ + .texture = fbo, + .components = 3, + .component_mapping = {0, 1, 2}, + }}, + .repr = { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, + .bits.color_depth = 32, + }, + .color = pl_color_space_srgb, + }; + + REQUIRE(pl_render_image(rr, &image, &target, NULL)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + // TODO: embed a reference texture and ensure it matches + + // Test a bunch of different params +#define TEST(SNAME, STYPE, DEFAULT, FIELD, LIMIT) \ + do { \ + for (int i = 0; i <= LIMIT; i++) { \ + printf("testing `" #STYPE "." 
#FIELD " = %d`\n", i); \ + struct pl_render_params params = pl_render_default_params; \ + params.force_dither = true; \ + struct STYPE tmp = DEFAULT; \ + tmp.FIELD = i; \ + params.SNAME = &tmp; \ + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); \ + pl_gpu_flush(gpu); \ + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); \ + } \ + } while (0) + +#define TEST_PARAMS(NAME, FIELD, LIMIT) \ + TEST(NAME##_params, pl_##NAME##_params, pl_##NAME##_default_params, FIELD, LIMIT) + + image.crop.x1 = width / 2.0; + image.crop.y1 = height / 2.0; + for (int i = 0; i < pl_num_scale_filters; i++) { + struct pl_render_params params = pl_render_default_params; + params.upscaler = pl_scale_filters[i].filter; + printf("testing `params.upscaler = /* %s */`\n", pl_scale_filters[i].name); + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + pl_gpu_flush(gpu); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + } + image.crop.x1 = image.crop.y1 = 0; + + target.crop.x1 = width / 2.0; + target.crop.y1 = height / 2.0; + for (int i = 0; i < pl_num_scale_filters; i++) { + struct pl_render_params params = pl_render_default_params; + params.downscaler = pl_scale_filters[i].filter; + printf("testing `params.downscaler = /* %s */`\n", pl_scale_filters[i].name); + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + pl_gpu_flush(gpu); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + } + target.crop.x1 = target.crop.y1 = 0; + + TEST_PARAMS(deband, iterations, 3); + TEST_PARAMS(sigmoid, center, 1); + TEST_PARAMS(color_map, intent, PL_INTENT_ABSOLUTE_COLORIMETRIC); + TEST_PARAMS(dither, method, PL_DITHER_WHITE_NOISE); + TEST_PARAMS(dither, temporal, true); + TEST_PARAMS(distort, alpha_mode, PL_ALPHA_INDEPENDENT); + TEST_PARAMS(distort, constrain, true); + TEST_PARAMS(distort, bicubic, true); + TEST(cone_params, pl_cone_params, pl_vision_deuteranomaly, strength, 0); + + // Test gamma-correct dithering + target.repr.bits.color_depth = 2; + TEST_PARAMS(dither, transfer, PL_COLOR_TRC_GAMMA22); + target.repr.bits.color_depth = 32; + + // Test HDR tone mapping + image.color = pl_color_space_hdr10; + TEST_PARAMS(color_map, visualize_lut, true); + if (gpu->limits.max_ssbo_size) + TEST_PARAMS(peak_detect, allow_delayed, true); + + // Test inverse tone-mapping and pure BPC + image.color.hdr.max_luma = 1000; + target.color.hdr.max_luma = 4000; + target.color.hdr.min_luma = 0.02; + TEST_PARAMS(color_map, inverse_tone_mapping, true); + + image.color = pl_color_space_srgb; + target.color = pl_color_space_srgb; + + // Test some misc stuff + struct pl_render_params params = pl_render_default_params; + params.color_adjustment = &(struct pl_color_adjustment) { + .brightness = 0.1, + .contrast = 0.9, + .saturation = 1.5, + .gamma = 0.8, + .temperature = 0.3, + }; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + params = pl_render_default_params; + + struct pl_frame inferred_image = image, inferred_target = target; + pl_frames_infer(rr, &inferred_image, &inferred_target); + REQUIRE(pl_render_image(rr, &inferred_image, &inferred_target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + // Test background blending and alpha transparency + params.blend_against_tiles = true; + params.corner_rounding = 0.25f; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + params = pl_render_default_params; + + // Test film 
grain synthesis + image.film_grain.type = PL_FILM_GRAIN_AV1; + image.film_grain.params.av1 = av1_grain_data; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + image.film_grain.type = PL_FILM_GRAIN_H274; + image.film_grain.params.h274 = h274_grain_data; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + // H.274 film grain synthesis requires compute shaders + if (gpu->glsl.compute) { + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + } else { + const struct pl_render_errors rr_err = pl_renderer_get_errors(rr); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_FILM_GRAIN); + pl_renderer_reset_errors(rr, &rr_err); + } + image.film_grain = (struct pl_film_grain_data) {0}; + + // Test mpv-style custom shaders + for (int i = 0; i < PL_ARRAY_SIZE(user_shader_tests); i++) { + printf("testing user shader:\n\n%s\n", user_shader_tests[i]); + const struct pl_hook *hook; + hook = pl_mpv_user_shader_parse(gpu, user_shader_tests[i], + strlen(user_shader_tests[i])); + REQUIRE(hook); + + params.hooks = &hook; + params.num_hooks = 1; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + pl_mpv_user_shader_destroy(&hook); + } + + if (gpu->glsl.compute && gpu->limits.max_ssbo_size) { + for (int i = 0; i < PL_ARRAY_SIZE(compute_shader_tests); i++) { + printf("testing user shader:\n\n%s\n", compute_shader_tests[i]); + const struct pl_hook *hook; + hook = pl_mpv_user_shader_parse(gpu, compute_shader_tests[i], + strlen(compute_shader_tests[i])); + REQUIRE(hook); + + params.hooks = &hook; + params.num_hooks = 1; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + pl_mpv_user_shader_destroy(&hook); + } + } + params = pl_render_default_params; + + // Test custom LUTs + for (int i = 0; i < PL_ARRAY_SIZE(test_luts); i++) { + printf("testing custom lut %d\n", i); + struct pl_custom_lut *lut; + lut = pl_lut_parse_cube(gpu->log, test_luts[i], strlen(test_luts[i])); + REQUIRE(lut); + + bool has_3dlut = gpu->limits.max_tex_3d_dim && gpu->glsl.version > 100; + if (lut->size[2] && !has_3dlut) { + pl_lut_free(&lut); + continue; + } + + // Test all three at the same time to reduce the number of tests + image.lut = target.lut = params.lut = lut; + + for (enum pl_lut_type t = PL_LUT_UNKNOWN; t <= PL_LUT_CONVERSION; t++) { + printf("testing LUT method %d\n", t); + image.lut_type = target.lut_type = params.lut_type = t; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + } + + image.lut = target.lut = params.lut = NULL; + pl_lut_free(&lut); + } + +#ifdef PL_HAVE_LCMS + + // It doesn't fit without use of 3D textures on GLES2 + if (gpu->glsl.version > 100) { + // Test ICC profiles + image.profile = TEST_PROFILE(sRGB_v2_nano_icc); + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + image.profile = (struct pl_icc_profile) {0}; + + target.profile = TEST_PROFILE(sRGB_v2_nano_icc); + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + target.profile = (struct pl_icc_profile) {0}; + + image.profile = TEST_PROFILE(sRGB_v2_nano_icc); + target.profile = image.profile; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == 
PL_RENDER_ERR_NONE); + image.profile = (struct pl_icc_profile) {0}; + target.profile = (struct pl_icc_profile) {0}; + } + +#endif + + // Test overlays + image.num_overlays = 1; + image.overlays = &(struct pl_overlay) { + .tex = img_plane.texture, + .mode = PL_OVERLAY_NORMAL, + .num_parts = 2, + .parts = (struct pl_overlay_part[]) {{ + .src = {0, 0, 2, 2}, + .dst = {30, 100, 40, 200}, + }, { + .src = {2, 2, 5, 5}, + .dst = {1000, -1, 3, 5}, + }}, + }; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + params.disable_fbos = true; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + image.num_overlays = 0; + params = pl_render_default_params; + + target.num_overlays = 1; + target.overlays = &(struct pl_overlay) { + .tex = img_plane.texture, + .mode = PL_OVERLAY_MONOCHROME, + .num_parts = 1, + .parts = &(struct pl_overlay_part) { + .src = {5, 5, 15, 15}, + .dst = {5, 5, 15, 15}, + .color = {1.0, 0.5, 0.0}, + }, + }; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + REQUIRE(pl_render_image(rr, NULL, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + target.num_overlays = 0; + + // Test rotation + for (pl_rotation rot = 0; rot < PL_ROTATION_360; rot += PL_ROTATION_90) { + image.rotation = rot; + REQUIRE(pl_render_image(rr, &image, &target, ¶ms)); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + } + + // Attempt frame mixing, using the mixer queue helper + printf("testing frame mixing \n"); + struct pl_render_params mix_params = { + .frame_mixer = &pl_filter_mitchell_clamp, + .info_callback = render_info_cb, + }; + + struct pl_queue_params qparams = { + .radius = pl_frame_mix_radius(&mix_params), + .vsync_duration = 1.0 / 60.0, + }; + + // Test large PTS jumps in frame mix + struct pl_frame_mix mix = (struct pl_frame_mix) { + .num_frames = 2, + .frames = (const struct pl_frame *[]) { &image, &image }, + .signatures = (uint64_t[]) { 0xFFF1, 0xFFF2 }, + .timestamps = (float[]) { -100, 100 }, + .vsync_duration = 1.6, + }; + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + + // Test inferring frame mix + inferred_target = target; + pl_frames_infer_mix(rr, &mix, &inferred_target, &inferred_image); + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + + // Test empty frame mix + mix = (struct pl_frame_mix) {0}; + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + + // Test inferring empty frame mix + inferred_target = target; + pl_frames_infer_mix(rr, &mix, &inferred_target, &inferred_image); + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + + // Test mixer queue +#define NUM_MIX_FRAMES 20 + const float frame_duration = 1.0 / 24.0; + struct pl_source_frame srcframes[NUM_MIX_FRAMES+1]; + srcframes[NUM_MIX_FRAMES] = (struct pl_source_frame) {0}; + for (int i = 0; i < NUM_MIX_FRAMES; i++) { + srcframes[i] = (struct pl_source_frame) { + .pts = i * frame_duration, + .duration = frame_duration, + .map = frame_passthrough, + .frame_data = &image, + }; + } + + pl_queue queue = pl_queue_create(gpu); + enum pl_queue_status ret; + + // Test pre-pushing all frames, with delayed EOF. 
+ for (int i = 0; i < NUM_MIX_FRAMES; i++) { + const struct pl_source_frame *src = &srcframes[i]; + if (i > 10) // test pushing in reverse order + src = &srcframes[NUM_MIX_FRAMES + 10 - i]; + if (!pl_queue_push_block(queue, 1, src)) // mini-sleep + pl_queue_push(queue, src); // push it anyway, for testing + } + + while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) { + if (ret == PL_QUEUE_MORE) { + REQUIRE_CMP(qparams.pts, >, 0.0f, "f"); + pl_queue_push(queue, NULL); // push delayed EOF + continue; + } + + REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u"); + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + + // Simulate advancing vsync + qparams.pts += qparams.vsync_duration; + } + + // Test dynamically pulling all frames, with oversample mixer + const struct pl_source_frame *frame_ptr = &srcframes[0]; + mix_params.frame_mixer = &pl_oversample_frame_mixer; + + qparams = (struct pl_queue_params) { + .radius = pl_frame_mix_radius(&mix_params), + .vsync_duration = qparams.vsync_duration, + .get_frame = get_frame_ptr, + .priv = &frame_ptr, + }; + + pl_queue_reset(queue); + while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) { + REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u"); + REQUIRE_CMP(mix.num_frames, <=, 2, "d"); + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + qparams.pts += qparams.vsync_duration; + } + + // Test large PTS jump + pl_queue_reset(queue); + REQUIRE(pl_queue_update(queue, &mix, &qparams) == PL_QUEUE_EOF); + + // Test deinterlacing + pl_queue_reset(queue); + printf("testing deinterlacing \n"); + for (int i = 0; i < NUM_MIX_FRAMES; i++) { + struct pl_source_frame *src = &srcframes[i]; + if (i > 10) + src = &srcframes[NUM_MIX_FRAMES + 10 - i]; + src->first_field = PL_FIELD_EVEN; + pl_queue_push(queue, src); + } + pl_queue_push(queue, NULL); + + qparams.pts = 0; + qparams.get_frame = NULL; + while ((ret = pl_queue_update(queue, &mix, &qparams)) != PL_QUEUE_EOF) { + REQUIRE_CMP(ret, ==, PL_QUEUE_OK, "u"); + REQUIRE(pl_render_image_mix(rr, &mix, &target, &mix_params)); + qparams.pts += qparams.vsync_duration; + } + + pl_queue_destroy(&queue); + +error: + pl_renderer_destroy(&rr); + pl_tex_destroy(gpu, &img_tex); + pl_tex_destroy(gpu, &fbo); +} + +static struct pl_hook_res noop_hook(void *priv, const struct pl_hook_params *params) +{ + return (struct pl_hook_res) {0}; +} + +static void pl_ycbcr_tests(pl_gpu gpu) +{ + struct pl_plane_data data[3]; + for (int i = 0; i < 3; i++) { + const int sub = i > 0 ? 
1 : 0; + const int width = (323 + sub) >> sub; + const int height = (255 + sub) >> sub; + + data[i] = (struct pl_plane_data) { + .type = PL_FMT_UNORM, + .width = width, + .height = height, + .component_size = {16}, + .component_map = {i}, + .pixel_stride = sizeof(uint16_t), + .row_stride = PL_ALIGN2(width * sizeof(uint16_t), + gpu->limits.align_tex_xfer_pitch), + }; + } + + pl_fmt fmt = pl_plane_find_fmt(gpu, NULL, &data[0]); + enum pl_fmt_caps caps = PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_HOST_READABLE; + if (!fmt || (fmt->caps & caps) != caps) + return; + + pl_renderer rr = pl_renderer_create(gpu->log, gpu); + if (!rr) + return; + + pl_tex src_tex[3] = {0}; + pl_tex dst_tex[3] = {0}; + struct pl_frame img = { + .num_planes = 3, + .repr = pl_color_repr_hdtv, + .color = pl_color_space_bt709, + }; + + struct pl_frame target = { + .num_planes = 3, + .repr = pl_color_repr_hdtv, + .color = pl_color_space_bt709, + }; + + uint8_t *src_buffer[3] = {0}; + uint8_t *dst_buffer = NULL; + for (int i = 0; i < 3; i++) { + // Generate some arbitrary data for the buffer + src_buffer[i] = malloc(data[i].height * data[i].row_stride); + if (!src_buffer[i]) + goto error; + + data[i].pixels = src_buffer[i]; + for (int y = 0; y < data[i].height; y++) { + for (int x = 0; x < data[i].width; x++) { + size_t off = y * data[i].row_stride + x * data[i].pixel_stride; + uint16_t *pixel = (uint16_t *) &src_buffer[i][off]; + int gx = 200 + 100 * i, gy = 300 + 150 * i; + *pixel = (gx * x) ^ (gy * y); // whatever + } + } + + REQUIRE(pl_upload_plane(gpu, &img.planes[i], &src_tex[i], &data[i])); + } + + // This co-sites chroma pixels with pixels in the RGB image, meaning we + // get an exact round-trip when sampling both ways. This makes it useful + // as a test case, even though it's not common in the real world. 
+ pl_frame_set_chroma_location(&img, PL_CHROMA_TOP_LEFT); + + for (int i = 0; i < 3; i++) { + dst_tex[i] = pl_tex_create(gpu, &(struct pl_tex_params) { + .format = fmt, + .w = data[i].width, + .h = data[i].height, + .renderable = true, + .host_readable = true, + .storable = fmt->caps & PL_FMT_CAP_STORABLE, + .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE, + }); + + if (!dst_tex[i]) + goto error; + + target.planes[i] = img.planes[i]; + target.planes[i].texture = dst_tex[i]; + } + + REQUIRE(pl_render_image(rr, &img, &target, &(struct pl_render_params) { + .num_hooks = 1, + .hooks = &(const struct pl_hook *){&(struct pl_hook) { + // Forces chroma merging, to test the chroma merging code + .stages = PL_HOOK_CHROMA_INPUT, + .hook = noop_hook, + }}, + })); + REQUIRE(pl_renderer_get_errors(rr).errors == PL_RENDER_ERR_NONE); + + size_t buf_size = data[0].height * data[0].row_stride; + dst_buffer = malloc(buf_size); + if (!dst_buffer) + goto error; + + for (int i = 0; i < 3; i++) { + memset(dst_buffer, 0xAA, buf_size); + REQUIRE(pl_tex_download(gpu, &(struct pl_tex_transfer_params) { + .tex = dst_tex[i], + .ptr = dst_buffer, + .row_pitch = data[i].row_stride, + })); + + for (int y = 0; y < data[i].height; y++) { + for (int x = 0; x < data[i].width; x++) { + size_t off = y * data[i].row_stride + x * data[i].pixel_stride; + uint16_t *src_pixel = (uint16_t *) &src_buffer[i][off]; + uint16_t *dst_pixel = (uint16_t *) &dst_buffer[off]; + int diff = abs((int) *src_pixel - (int) *dst_pixel); + REQUIRE_CMP(diff, <=, 50, "d"); // a little under 0.1% + } + } + } + +error: + pl_renderer_destroy(&rr); + free(dst_buffer); + for (int i = 0; i < 3; i++) { + free(src_buffer[i]); + pl_tex_destroy(gpu, &src_tex[i]); + pl_tex_destroy(gpu, &dst_tex[i]); + } +} + +static void pl_test_export_import(pl_gpu gpu, + enum pl_handle_type handle_type) +{ + // Test texture roundtrip + + if (!(gpu->export_caps.tex & handle_type) || + !(gpu->import_caps.tex & handle_type)) + goto skip_tex; + + pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 4, 0, 0, PL_FMT_CAP_BLITTABLE); + if (!fmt) + goto skip_tex; + + printf("testing texture import/export with fmt %s\n", fmt->name); + + pl_tex export = pl_tex_create(gpu, &(struct pl_tex_params) { + .w = 32, + .h = 32, + .format = fmt, + .export_handle = handle_type, + }); + REQUIRE(export); + REQUIRE_HANDLE(export->shared_mem, handle_type); + + pl_tex import = pl_tex_create(gpu, &(struct pl_tex_params) { + .w = export->params.w, + .h = export->params.h, + .format = fmt, + .import_handle = handle_type, + .shared_mem = export->shared_mem, + }); + REQUIRE(import); + + pl_tex_destroy(gpu, &import); + pl_tex_destroy(gpu, &export); + +skip_tex: ; + + // Test buffer roundtrip + + if (!(gpu->export_caps.buf & handle_type) || + !(gpu->import_caps.buf & handle_type)) + return; + + printf("testing buffer import/export\n"); + + pl_buf exp_buf = pl_buf_create(gpu, &(struct pl_buf_params) { + .size = 32, + .export_handle = handle_type, + }); + REQUIRE(exp_buf); + REQUIRE_HANDLE(exp_buf->shared_mem, handle_type); + + pl_buf imp_buf = pl_buf_create(gpu, &(struct pl_buf_params) { + .size = 32, + .import_handle = handle_type, + .shared_mem = exp_buf->shared_mem, + }); + REQUIRE(imp_buf); + + pl_buf_destroy(gpu, &imp_buf); + pl_buf_destroy(gpu, &exp_buf); +} + +static void pl_test_host_ptr(pl_gpu gpu) +{ + if (!(gpu->import_caps.buf & PL_HANDLE_HOST_PTR)) + return; + +#ifdef __unix__ + + printf("testing host ptr\n"); + REQUIRE(gpu->limits.max_mapped_size); + + const size_t size = 2 << 20; + const size_t offset = 2 << 
10; + const size_t slice = 2 << 16; + + uint8_t *data = aligned_alloc(0x1000, size); + for (int i = 0; i < size; i++) + data[i] = (uint8_t) i; + + pl_buf buf = pl_buf_create(gpu, &(struct pl_buf_params) { + .size = slice, + .import_handle = PL_HANDLE_HOST_PTR, + .shared_mem = { + .handle.ptr = data, + .size = size, + .offset = offset, + }, + .host_mapped = true, + }); + + REQUIRE(buf); + REQUIRE_MEMEQ(data + offset, buf->data, slice); + + pl_buf_destroy(gpu, &buf); + free(data); + +#endif // unix +} + +static void gpu_shader_tests(pl_gpu gpu) +{ + pl_buffer_tests(gpu); + pl_texture_tests(gpu); + pl_planar_tests(gpu); + pl_shader_tests(gpu); + pl_scaler_tests(gpu); + pl_render_tests(gpu); + pl_ycbcr_tests(gpu); + + REQUIRE(!pl_gpu_is_failed(gpu)); +} + +static void gpu_interop_tests(pl_gpu gpu) +{ + pl_test_export_import(gpu, PL_HANDLE_DMA_BUF); + pl_test_host_ptr(gpu); + + REQUIRE(!pl_gpu_is_failed(gpu)); +} diff --git a/src/tests/icc.c b/src/tests/icc.c new file mode 100644 index 0000000..188940b --- /dev/null +++ b/src/tests/icc.c @@ -0,0 +1,106 @@ +#include "tests.h" + +#include <libplacebo/shaders/icc.h> + +static const uint8_t DisplayP3_v2_micro_icc[] = { + 0x00, 0x00, 0x01, 0xc8, 0x6c, 0x63, 0x6d, 0x73, 0x02, 0x10, 0x00, 0x00, + 0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20, + 0x07, 0xe2, 0x00, 0x03, 0x00, 0x14, 0x00, 0x09, 0x00, 0x0e, 0x00, 0x1d, + 0x61, 0x63, 0x73, 0x70, 0x4d, 0x53, 0x46, 0x54, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x61, 0x77, 0x73, 0x63, 0x74, 0x72, 0x6c, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x68, 0x61, 0x6e, 0x64, + 0xb4, 0xaa, 0xdd, 0x1f, 0x13, 0xc8, 0x03, 0x3c, 0xf5, 0x51, 0x14, 0x45, + 0x28, 0x7a, 0x98, 0xe2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x5e, + 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x0c, 0x00, 0x00, 0x00, 0x0c, + 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x01, 0x18, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x2c, 0x00, 0x00, 0x00, 0x14, + 0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14, + 0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x54, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x60, + 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x60, + 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x60, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, + 0x75, 0x50, 0x33, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x74, 0x65, 0x78, 0x74, 0x00, 0x00, 0x00, 0x00, + 0x43, 0x43, 0x30, 0x00, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xf3, 0x51, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x16, 0xcc, + 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x83, 0xdf, + 0x00, 0x00, 0x3d, 0xbf, 0xff, 0xff, 0xff, 0xbb, 0x58, 0x59, 0x5a, 0x20, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4a, 0xbf, 0x00, 0x00, 0xb1, 0x37, + 0x00, 0x00, 0x0a, 0xb9, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x28, 0x38, 0x00, 0x00, 0x11, 0x0a, 0x00, 0x00, 0xc8, 0xb9, + 0x63, 0x75, 0x72, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2a, + 0x00, 0x00, 0x00, 0x7c, 0x00, 0xf8, 0x01, 0x9c, 0x02, 0x75, 0x03, 0x83, + 0x04, 
0xc9, 0x06, 0x4e, 0x08, 0x12, 0x0a, 0x18, 0x0c, 0x62, 0x0e, 0xf4, + 0x11, 0xcf, 0x14, 0xf6, 0x18, 0x6a, 0x1c, 0x2e, 0x20, 0x43, 0x24, 0xac, + 0x29, 0x6a, 0x2e, 0x7e, 0x33, 0xeb, 0x39, 0xb3, 0x3f, 0xd6, 0x46, 0x57, + 0x4d, 0x36, 0x54, 0x76, 0x5c, 0x17, 0x64, 0x1d, 0x6c, 0x86, 0x75, 0x56, + 0x7e, 0x8d, 0x88, 0x2c, 0x92, 0x36, 0x9c, 0xab, 0xa7, 0x8c, 0xb2, 0xdb, + 0xbe, 0x99, 0xca, 0xc7, 0xd7, 0x65, 0xe4, 0x77, 0xf1, 0xf9, 0xff, 0xff +}; + +static const uint8_t Rec2020_v2_micro_icc[] = { + 0x00, 0x00, 0x01, 0xcc, 0x6c, 0x63, 0x6d, 0x73, 0x02, 0x10, 0x00, 0x00, + 0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20, + 0x07, 0xe2, 0x00, 0x03, 0x00, 0x14, 0x00, 0x09, 0x00, 0x0e, 0x00, 0x1d, + 0x61, 0x63, 0x73, 0x70, 0x4d, 0x53, 0x46, 0x54, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x61, 0x77, 0x73, 0x63, 0x74, 0x72, 0x6c, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x68, 0x61, 0x6e, 0x64, + 0x17, 0xcb, 0x44, 0xd1, 0x0d, 0xca, 0xe1, 0xc9, 0x03, 0x3e, 0x20, 0x85, + 0x4a, 0x67, 0x4e, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x5f, + 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x0c, 0x00, 0x00, 0x00, 0x0c, + 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x01, 0x18, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x2c, 0x00, 0x00, 0x00, 0x14, + 0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x40, 0x00, 0x00, 0x00, 0x14, + 0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x54, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x64, + 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x64, + 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x68, 0x00, 0x00, 0x00, 0x64, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, + 0x75, 0x32, 0x30, 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x74, 0x65, 0x78, 0x74, 0x00, 0x00, 0x00, 0x00, + 0x43, 0x43, 0x30, 0x00, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xf3, 0x51, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x16, 0xcc, + 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xac, 0x69, + 0x00, 0x00, 0x47, 0x70, 0xff, 0xff, 0xff, 0x81, 0x58, 0x59, 0x5a, 0x20, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2a, 0x6a, 0x00, 0x00, 0xac, 0xe3, + 0x00, 0x00, 0x07, 0xad, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x20, 0x03, 0x00, 0x00, 0x0b, 0xad, 0x00, 0x00, 0xcb, 0xff, + 0x63, 0x75, 0x72, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, + 0x00, 0x00, 0x01, 0x53, 0x02, 0xa5, 0x03, 0xf8, 0x05, 0x4e, 0x06, 0xd6, + 0x08, 0x98, 0x0a, 0x8f, 0x0c, 0xc3, 0x0f, 0x31, 0x11, 0xdc, 0x14, 0xc3, + 0x17, 0xe8, 0x1b, 0x4c, 0x1e, 0xf0, 0x22, 0xd5, 0x26, 0xfa, 0x2b, 0x62, + 0x30, 0x0c, 0x34, 0xfa, 0x3a, 0x2b, 0x3f, 0xa2, 0x45, 0x5d, 0x4b, 0x5f, + 0x51, 0xa7, 0x58, 0x37, 0x5f, 0x0d, 0x66, 0x2c, 0x6d, 0x94, 0x75, 0x45, + 0x7d, 0x3f, 0x85, 0x84, 0x8e, 0x13, 0x96, 0xee, 0xa0, 0x13, 0xa9, 0x86, + 0xb3, 0x44, 0xbd, 0x4f, 0xc7, 0xa8, 0xd2, 0x4e, 0xdd, 0x42, 0xe8, 0x86, + 0xf4, 0x16, 0xff, 0xff +}; + +int main() +{ + pl_log log = pl_test_logger(); + pl_icc_object icc; + + icc = pl_icc_open(log, &TEST_PROFILE(sRGB_v2_nano_icc), NULL); + REQUIRE_CMP(icc->csp.primaries, ==, PL_COLOR_PRIM_BT_709, "u"); + 
pl_icc_close(&icc); + + icc = pl_icc_open(log, &TEST_PROFILE(DisplayP3_v2_micro_icc), NULL); + REQUIRE_CMP(icc->csp.primaries, ==, PL_COLOR_PRIM_DISPLAY_P3, "u"); + pl_icc_close(&icc); + + icc = pl_icc_open(log, &TEST_PROFILE(Rec2020_v2_micro_icc), NULL); + REQUIRE_CMP(icc->csp.primaries, ==, PL_COLOR_PRIM_BT_2020, "u"); + pl_icc_close(&icc); + + pl_log_destroy(&log); +} diff --git a/src/tests/include/include_tmpl.c b/src/tests/include/include_tmpl.c new file mode 100644 index 0000000..dd1000e --- /dev/null +++ b/src/tests/include/include_tmpl.c @@ -0,0 +1 @@ +#include <libplacebo/@header@> diff --git a/src/tests/include/include_tmpl.cpp b/src/tests/include/include_tmpl.cpp new file mode 100644 index 0000000..2b6334c --- /dev/null +++ b/src/tests/include/include_tmpl.cpp @@ -0,0 +1,3 @@ +#define PL_LIBAV_IMPLEMENTATION 0 +#define PL_DAV1D_IMPLEMENTATION 0 +#include <libplacebo/@header@> diff --git a/src/tests/include/meson.build b/src/tests/include/meson.build new file mode 100644 index 0000000..25dfaee --- /dev/null +++ b/src/tests/include/meson.build @@ -0,0 +1,35 @@ +include_tmpl_langs = ['c', 'cpp'] + +# Ensure all headers compile + +test_include_sources = [] +foreach h : headers + + if (h.contains('internal') or + h.contains('dav1d') and not dav1d.found() or + h.contains('libav') and not libav_found or + h.contains('d3d11') and not d3d11_header) + continue + endif + + foreach lang : include_tmpl_langs + + test_include_sources += configure_file( + input: 'include_tmpl.' + lang, + output: 'include_@0@.@1@'.format(h.underscorify(), lang), + configuration: { + 'header': h + }, + ) + + endforeach + +endforeach + +static_library('test_include', test_include_sources, + dependencies: [tdep_static, lavu, lavc, lavf], + include_directories: [inc, vulkan_headers_inc], + implicit_include_directories: false, + c_args: ['-Wall', '-Wextra', '-Wpedantic'], + cpp_args: ['-Wall', '-Wextra', '-Wpedantic'], +) diff --git a/src/tests/libav.c b/src/tests/libav.c new file mode 100644 index 0000000..7c91e85 --- /dev/null +++ b/src/tests/libav.c @@ -0,0 +1,393 @@ +#include "tests.h" +#include "libplacebo/utils/libav.h" + +int main() +{ + struct pl_plane_data data[4] = {0}; + struct pl_bit_encoding bits; + + // Make sure we don't crash on any av pixfmt + const AVPixFmtDescriptor *desc = NULL; + while ((desc = av_pix_fmt_desc_next(desc))) + pl_plane_data_from_pixfmt(data, &bits, av_pix_fmt_desc_get_id(desc)); + +#define TEST(pixfmt, reference) \ + do { \ + int planes = pl_plane_data_from_pixfmt(data, &bits, pixfmt); \ + REQUIRE_CMP(planes, ==, sizeof(reference) / sizeof(*reference), "d"); \ + REQUIRE_MEMEQ(data, reference, sizeof(reference)); \ + } while (0) + + // Planar and semiplanar formats + static const struct pl_plane_data yuvp8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {0}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {1}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {2}, + .pixel_stride = 1, + } + }; + + TEST(AV_PIX_FMT_YUV420P, yuvp8); + TEST(AV_PIX_FMT_YUV422P, yuvp8); + TEST(AV_PIX_FMT_YUV444P, yuvp8); + TEST(AV_PIX_FMT_YUV410P, yuvp8); + TEST(AV_PIX_FMT_YUV411P, yuvp8); + TEST(AV_PIX_FMT_YUV440P, yuvp8); + + static const struct pl_plane_data yuvap8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {0}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {1}, + .pixel_stride = 1, + }, { + 
.type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {2}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {3}, + .pixel_stride = 1, + } + }; + + TEST(AV_PIX_FMT_YUVA420P, yuvap8); + + static const struct pl_plane_data yuvp16[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16}, + .component_map = {0}, + .pixel_stride = 2, + }, { + .type = PL_FMT_UNORM, + .component_size = {16}, + .component_map = {1}, + .pixel_stride = 2, + }, { + .type = PL_FMT_UNORM, + .component_size = {16}, + .component_map = {2}, + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_YUV420P10LE, yuvp16); + TEST(AV_PIX_FMT_YUV420P16LE, yuvp16); + + static const struct pl_plane_data nv12[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {0}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8, 8}, + .component_map = {1, 2}, + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_NV12, nv12); + + static const struct pl_plane_data nv21[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {0}, + .pixel_stride = 1, + }, { + .type = PL_FMT_UNORM, + .component_size = {8, 8}, + .component_map = {2, 1}, + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_NV21, nv21); + + static const struct pl_plane_data p016[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16}, + .component_map = {0}, + .pixel_stride = 2, + }, { + .type = PL_FMT_UNORM, + .component_size = {16, 16}, + .component_map = {1, 2}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_P010LE, p016); + TEST(AV_PIX_FMT_P016LE, p016); + + // Packed formats + static const struct pl_plane_data r8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8}, + .component_map = {0}, + .pixel_stride = 1, + } + }; + + TEST(AV_PIX_FMT_GRAY8, r8); + + static const struct pl_plane_data rg8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8}, + .component_map = {0, 1}, + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_GRAY8A, rg8); + + static const struct pl_plane_data rgb8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8}, + .component_map = {0, 1, 2}, + .pixel_stride = 3, + } + }; + + TEST(AV_PIX_FMT_RGB24, rgb8); + + static const struct pl_plane_data bgr8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8}, + .component_map = {2, 1, 0}, + .pixel_stride = 3, + } + }; + + TEST(AV_PIX_FMT_BGR24, bgr8); + + static const struct pl_plane_data rgbx8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8}, + .component_map = {0, 1, 2}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_RGB0, rgbx8); + + static const struct pl_plane_data xrgb8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8}, + .component_map = {0, 1, 2}, + .component_pad = {8, 0, 0}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_0RGB, xrgb8); + + static const struct pl_plane_data rgba8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8, 8}, + .component_map = {0, 1, 2, 3}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_RGBA, rgba8); + + static const struct pl_plane_data argb8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8, 8}, + .component_map = {3, 0, 1, 2}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_ARGB, argb8); + + static const struct pl_plane_data bgra8[] = { + { + .type = PL_FMT_UNORM, + .component_size = {8, 8, 8, 8}, + .component_map = {2, 1, 0, 3}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_BGRA, bgra8); + + static const struct pl_plane_data abgr8[] = { + { + .type = 
PL_FMT_UNORM, + .component_size = {8, 8, 8, 8}, + .component_map = {3, 2, 1, 0}, + .pixel_stride = 4, + } + }; + + TEST(AV_PIX_FMT_ABGR, abgr8); + + static const struct pl_plane_data r16[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16}, + .component_map = {0}, + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_GRAY16LE, r16); + + static const struct pl_plane_data rgb16[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16, 16, 16}, + .component_map = {0, 1, 2}, + .pixel_stride = 6, + } + }; + + TEST(AV_PIX_FMT_RGB48LE, rgb16); + + static const struct pl_plane_data rgb16be[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16, 16, 16}, + .component_map = {0, 1, 2}, + .pixel_stride = 6, + .swapped = true, + } + }; + + TEST(AV_PIX_FMT_RGB48BE, rgb16be); + + static const struct pl_plane_data rgba16[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16, 16, 16, 16}, + .component_map = {0, 1, 2, 3}, + .pixel_stride = 8, + } + }; + + TEST(AV_PIX_FMT_RGBA64LE, rgba16); + + static const struct pl_plane_data rgba16be[] = { + { + .type = PL_FMT_UNORM, + .component_size = {16, 16, 16, 16}, + .component_map = {0, 1, 2, 3}, + .pixel_stride = 8, + .swapped = true, + } + }; + + TEST(AV_PIX_FMT_RGBA64BE, rgba16be); + + static const struct pl_plane_data rgb565[] = { + { + .type = PL_FMT_UNORM, + .component_size = {5, 6, 5}, + .component_map = {2, 1, 0}, // LSB to MSB + .pixel_stride = 2, + } + }; + + TEST(AV_PIX_FMT_RGB565LE, rgb565); + +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 37, 100) + + static const struct pl_plane_data rgb32f[] = { + { + .type = PL_FMT_FLOAT, + .component_size = {32, 32, 32}, + .component_map = {0, 1, 2}, + .pixel_stride = 12, + } + }; + + TEST(AV_PIX_FMT_RGBF32LE, rgb32f); + +#endif + + // Test pl_frame <- AVFrame bridge + struct pl_frame image; + AVFrame *frame = av_frame_alloc(); + frame->format = AV_PIX_FMT_RGBA; + pl_frame_from_avframe(&image, frame); + REQUIRE_CMP(image.num_planes, ==, 1, "d"); + REQUIRE_CMP(image.repr.sys, ==, PL_COLOR_SYSTEM_RGB, "u"); + + // Test inverse mapping + struct pl_color_space csp = image.color; + pl_color_space_infer(&csp); + pl_avframe_set_color(frame, csp); + pl_avframe_set_repr(frame, image.repr); + pl_avframe_set_profile(frame, image.profile); + pl_frame_from_avframe(&image, frame); + pl_color_space_infer(&image.color); + REQUIRE(pl_color_space_equal(&csp, &image.color)); + av_frame_free(&frame); + + // Test enum functions + for (enum pl_color_system sys = 0; sys < PL_COLOR_SYSTEM_COUNT; sys++) { + enum AVColorSpace spc = pl_system_to_av(sys); + enum pl_color_system sys2 = pl_system_from_av(spc); + // Exception to the rule, due to different handling in libav* + if (sys2 && sys != PL_COLOR_SYSTEM_BT_2100_HLG) + REQUIRE_CMP(sys, ==, sys2, "u"); + } + + for (enum pl_color_levels lev = 0; lev < PL_COLOR_LEVELS_COUNT; lev++) { + enum AVColorRange range = pl_levels_to_av(lev); + enum pl_color_levels lev2 = pl_levels_from_av(range); + REQUIRE_CMP(lev, ==, lev2, "u"); + } + + for (enum pl_color_primaries prim = 0; prim < PL_COLOR_PRIM_COUNT; prim++) { + enum AVColorPrimaries avpri = pl_primaries_to_av(prim); + enum pl_color_primaries prim2 = pl_primaries_from_av(avpri); + if (prim2) + REQUIRE_CMP(prim, ==, prim2, "u"); + } + + for (enum pl_color_transfer trc = 0; trc < PL_COLOR_TRC_COUNT; trc++) { + enum AVColorTransferCharacteristic avtrc = pl_transfer_to_av(trc); + enum pl_color_transfer trc2 = pl_transfer_from_av(avtrc); + if (trc2) + REQUIRE_CMP(trc, ==, trc2, "u"); + } + + for (enum pl_chroma_location loc = 0; loc < 
PL_CHROMA_COUNT; loc++) { + enum AVChromaLocation avloc = pl_chroma_to_av(loc); + enum pl_chroma_location loc2 = pl_chroma_from_av(avloc); + REQUIRE_CMP(loc, ==, loc2, "u"); + } +} diff --git a/src/tests/lut.c b/src/tests/lut.c new file mode 100644 index 0000000..4af44ee --- /dev/null +++ b/src/tests/lut.c @@ -0,0 +1,86 @@ +#include "tests.h" + +#include <libplacebo/dummy.h> +#include <libplacebo/shaders/lut.h> + +static const char *luts[] = { + + "TITLE \"1D LUT example\" \n" + "LUT_1D_SIZE 11 \n" + "# Random comment \n" + "0.0 0.0 0.0 \n" + "0.1 0.1 0.1 \n" + "0.2 0.2 0.2 \n" + "0.3 0.3 0.3 \n" + "0.4 0.4 0.4 \n" + "0.5 0.5 0.5 \n" + "0.6 0.6 0.6 \n" + "0.7 0.7 0.7 \n" + "0.8 0.8 0.8 \n" + "0.9 0.9 0.9 \n" + "0.10 0.10 0.10 \n", + + "LUT_3D_SIZE 3 \n" + "TITLE \"3D LUT example\" \n" + "0.0 0.0 0.0 \n" + "0.5 0.0 0.0 \n" + "1.0 0.0 0.0 \n" + "0.0 0.5 0.0 \n" + "0.5 0.5 0.0 \n" + "1.0 0.5 0.0 \n" + "0.0 1.0 0.0 \n" + "0.5 1.0 0.0 \n" + "1.0 1.0 0.0 \n" + "0.0 0.0 0.5 \n" + "0.5 0.0 0.5 \n" + "1.0 0.0 0.5 \n" + "0.0 0.5 0.5 \n" + "0.5 0.5 0.5 \n" + "1.0 0.5 0.5 \n" + "0.0 1.0 0.5 \n" + "0.5 1.0 0.5 \n" + "1.0 1.0 0.5 \n" + "0.0 0.0 1.0 \n" + "0.5 0.0 1.0 \n" + "1.0 0.0 1.0 \n" + "0.0 0.5 1.0 \n" + "0.5 0.5 1.0 \n" + "1.0 0.5 1.0 \n" + "0.0 1.0 1.0 \n" + "0.5 1.0 1.0 \n" + "1.0 1.0 1.0 \n", + + "LUT_1D_SIZE 3 \n" + "TITLE \"custom domain\" \n" + "DOMAIN_MAX 255 255 255 \n" + "0 0 0 \n" + "128 128 128 \n" + "255 255 255 \n" + +}; + +int main() +{ + pl_log log = pl_test_logger(); + pl_gpu gpu = pl_gpu_dummy_create(log, NULL); + pl_shader sh = pl_shader_alloc(log, NULL); + pl_shader_obj obj = NULL; + + for (int i = 0; i < PL_ARRAY_SIZE(luts); i++) { + struct pl_custom_lut *lut; + lut = pl_lut_parse_cube(log, luts[i], strlen(luts[i])); + REQUIRE(lut); + + pl_shader_reset(sh, pl_shader_params( .gpu = gpu )); + pl_shader_custom_lut(sh, lut, &obj); + const struct pl_shader_res *res = pl_shader_finalize(sh); + REQUIRE(res); + printf("Generated LUT shader:\n%s\n", res->glsl); + pl_lut_free(&lut); + } + + pl_shader_obj_destroy(&obj); + pl_shader_free(&sh); + pl_gpu_dummy_destroy(&gpu); + pl_log_destroy(&log); +} diff --git a/src/tests/meson.build b/src/tests/meson.build new file mode 100644 index 0000000..335c6b1 --- /dev/null +++ b/src/tests/meson.build @@ -0,0 +1,39 @@ +ts = [] + +foreach t : tests + deps = [tdep_static] + if t == 'opengl_surfaceless.c' + deps += glad_dep + endif + # TODO: Define objects in tdep_static once Meson 1.1.0 is ok to use + ts += { 'source': t, + 'deps': deps, + 'objects': lib.extract_all_objects(recursive: false) } +endforeach + +dav1d = dependency('dav1d', required: false) +if dav1d.found() + ts += { 'source': 'dav1d.c', 'deps': [dav1d, tdep_shared] } +endif + +lavu = dependency('libavutil', version: '>=55.74.100', required: false) +lavc = dependency('libavcodec', required: false) +lavf = dependency('libavformat', required: false) +libav_found = lavu.found() and lavc.found() and lavf.found() +if libav_found + ts += { 'source': 'libav.c', 'deps': [lavu, lavc, lavf, tdep_shared] } +endif + +foreach t : ts + e = executable('test.' 
+ t['source'], t['source'], + objects: t.get('objects', []), + c_args: [ '-Wno-unused-function' ], + dependencies: t.get('deps', []), + link_args: link_args, + link_depends: link_depends, + ) + + test(t['source'], e, timeout: 120) +endforeach + +subdir('include') diff --git a/src/tests/opengl_surfaceless.c b/src/tests/opengl_surfaceless.c new file mode 100644 index 0000000..2d12a08 --- /dev/null +++ b/src/tests/opengl_surfaceless.c @@ -0,0 +1,247 @@ +#include "gpu_tests.h" +#include "opengl/utils.h" + +#include <libplacebo/opengl.h> + +static void opengl_interop_tests(pl_gpu gpu) +{ + pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 1, 0, 0, + PL_FMT_CAP_RENDERABLE | PL_FMT_CAP_LINEAR); + if (!fmt) + return; + + pl_tex export = pl_tex_create(gpu, pl_tex_params( + .w = 32, + .h = 32, + .format = fmt, + .sampleable = true, + .renderable = true, + .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE, + )); + + REQUIRE(export); + + struct pl_opengl_wrap_params wrap = { + .width = export->params.w, + .height = export->params.h, + .depth = export->params.d, + }; + + wrap.texture = pl_opengl_unwrap(gpu, export, &wrap.target, &wrap.iformat, NULL); + REQUIRE(wrap.texture); + + pl_tex import = pl_opengl_wrap(gpu, &wrap); + REQUIRE(import); + REQUIRE(import->params.renderable); + REQUIRE_CMP(import->params.blit_dst, ==, export->params.blit_dst, "d"); + + pl_tex_destroy(gpu, &import); + pl_tex_destroy(gpu, &export); +} + +#define PBUFFER_WIDTH 640 +#define PBUFFER_HEIGHT 480 + +struct swapchain_priv { + EGLDisplay display; + EGLSurface surface; +}; + +static void swap_buffers(void *priv) +{ + struct swapchain_priv *p = priv; + eglSwapBuffers(p->display, p->surface); +} + +static void opengl_swapchain_tests(pl_opengl gl, + EGLDisplay display, EGLSurface surface) +{ + if (surface == EGL_NO_SURFACE) + return; + + printf("testing opengl swapchain\n"); + pl_gpu gpu = gl->gpu; + pl_swapchain sw; + sw = pl_opengl_create_swapchain(gl, pl_opengl_swapchain_params( + .swap_buffers = swap_buffers, + .priv = &(struct swapchain_priv) { display, surface }, + )); + REQUIRE(sw); + + int w = PBUFFER_WIDTH, h = PBUFFER_HEIGHT; + REQUIRE(pl_swapchain_resize(sw, &w, &h)); + + for (int i = 0; i < 10; i++) { + struct pl_swapchain_frame frame; + REQUIRE(pl_swapchain_start_frame(sw, &frame)); + if (frame.fbo->params.blit_dst) + pl_tex_clear(gpu, frame.fbo, (float[4]){0}); + + // TODO: test this with an actual pl_renderer instance + struct pl_frame target; + pl_frame_from_swapchain(&target, &frame); + + REQUIRE(pl_swapchain_submit_frame(sw)); + pl_swapchain_swap_buffers(sw); + } + + pl_swapchain_destroy(&sw); +} + +int main() +{ + if (!gladLoaderLoadEGL(EGL_NO_DISPLAY)) + return SKIP; + + const char *extstr = eglQueryString(EGL_NO_DISPLAY, EGL_EXTENSIONS); + if (!extstr || !strstr(extstr, "EGL_MESA_platform_surfaceless")) + return SKIP; + + // Create the OpenGL context + EGLDisplay dpy = eglGetPlatformDisplayEXT(EGL_PLATFORM_SURFACELESS_MESA, + (void *) EGL_DEFAULT_DISPLAY, NULL); + if (dpy == EGL_NO_DISPLAY) + return SKIP; + + EGLint major, minor; + if (!eglInitialize(dpy, &major, &minor)) + return SKIP; + + if (!gladLoaderLoadEGL(dpy)) + return SKIP; + + printf("Initialized EGL v%d.%d\n", major, minor); + int egl_ver = major * 10 + minor; + + struct { + EGLenum api; + EGLenum render; + int major, minor; + int glsl_ver; + EGLenum profile; + } egl_vers[] = { + { EGL_OPENGL_API, EGL_OPENGL_BIT, 4, 6, 460, EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT }, + { EGL_OPENGL_API, EGL_OPENGL_BIT, 3, 3, 330, EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT }, + { 
EGL_OPENGL_API, EGL_OPENGL_BIT, 3, 0, 130, EGL_CONTEXT_OPENGL_COMPATIBILITY_PROFILE_BIT, }, + { EGL_OPENGL_ES_API, EGL_OPENGL_ES3_BIT, 3, 0, 300, }, + }; + + struct pl_glsl_version last_glsl = {0}; + struct pl_gpu_limits last_limits = {0}; + + pl_log log = pl_test_logger(); + + for (int i = 0; i < PL_ARRAY_SIZE(egl_vers); i++) { + + const int cfg_attribs[] = { + EGL_SURFACE_TYPE, EGL_PBUFFER_BIT, + EGL_RENDERABLE_TYPE, egl_vers[i].render, + EGL_NONE + }; + + EGLConfig config = 0; + EGLint num_configs = 0; + bool ok = eglChooseConfig(dpy, cfg_attribs, &config, 1, &num_configs); + if (!ok || !num_configs) + goto error; + + if (!eglBindAPI(egl_vers[i].api)) + goto error; + + EGLContext egl; + if (egl_vers[i].api == EGL_OPENGL_ES_API) { + // OpenGL ES + const EGLint egl_attribs[] = { + EGL_CONTEXT_CLIENT_VERSION, egl_vers[i].major, + (egl_ver >= 15) ? EGL_CONTEXT_OPENGL_DEBUG : EGL_NONE, EGL_TRUE, + EGL_NONE + }; + + printf("Attempting creation of OpenGL ES v%d context\n", egl_vers[i].major); + egl = eglCreateContext(dpy, config, EGL_NO_CONTEXT, egl_attribs); + } else { + // Desktop OpenGL + const int egl_attribs[] = { + EGL_CONTEXT_MAJOR_VERSION, egl_vers[i].major, + EGL_CONTEXT_MINOR_VERSION, egl_vers[i].minor, + EGL_CONTEXT_OPENGL_PROFILE_MASK, egl_vers[i].profile, + (egl_ver >= 15) ? EGL_CONTEXT_OPENGL_DEBUG : EGL_NONE, EGL_TRUE, + EGL_NONE + }; + + printf("Attempting creation of Desktop OpenGL v%d.%d context\n", + egl_vers[i].major, egl_vers[i].minor); + egl = eglCreateContext(dpy, config, EGL_NO_CONTEXT, egl_attribs); + } + + if (!egl) + goto error; + + const EGLint pbuffer_attribs[] = { + EGL_WIDTH, PBUFFER_WIDTH, + EGL_HEIGHT, PBUFFER_HEIGHT, + EGL_NONE + }; + + EGLSurface surf = eglCreatePbufferSurface(dpy, config, pbuffer_attribs); + + if (!eglMakeCurrent(dpy, surf, surf, egl)) + goto error; + + pl_opengl gl = pl_opengl_create(log, pl_opengl_params( + .get_proc_addr = (pl_voidfunc_t (*)(const char *)) eglGetProcAddress, + .max_glsl_version = egl_vers[i].glsl_ver, + .debug = true, + .egl_display = dpy, + .egl_context = egl, +#ifdef CI_ALLOW_SW + .allow_software = true, +#endif + )); + if (!gl) + goto next; + + // Skip repeat tests + pl_gpu gpu = gl->gpu; + if (memcmp(&last_glsl, &gpu->glsl, sizeof(last_glsl)) == 0 && + memcmp(&last_limits, &gpu->limits, sizeof(last_limits)) == 0) + { + printf("Skipping tests due to duplicate capabilities/version\n"); + goto next; + } + +#ifdef CI_MAXGL + if (last_glsl.version && last_glsl.gles == gpu->glsl.gles) + goto next; +#endif + + last_glsl = gpu->glsl; + last_limits = gpu->limits; + + gpu_shader_tests(gpu); + gpu_interop_tests(gpu); + opengl_interop_tests(gpu); + opengl_swapchain_tests(gl, dpy, surf); + + // Reduce log spam after first successful test + pl_log_level_update(log, PL_LOG_INFO); + +next: + pl_opengl_destroy(&gl); + eglDestroySurface(dpy, surf); + eglDestroyContext(dpy, egl); + continue; + +error: ; + EGLint error = eglGetError(); + if (error != EGL_SUCCESS) + fprintf(stderr, "EGL error: %s\n", egl_err_str(error)); + } + + eglTerminate(dpy); + gladLoaderUnloadEGL(); + pl_log_destroy(&log); + + if (!last_glsl.version) + return SKIP; +} diff --git a/src/tests/options.c b/src/tests/options.c new file mode 100644 index 0000000..f178668 --- /dev/null +++ b/src/tests/options.c @@ -0,0 +1,123 @@ +#include "tests.h" + +#include <libplacebo/options.h> + +static void count_cb(void *priv, pl_opt_data data) +{ + int *num = priv; + printf("Iterating over option: %s = %s\n", data->opt->key, data->text); + (*num)++; +} + +static void set_cb(void 
*priv, pl_opt_data data) + { + pl_options dst = priv; + REQUIRE(pl_options_set_str(dst, data->opt->key, data->text)); + } + + int main() + { + pl_log log = pl_test_logger(); + pl_options test = pl_options_alloc(log); + + REQUIRE_STREQ(pl_options_save(test), ""); + REQUIRE(pl_options_load(test, "")); + REQUIRE_STREQ(pl_options_save(test), ""); + + pl_options_reset(test, &pl_render_fast_params); + REQUIRE_STREQ(pl_options_save(test), ""); + REQUIRE(pl_options_load(test, "preset=fast")); + REQUIRE_STREQ(pl_options_save(test), ""); + + const char *def_opts = "upscaler=lanczos,downscaler=hermite,frame_mixer=oversample,sigmoid=yes,peak_detect=yes,dither=yes"; + pl_options_reset(test, &pl_render_default_params); + REQUIRE_STREQ(pl_options_save(test), def_opts); + struct pl_options_t def_pre = *test; + pl_options_reset(test, NULL); + REQUIRE_STREQ(pl_options_save(test), ""); + REQUIRE(pl_options_load(test, def_opts)); + REQUIRE_STREQ(pl_options_save(test), def_opts); + REQUIRE_MEMEQ(test, &def_pre, sizeof(*test)); + pl_options_reset(test, NULL); + REQUIRE(pl_options_load(test, "preset=default")); + REQUIRE_STREQ(pl_options_save(test), def_opts); + REQUIRE_MEMEQ(test, &def_pre, sizeof(*test)); + + int num = 0; + pl_options_iterate(test, count_cb, &num); + REQUIRE_CMP(num, ==, 6, "d"); + + pl_opt_data data; + REQUIRE((data = pl_options_get(test, "tile_size"))); + REQUIRE_STREQ(data->opt->key, "tile_size"); + REQUIRE_CMP(*(int *) data->value, ==, pl_render_default_params.tile_size, "d"); + REQUIRE_STREQ(data->text, "32"); + + const char *hq_opts = "upscaler=ewa_lanczossharp,downscaler=hermite,frame_mixer=oversample,deband=yes,sigmoid=yes,peak_detect=yes,peak_percentile=99.99500274658203,contrast_recovery=0.30000001192092896,dither=yes"; + // fallback can produce different precision + const char *hq_opts2 = "upscaler=ewa_lanczossharp,downscaler=hermite,frame_mixer=oversample,deband=yes,sigmoid=yes,peak_detect=yes,peak_percentile=99.99500274658203125,contrast_recovery=0.30000001192092896,dither=yes"; + + pl_options_reset(test, &pl_render_high_quality_params); + const char *opts = pl_options_save(test); + if (!strcmp(opts, hq_opts2)) + hq_opts = hq_opts2; + REQUIRE_STREQ(opts, hq_opts); + struct pl_options_t hq_pre = *test; + pl_options_reset(test, NULL); + REQUIRE_STREQ(pl_options_save(test), ""); + REQUIRE(pl_options_load(test, hq_opts)); + REQUIRE_STREQ(pl_options_save(test), hq_opts); + REQUIRE_MEMEQ(test, &hq_pre, sizeof(*test)); + REQUIRE(pl_options_load(test, "preset=high_quality")); + REQUIRE_STREQ(pl_options_save(test), hq_opts); + REQUIRE_MEMEQ(test, &hq_pre, sizeof(*test)); + + pl_options test2 = pl_options_alloc(log); + pl_options_iterate(test, set_cb, test2); + REQUIRE_STREQ(pl_options_save(test), pl_options_save(test2)); + pl_options_free(&test2); + + // Test custom scalers + pl_options_reset(test, pl_render_params( + .upscaler = &(struct pl_filter_config) { + .kernel = &pl_filter_function_jinc, + .window = &pl_filter_function_jinc, + .radius = 4.0, + .polar = true, + }, + )); + const char *jinc4_opts = "upscaler=custom,upscaler_kernel=jinc,upscaler_window=jinc,upscaler_radius=4,upscaler_polar=yes"; + REQUIRE_STREQ(pl_options_save(test), jinc4_opts); + struct pl_options_t jinc4_pre = *test; + pl_options_reset(test, NULL); + REQUIRE(pl_options_load(test, "upscaler=custom,upscaler_preset=ewa_lanczos,upscaler_radius=4.0,upscaler_clamp=0.0")); + REQUIRE_STREQ(pl_options_save(test), jinc4_opts); + REQUIRE_MEMEQ(test, &jinc4_pre, sizeof(*test)); + + // Test params presets + pl_options_reset(test, 
NULL); + REQUIRE(pl_options_load(test, "cone=yes,cone_preset=deuteranomaly")); + REQUIRE_STREQ(pl_options_save(test), "cone=yes,cones=m,cone_strength=0.5"); + + // Test error paths + pl_options bad = pl_options_alloc(NULL); + REQUIRE(!pl_options_load(bad, "scale_preset=help")); + REQUIRE(!pl_options_load(bad, "dither_method=invalid")); + REQUIRE(!pl_options_load(bad, "lut_entries=-1")); + REQUIRE(!pl_options_load(bad, "deband_iterations=100")); + REQUIRE(!pl_options_load(bad, "tone_lut_size=abc")); + REQUIRE(!pl_options_load(bad, "show_clipping=hello")); + REQUIRE(!pl_options_load(bad, "brightness=2.0")); + REQUIRE(!pl_options_load(bad, "gamma=oops")); + REQUIRE(!pl_options_load(bad, "invalid")); + REQUIRE(!pl_options_load(bad, "=")); + REQUIRE(!pl_options_load(bad, "preset==bar")); + REQUIRE(!pl_options_load(bad, "peak_percentile=E8203125")); + REQUIRE(!pl_options_get(bad, "invalid")); + REQUIRE_STREQ(pl_options_save(bad), ""); + pl_options_free(&bad); + + pl_options_free(&test); + pl_log_destroy(&log); + return 0; +} diff --git a/src/tests/string.c b/src/tests/string.c new file mode 100644 index 0000000..52985c4 --- /dev/null +++ b/src/tests/string.c @@ -0,0 +1,147 @@ +#include "tests.h" + +static const pl_str null = {0}; +static const pl_str test = PL_STR0("test"); +static const pl_str empty = PL_STR0(""); + +static inline bool is_null(pl_str str) +{ + return !str.len && !str.buf; +} + +static inline bool is_empty(pl_str str) +{ + return !str.len; +} + +int main() +{ + void *tmp = pl_tmp(NULL); + + REQUIRE(is_null(pl_str0(NULL))); + REQUIRE(is_null(pl_strdup(tmp, null))); + char *empty0 = pl_strdup0(tmp, null); + REQUIRE(empty0 && !empty0[0]); + REQUIRE(pl_str_equals0(empty, empty0)); + + pl_str buf = {0}; + pl_str_append(tmp, &buf, null); + REQUIRE(is_empty(buf)); + pl_str_append_asprintf(tmp, &buf, "%.*s", PL_STR_FMT(test)); + REQUIRE(pl_str_equals(buf, test)); + + pl_str_append_asprintf_c(tmp, &buf, "%d %f %f %f %lld %zu %.*sx %hx %hx %hx %hx", + 1, 1.0f, 4294967295.56, 83224965647295.65, 0xFFll, (size_t) 0, PL_STR_FMT(empty), + (unsigned short) 0xCAFEu, (unsigned short) 0x1, (unsigned short) 0, + (unsigned short) 0xFFFFu); + const char *expected = "test1 1 4294967295.56 83224965647295.66 255 0 x cafe 1 0 ffff"; + // fallback can produce different precision + const char *expected2 = "test1 1 4294967295.55999994277954102 83224965647295.65625 255 0 x cafe 1 0 ffff"; + REQUIRE(pl_str_equals0(buf, expected) || pl_str_equals0(buf, expected2)); + + REQUIRE_CMP(pl_strchr(null, ' '), <, 0, "d"); + REQUIRE_CMP((int) pl_strspn(null, " "), ==, 0, "d"); + REQUIRE_CMP((int) pl_strcspn(null, " "), ==, 0, "d"); + REQUIRE(is_null(pl_str_strip(null))); + + REQUIRE_CMP(pl_strchr(test, 's'), ==, 2, "d"); + REQUIRE_CMP((int) pl_strspn(test, "et"), ==, 2, "d"); + REQUIRE_CMP((int) pl_strcspn(test, "xs"), ==, 2, "d"); + + REQUIRE(is_null(pl_str_take(null, 10))); + REQUIRE(is_empty(pl_str_take(test, 0))); + REQUIRE(is_null(pl_str_drop(null, 10))); + REQUIRE(is_null(pl_str_drop(test, test.len))); + REQUIRE(pl_str_equals(pl_str_drop(test, 0), test)); + + REQUIRE_CMP(pl_str_find(null, test), <, 0, "d"); + REQUIRE_CMP(pl_str_find(null, null), ==, 0, "d"); + REQUIRE_CMP(pl_str_find(test, null), ==, 0, "d"); + REQUIRE_CMP(pl_str_find(test, test), ==, 0, "d"); + + pl_str rest; + REQUIRE(is_null(pl_str_split_char(null, ' ', &rest)) && is_null(rest)); + REQUIRE(is_null(pl_str_split_str(null, test, &rest)) && is_null(rest)); + REQUIRE(is_empty(pl_str_split_str(test, test, &rest)) && is_empty(rest)); + 
REQUIRE(is_null(pl_str_getline(null, &rest)) && is_null(rest)); + + pl_str right, left = pl_str_split_char(pl_str0("left right"), ' ', &right); + REQUIRE(pl_str_equals0(left, "left")); + REQUIRE(pl_str_equals0(right, "right")); + + left = pl_str_split_str0(pl_str0("leftTESTright"), "TEST", &right); + REQUIRE(pl_str_equals0(left, "left")); + REQUIRE(pl_str_equals0(right, "right")); + + pl_str out; + REQUIRE(pl_str_decode_hex(tmp, null, &out) && is_empty(out)); + REQUIRE(!pl_str_decode_hex(tmp, pl_str0("invalid"), &out)); + + REQUIRE(pl_str_equals(null, null)); + REQUIRE(pl_str_equals(null, empty)); + REQUIRE(pl_str_startswith(null, null)); + REQUIRE(pl_str_startswith(test, null)); + REQUIRE(pl_str_startswith(test, test)); + REQUIRE(pl_str_endswith(null, null)); + REQUIRE(pl_str_endswith(test, null)); + REQUIRE(pl_str_endswith(test, test)); + + double d; + float f; + int i; + unsigned u; + int64_t i64; + uint64_t u64; + + REQUIRE(pl_str_parse_double(pl_str0("4294967295.56"), &d)); REQUIRE_FEQ(d, 4294967295.56, 1e-20); + REQUIRE(pl_str_parse_double(pl_str0("-4294967295.56"), &d)); REQUIRE_FEQ(d, -4294967295.56, 1e-20); + REQUIRE(pl_str_parse_double(pl_str0("83224965647295.65"), &d)); REQUIRE_FEQ(d, 83224965647295.65, 1e-20); + REQUIRE(pl_str_parse_double(pl_str0("-83224965647295.65"), &d)); REQUIRE_FEQ(d, -83224965647295.65, 1e-20); + REQUIRE(pl_str_parse_float(pl_str0("4294967295.56"), &f)); REQUIRE_FEQ(f, 4294967295.56f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("-4294967295.56"), &f)); REQUIRE_FEQ(f, -4294967295.56f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("83224965647295.65"), &f)); REQUIRE_FEQ(f, 83224965647295.65f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("-83224965647295.65"), &f)); REQUIRE_FEQ(f, -83224965647295.65f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("1.3984"), &f)); REQUIRE_FEQ(f, 1.3984f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("-8.9100083"), &f)); REQUIRE_FEQ(f, -8.9100083f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("-0"), &f)); REQUIRE_FEQ(f, 0.0f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("-3.14e20"), &f)); REQUIRE_FEQ(f, -3.14e20f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("0.5e-5"), &f)); REQUIRE_FEQ(f, 0.5e-5f, 1e-8); + REQUIRE(pl_str_parse_float(pl_str0("0.5e+5"), &f)); REQUIRE_FEQ(f, 0.5e+5f, 1e-8); + REQUIRE(pl_str_parse_int(pl_str0("64239"), &i)); REQUIRE_CMP(i, ==, 64239, "d"); + REQUIRE(pl_str_parse_int(pl_str0("-102"), &i)); REQUIRE_CMP(i, ==, -102, "d"); + REQUIRE(pl_str_parse_int(pl_str0("1"), &i)); REQUIRE_CMP(i, ==, 1, "d"); + REQUIRE(pl_str_parse_int(pl_str0("-0"), &i)); REQUIRE_CMP(i, ==, 0, "d"); + REQUIRE(pl_str_parse_uint(pl_str0("64239"), &u)); REQUIRE_CMP(u, ==, 64239, "u"); + REQUIRE(pl_str_parse_uint(pl_str0("1"), &u)); REQUIRE_CMP(u, ==, 1, "u"); + REQUIRE(pl_str_parse_int64(pl_str0("9223372036854775799"), &i64)); + REQUIRE_CMP(i64, ==, 9223372036854775799LL, PRIi64); + REQUIRE(pl_str_parse_int64(pl_str0("-9223372036854775799"), &i64)); + REQUIRE_CMP(i64, ==, -9223372036854775799LL, PRIi64); + REQUIRE(pl_str_parse_uint64(pl_str0("18446744073709551609"), &u64)); + REQUIRE_CMP(u64, ==, 18446744073709551609LLU, PRIu64); + REQUIRE(!pl_str_parse_float(null, &f)); + REQUIRE(!pl_str_parse_float(test, &f)); + REQUIRE(!pl_str_parse_float(empty, &f)); + REQUIRE(!pl_str_parse_int(null, &i)); + REQUIRE(!pl_str_parse_int(test, &i)); + REQUIRE(!pl_str_parse_int(empty, &i)); + REQUIRE(!pl_str_parse_uint(null, &u)); + REQUIRE(!pl_str_parse_uint(test, &u)); + REQUIRE(!pl_str_parse_uint(empty, &u)); + + pl_str_builder builder = 
pl_str_builder_alloc(tmp); + pl_str_builder_const_str(builder, "hello"); + pl_str_builder_str(builder, pl_str0("world")); + pl_str res = pl_str_builder_exec(builder); + REQUIRE(pl_str_equals0(res, "helloworld")); + + pl_str_builder_reset(builder); + pl_str_builder_printf_c(builder, "foo %d bar %u bat %s baz %lld", + 123, 56u, "quack", 0xDEADBEEFll); + pl_str_builder_printf_c(builder, " %.*s", PL_STR_FMT(pl_str0("test123"))); + res = pl_str_builder_exec(builder); + REQUIRE(pl_str_equals0(res, "foo 123 bar 56 bat quack baz 3735928559 test123")); + + pl_free(tmp); + return 0; +} diff --git a/src/tests/tests.h b/src/tests/tests.h new file mode 100644 index 0000000..a33a0de --- /dev/null +++ b/src/tests/tests.h @@ -0,0 +1,319 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +#include <libplacebo/log.h> +#include <libplacebo/colorspace.h> +#include <libplacebo/shaders/film_grain.h> + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <time.h> + +#ifdef PL_HAVE_WIN32 +#include <io.h> +#define isatty _isatty +#define fileno _fileno +#else +#include <unistd.h> +#endif + +static void pl_log_timestamp(void *stream, enum pl_log_level level, const char *msg) +{ + static char letter[] = { + [PL_LOG_FATAL] = 'f', + [PL_LOG_ERR] = 'e', + [PL_LOG_WARN] = 'w', + [PL_LOG_INFO] = 'i', + [PL_LOG_DEBUG] = 'd', + [PL_LOG_TRACE] = 't', + }; + + // Log time relative to the first message + static pl_clock_t base = 0; + if (!base) + base = pl_clock_now(); + + double secs = pl_clock_diff(pl_clock_now(), base); + printf("[%2.3f][%c] %s\n", secs, letter[level], msg); + + if (level <= PL_LOG_WARN) { + // duplicate warnings/errors to stderr + fprintf(stderr, "[%2.3f][%c] %s\n", secs, letter[level], msg); + fflush(stderr); + } +} + +static inline pl_log pl_test_logger(void) +{ + setbuf(stdout, NULL); + setbuf(stderr, NULL); + + return pl_log_create(PL_API_VER, pl_log_params( + .log_cb = isatty(fileno(stdout)) ? 
pl_log_color : pl_log_timestamp, + .log_level = PL_LOG_DEBUG, + )); +} + +#define RANDOM (rand() / (float) RAND_MAX) +#define RANDOM_U8 ((uint8_t) (256.0 * rand() / (RAND_MAX + 1.0))) +#define SKIP 77 + +// Helpers for performing various checks +#define REQUIRE(cond) do \ +{ \ + if (!(cond)) { \ + fprintf(stderr, "=== FAILED: '"#cond"' at "__FILE__":%d\n\n", __LINE__);\ + exit(1); \ + } \ +} while (0) + +#define REQUIRE_CMP(a, op, b, fmt) do \ +{ \ + __typeof__(a) _va = (a), _vb = (b); \ + \ + if (!(_va op _vb)) { \ + fprintf(stderr, "=== FAILED: '"#a" "#op" "#b"' at "__FILE__":%d\n" \ + " %-31s = %"fmt"\n" \ + " %-31s = %"fmt"\n\n", \ + __LINE__, #a, _va, #b, _vb); \ + exit(1); \ + } \ +} while (0) + +#define REQUIRE_FEQ(a, b, epsilon) do \ +{ \ + float _va = (a); \ + float _vb = (b); \ + float _delta = (epsilon) * fmax(1.0, fabs(_va)); \ + \ + if (fabs(_va - _vb) > _delta) { \ + fprintf(stderr, "=== FAILED: '"#a" ≈ "#b"' at "__FILE__":%d\n" \ + " %-31s = %f\n" \ + " %-31s = %f\n" \ + " %-31s = %f\n\n", \ + __LINE__, #a, _va, #b, _vb, \ + "epsilon "#epsilon" -> max delta", _delta); \ + exit(1); \ + } \ +} while (0) + +#define REQUIRE_STREQ(a, b) do \ +{ \ + const char *_a = (a); \ + const char *_b = (b); \ + if (strcmp(_a, _b) != 0) { \ + fprintf(stderr, "=== FAILED: !strcmp("#a", "#b") at "__FILE__":%d\n" \ + " %-31s = %s\n" \ + " %-31s = %s\n\n", \ + __LINE__, #a, _a, #b, _b); \ + exit(1); \ + } \ +} while (0) + +static inline void log_array(const uint8_t *a, const uint8_t *ref, size_t off, size_t size) +{ + for (size_t n = 0; n < size; n++) { + const char *prefix = "", *suffix = ""; + char terminator = ' '; + if (a[n + off] != ref[n + off]) { + prefix = "\033[31;1m"; + suffix = "\033[0m"; + } + if (n+1 == size || n % 16 == 15) + terminator = '\n'; + fprintf(stderr, "%s%02"PRIx8"%s%c", prefix, a[n + off], suffix, terminator); + } +} + +static inline void require_memeq(const void *aptr, const void *bptr, size_t size, + const char *astr, const char *bstr, + const char *sizestr, const char *file, int line) +{ + const uint8_t *a = aptr, *b = bptr; + for (size_t i = 0; i < size; i++) { + if (a[i] == b[i]) + continue; + + fprintf(stderr, "=== FAILED: memcmp(%s, %s, %s) == 0 at %s:%d\n" + "at position %zu: 0x%02"PRIx8" != 0x%02"PRIx8"\n\n", + astr, bstr, sizestr, file, line, i, a[i], b[i]); + + size_t start = i >= 256 ? 
i - 256 : 0; + size_t end = PL_MIN(size, i + 256); + fprintf(stderr, "%zu bytes of '%s' at offset %zu:\n", end - start, astr, start); + log_array(a, b, start, end - start); + fprintf(stderr, "\n%zu bytes of '%s' at offset %zu:\n", end - start, bstr, start); + log_array(b, a, start, end - start); + exit(1); + } +} + +#define REQUIRE_MEMEQ(a, b, size) require_memeq(a, b, size, #a, #b, #size, __FILE__, __LINE__) + +#define REQUIRE_HANDLE(shmem, type) \ + switch (type) { \ + case PL_HANDLE_FD: \ + case PL_HANDLE_DMA_BUF: \ + REQUIRE(shmem.handle.fd > -1); \ + break; \ + case PL_HANDLE_WIN32: \ + case PL_HANDLE_WIN32_KMT: \ + /* INVALID_HANDLE_VALUE = (-1) */ \ + REQUIRE(shmem.handle.handle != (void *)(intptr_t) (-1)); \ + /* fallthrough */ \ + case PL_HANDLE_MTL_TEX: \ + case PL_HANDLE_IOSURFACE: \ + REQUIRE(shmem.handle.handle); \ + break; \ + case PL_HANDLE_HOST_PTR: \ + REQUIRE(shmem.handle.ptr); \ + break; \ + } + +static const struct pl_av1_grain_data av1_grain_data = { + .num_points_y = 6, + .points_y = {{0, 4}, {27, 33}, {54, 55}, {67, 61}, {108, 71}, {255, 72}}, + .chroma_scaling_from_luma = false, + .num_points_uv = {2, 2}, + .points_uv = {{{0, 64}, {255, 64}}, {{0, 64}, {255, 64}}}, + .scaling_shift = 11, + .ar_coeff_lag = 3, + .ar_coeffs_y = {4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66}, + .ar_coeffs_uv = { + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127}, + }, + .ar_coeff_shift = 7, + .grain_scale_shift = 0, + .uv_mult = {0, 0}, + .uv_mult_luma = {64, 64}, + .uv_offset = {0, 0}, +}; + +static const uint8_t h274_lower_bound = 10; +static const uint8_t h274_upper_bound = 250; +static const int16_t h274_values[6] = {16, 12, 14}; + +static const struct pl_h274_grain_data h274_grain_data = { + .model_id = 0, + .blending_mode_id = 0, + .log2_scale_factor = 2, + .component_model_present = {true}, + .num_intensity_intervals = {1}, + .num_model_values = {3}, + .intensity_interval_lower_bound = {&h274_lower_bound}, + .intensity_interval_upper_bound = {&h274_upper_bound}, + .comp_model_value = {&h274_values}, +}; + +static const struct pl_dovi_metadata dovi_meta = { + .nonlinear = {{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}}, + .linear = {{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}}, + .comp = { + { + .num_pivots = 9, + .pivots = {0.0615835786, 0.129032254, 0.353861183, + 0.604105592, 0.854349971, 0.890518069, + 0.906158328, 0.913978517, 0.92082113}, + .method = {0, 0, 0, 0, 0, 0, 0, 0}, + .poly_coeffs = { + {-0.0488376617, 1.99335372, -2.41716385}, + {-0.0141925812, 1.61829138, -1.53397191}, + { 0.157061458, 0.63640213, -0.11302495}, + {0.25272119, 0.246226311, 0.27281332}, + {0.951621532, -1.35507894, 1.18898678}, + {6.41251612, -13.6188488, 8.07336903}, + {13.467535, -29.1869125, 16.6612244}, + {28.2321472, -61.8516273, 34.7264938} + }, + }, { + .num_pivots = 2, + .pivots = {0.0, 1.0}, + .method = {1}, + .mmr_order = {3}, + .mmr_constant = {-0.500733018}, + .mmr_coeffs = {{ + {1.08411026, 3.80807829, 0.0881733894, -3.23097038, -0.409078479, -1.31310081, 2.71297002}, + {-0.241833091, -3.57880807, -0.108109117, 3.13198471, 0.869203091, 1.96561158, -9.30871677}, + {-0.177356839, 1.48970401, 0.0908923149, -0.510447979, -0.687603354, -0.934977889, 12.3544884}, + }}, + }, { + .num_pivots = 2, + .pivots = {0.0, 1.0}, + .method = {1}, + .mmr_order = {3}, + .mmr_constant = {-1.23833287}, + .mmr_coeffs = {{ + {3.52909589, 0.383154511, 5.50820637, -1.02094889, 
-6.36386824, 0.194121242, 0.64683497}, + {-2.57899785, -0.626081586, -6.05729723, 2.29143763, 9.14653015, -0.0507702827, -4.17724133}, + {0.705404401, 0.341412306, 2.98387456, -1.71712542, -4.91501331, 0.1465137, 6.38665438}, + }}, + }, + }, +}; + +static const uint8_t sRGB_v2_nano_icc[] = { + 0x00, 0x00, 0x01, 0x9a, 0x6c, 0x63, 0x6d, 0x73, 0x02, 0x10, 0x00, 0x00, + 0x6d, 0x6e, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5a, 0x20, + 0x07, 0xe2, 0x00, 0x03, 0x00, 0x14, 0x00, 0x09, 0x00, 0x0e, 0x00, 0x1d, + 0x61, 0x63, 0x73, 0x70, 0x4d, 0x53, 0x46, 0x54, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x61, 0x77, 0x73, 0x63, 0x74, 0x72, 0x6c, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xd6, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xd3, 0x2d, 0x68, 0x61, 0x6e, 0x64, + 0xeb, 0x77, 0x1f, 0x3c, 0xaa, 0x53, 0x51, 0x02, 0xe9, 0x3e, 0x28, 0x6c, + 0x91, 0x46, 0xae, 0x57, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0x5f, + 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x01, 0x0c, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x20, 0x00, 0x00, 0x00, 0x14, + 0x67, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x34, 0x00, 0x00, 0x00, 0x14, + 0x62, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x01, 0x48, 0x00, 0x00, 0x00, 0x14, + 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x5c, 0x00, 0x00, 0x00, 0x34, + 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x5c, 0x00, 0x00, 0x00, 0x34, + 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x01, 0x5c, 0x00, 0x00, 0x00, 0x34, + 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x90, 0x00, 0x00, 0x00, 0x0a, + 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, + 0x6e, 0x52, 0x47, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xf3, 0x54, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x16, 0xc9, + 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6f, 0xa0, + 0x00, 0x00, 0x38, 0xf2, 0x00, 0x00, 0x03, 0x8f, 0x58, 0x59, 0x5a, 0x20, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x96, 0x00, 0x00, 0xb7, 0x89, + 0x00, 0x00, 0x18, 0xda, 0x58, 0x59, 0x5a, 0x20, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x24, 0xa0, 0x00, 0x00, 0x0f, 0x85, 0x00, 0x00, 0xb6, 0xc4, + 0x63, 0x75, 0x72, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x01, 0x07, 0x02, 0xb5, 0x05, 0x6b, 0x09, 0x36, 0x0e, 0x50, + 0x14, 0xb1, 0x1c, 0x80, 0x25, 0xc8, 0x30, 0xa1, 0x3d, 0x19, 0x4b, 0x40, + 0x5b, 0x27, 0x6c, 0xdb, 0x80, 0x6b, 0x95, 0xe3, 0xad, 0x50, 0xc6, 0xc2, + 0xe2, 0x31, 0xff, 0xff, 0x74, 0x65, 0x78, 0x74, 0x00, 0x00, 0x00, 0x00, + 0x30, 0x00 +}; + +#define TEST_PROFILE(arr) ((struct pl_icc_profile) { \ + .data = (arr), \ + .len = PL_ARRAY_SIZE(arr), \ + .signature = (uintptr_t) (arr), \ +}) diff --git a/src/tests/tone_mapping.c b/src/tests/tone_mapping.c new file mode 100644 index 0000000..0a48945 --- /dev/null +++ b/src/tests/tone_mapping.c @@ -0,0 +1,181 @@ +#include "tests.h" +#include "log.h" + +#include <libplacebo/gamut_mapping.h> +#include <libplacebo/tone_mapping.h> + +//#define PRINT_LUTS + +int main() +{ + pl_log log = pl_test_logger(); + + // PQ unit tests + REQUIRE_FEQ(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, 0.0), 0.0, 1e-2); + REQUIRE_FEQ(pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NITS, 1.0), 10000.0, 1e-2); + REQUIRE_FEQ(pl_hdr_rescale(PL_HDR_PQ, 
PL_HDR_NITS, 0.58), 203.0, 1e-2); + + // Test round-trip + for (float x = 0.0f; x < 1.0f; x += 0.01f) { + REQUIRE_FEQ(x, pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, + pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, x)), + 1e-5); + } + + static float lut[128]; + struct pl_tone_map_params params = { + .constants = { PL_TONE_MAP_CONSTANTS }, + .input_scaling = PL_HDR_PQ, + .output_scaling = PL_HDR_PQ, + .lut_size = PL_ARRAY_SIZE(lut), + }; + + // Test regular tone-mapping + params.input_min = pl_hdr_rescale(PL_HDR_NITS, params.input_scaling, 0.005); + params.input_max = pl_hdr_rescale(PL_HDR_NITS, params.input_scaling, 1000.0); + params.output_min = pl_hdr_rescale(PL_HDR_NORM, params.output_scaling, 0.001); + params.output_max = pl_hdr_rescale(PL_HDR_NORM, params.output_scaling, 1.0); + + struct pl_tone_map_params params_inv = params; + PL_SWAP(params_inv.input_min, params_inv.output_min); + PL_SWAP(params_inv.input_max, params_inv.output_max); + + int tested_pure_bpc = 0; + + // Generate example tone mapping curves, forward and inverse + for (int i = 0; i < pl_num_tone_map_functions; i++) { + const struct pl_tone_map_function *fun = pl_tone_map_functions[i]; + printf("Testing tone-mapping function %s\n", fun->name); + params.function = params_inv.function = fun; + pl_clock_t start = pl_clock_now(); + pl_tone_map_generate(lut, ¶ms); + pl_log_cpu_time(log, start, pl_clock_now(), "generating LUT"); + for (int j = 0; j < PL_ARRAY_SIZE(lut); j++) { + REQUIRE(isfinite(lut[j]) && !isnan(lut[j])); + if (j > 0) + REQUIRE_CMP(lut[j], >=, lut[j - 1], "f"); +#ifdef PRINT_LUTS + printf("%f, %f\n", j / (PL_ARRAY_SIZE(lut) - 1.0f), lut[j]); +#endif + } + + if (fun->map_inverse || !tested_pure_bpc++) { + start = pl_clock_now(); + pl_tone_map_generate(lut, ¶ms_inv); + pl_log_cpu_time(log, start, pl_clock_now(), "generating inverse LUT"); + for (int j = 0; j < PL_ARRAY_SIZE(lut); j++) { + REQUIRE(isfinite(lut[j]) && !isnan(lut[j])); + if (j > 0) + REQUIRE_CMP(lut[j], >=, lut[j - 1], "f"); +#ifdef PRINT_LUTS + printf("%f, %f\n", j / (PL_ARRAY_SIZE(lut) - 1.0f), lut[j]); +#endif + } + } + } + + // Test that `spline` is a no-op for 1:1 tone mapping + params.output_min = params.input_min; + params.output_max = params.input_max; + params.function = &pl_tone_map_spline; + pl_tone_map_generate(lut, ¶ms); + for (int j = 0; j < PL_ARRAY_SIZE(lut); j++) { + float x = j / (PL_ARRAY_SIZE(lut) - 1.0f); + x = PL_MIX(params.input_min, params.input_max, x); + REQUIRE_FEQ(x, lut[j], 1e-5); + } + + // Test some gamut mapping methods + for (int i = 0; i < pl_num_gamut_map_functions; i++) { + static const float min_rgb = 0.1f, max_rgb = PL_COLOR_SDR_WHITE; + struct pl_gamut_map_params gamut = { + .function = pl_gamut_map_functions[i], + .input_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020), + .output_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709), + .min_luma = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, min_rgb), + .max_luma = pl_hdr_rescale(PL_HDR_NITS, PL_HDR_PQ, max_rgb), + }; + + printf("Testing gamut-mapping function %s\n", gamut.function->name); + + // Require that black maps to black and white maps to white + float black[3] = { gamut.min_luma, 0.0f, 0.0f }; + float white[3] = { gamut.max_luma, 0.0f, 0.0f }; + pl_gamut_map_sample(black, &gamut); + pl_gamut_map_sample(white, &gamut); + REQUIRE_FEQ(black[0], gamut.min_luma, 1e-4); + REQUIRE_FEQ(black[1], 0.0f, 1e-4); + REQUIRE_FEQ(black[2], 0.0f, 1e-4); + if (gamut.function != &pl_gamut_map_darken) + REQUIRE_FEQ(white[0], gamut.max_luma, 1e-4); + REQUIRE_FEQ(white[1], 0.0f, 1e-4); 
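+        // (Components 1 and 2 are the chroma axes; they must stay zero for achromatic colors.)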
+ REQUIRE_FEQ(white[2], 0.0f, 1e-4); + } + + enum { LUT3D_SIZE = 65 }; // for benchmarking + struct pl_gamut_map_params perceptual = { + .function = &pl_gamut_map_perceptual, + .input_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_2020), + .output_gamut = *pl_raw_primaries_get(PL_COLOR_PRIM_BT_709), + .max_luma = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, 1.0f), + .lut_size_I = LUT3D_SIZE, + .lut_size_C = LUT3D_SIZE, + .lut_size_h = LUT3D_SIZE, + .lut_stride = 3, + + // Set strength to maximum, because otherwise the saturation mapping + // code will not fully apply, invalidating the following test + .constants.perceptual_strength = 1.0f, + }; + + // Test that primaries round-trip for perceptual gamut mapping + const pl_matrix3x3 rgb2lms_src = pl_ipt_rgb2lms(&perceptual.input_gamut); + const pl_matrix3x3 rgb2lms_dst = pl_ipt_rgb2lms(&perceptual.output_gamut); + const pl_matrix3x3 lms2rgb_dst = pl_ipt_lms2rgb(&perceptual.output_gamut); + static const float refpoints[][3] = { + {1, 0, 0}, {0, 1, 0}, {0, 0, 1}, + {0, 1, 1}, {1, 0, 1}, {1, 1, 0}, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(refpoints); i++) { + float c[3] = { refpoints[i][0], refpoints[i][1], refpoints[i][2] }; + float ref[3] = { refpoints[i][0], refpoints[i][1], refpoints[i][2] }; + printf("Testing primary: RGB {%.0f %.0f %.0f}\n", c[0], c[1], c[2]); + pl_matrix3x3_apply(&rgb2lms_src, c); + c[0] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, c[0]); + c[1] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, c[1]); + c[2] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, c[2]); + pl_matrix3x3_apply(&pl_ipt_lms2ipt, c); + printf("Before: ICh {%f %f %f}\n", + c[0], sqrtf(c[1]*c[1] + c[2]*c[2]), atan2f(c[2], c[1])); + pl_gamut_map_sample(c, &perceptual); + float rgb[3] = { c[0], c[1], c[2] }; + pl_matrix3x3_apply(&pl_ipt_ipt2lms, rgb); + rgb[0] = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, rgb[0]); + rgb[1] = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, rgb[1]); + rgb[2] = pl_hdr_rescale(PL_HDR_PQ, PL_HDR_NORM, rgb[2]); + pl_matrix3x3_apply(&lms2rgb_dst, rgb); + const float hue = atan2f(c[2], c[1]); + printf("After: ICh {%f %f %f} = RGB {%f %f %f}\n", + c[0], sqrtf(c[1]*c[1] + c[2]*c[2]), hue, rgb[0], rgb[1], rgb[2]); + pl_matrix3x3_apply(&rgb2lms_dst, ref); + ref[0] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, ref[0]); + ref[1] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, ref[1]); + ref[2] = pl_hdr_rescale(PL_HDR_NORM, PL_HDR_PQ, ref[2]); + pl_matrix3x3_apply(&pl_ipt_lms2ipt, ref); + const float hue_ref = atan2f(ref[2], ref[1]); + printf("Should be: ICh {%f %f %f}\n", + ref[0], sqrtf(ref[1]*ref[1] + ref[2]*ref[2]), hue_ref); + REQUIRE_FEQ(hue, hue_ref, 3.0e-3); + } + + float *tmp = malloc(sizeof(float[LUT3D_SIZE][LUT3D_SIZE][LUT3D_SIZE][3])); + if (tmp) { + pl_clock_t start = pl_clock_now(); + pl_gamut_map_generate(tmp, &perceptual); + pl_log_cpu_time(log, start, pl_clock_now(), "generating 3DLUT"); + free(tmp); + } + + pl_log_destroy(&log); +} diff --git a/src/tests/utils.c b/src/tests/utils.c new file mode 100644 index 0000000..73a9265 --- /dev/null +++ b/src/tests/utils.c @@ -0,0 +1,165 @@ +#include "tests.h" +#include "gpu.h" + +#include <libplacebo/utils/upload.h> + +int main() +{ + struct pl_bit_encoding bits = {0}; + struct pl_plane_data data = {0}; + + static const struct pl_bit_encoding bits0 = {0}; + static const struct pl_bit_encoding bits8 = { + .sample_depth = 8, + .color_depth = 8, + }; + + static const struct pl_bit_encoding bits16 = { + .sample_depth = 16, + .color_depth = 16, + }; + + static const struct pl_bit_encoding bits10_16 = { + .sample_depth = 16, + 
.color_depth = 10, + }; + + static const struct pl_bit_encoding bits10_16_6 = { + .sample_depth = 16, + .color_depth = 10, + .bit_shift = 6, + }; + +#define TEST_ALIGN(ref, ref_align, ref_bits, ...) \ + do { \ + pl_plane_data_from_mask(&data, (uint64_t[4]){ __VA_ARGS__ }); \ + REQUIRE_MEMEQ(&data, &ref, sizeof(ref)); \ + pl_plane_data_align(&data, &bits); \ + REQUIRE_MEMEQ(&data, &ref_align, sizeof(ref_align)); \ + REQUIRE_MEMEQ(&bits, &ref_bits, sizeof(bits)); \ + } while (0) + +#define TEST(ref, bits, ...) TEST_ALIGN(ref, ref, bits, __VA_ARGS__) + + static const struct pl_plane_data rgb8 = { + .component_size = {8, 8, 8}, + .component_map = {0, 1, 2}, + }; + + TEST(rgb8, bits8, 0xFF, 0xFF00, 0xFF0000); + + static const struct pl_plane_data bgra8 = { + .component_size = {8, 8, 8, 8}, + .component_map = {2, 1, 0, 3}, + }; + + TEST(bgra8, bits8, 0xFF0000, 0xFF00, 0xFF, 0xFF000000); + + static const struct pl_plane_data gr16 = { + .component_size = {16, 16}, + .component_map = {1, 0}, + }; + + TEST(gr16, bits16, 0xFFFF0000, 0xFFFF); + + static const struct pl_plane_data r10x6g10 = { + .component_size = {10, 10}, + .component_map = {1, 0}, // LSB -> MSB ordering + .component_pad = {0, 6}, + }; + + TEST_ALIGN(r10x6g10, gr16, bits10_16, 0x03FF0000, 0x03FF); + + static const struct pl_plane_data rgb565 = { + .component_size = {5, 6, 5}, + .component_map = {2, 1, 0}, // LSB -> MSB ordering + }; + + TEST(rgb565, bits0, 0xF800, 0x07E0, 0x001F); + + static const struct pl_plane_data rgba16 = { + .component_size = {16, 16, 16, 16}, + .component_map = {0, 1, 2, 3}, + }; + + TEST(rgba16, bits16, 0xFFFFllu, 0xFFFF0000llu, 0xFFFF00000000llu, 0xFFFF000000000000llu); + + static const struct pl_plane_data p010 = { + .component_size = {10, 10, 10}, + .component_map = {0, 1, 2}, + .component_pad = {6, 6, 6}, + }; + + static const struct pl_plane_data rgb16 = { + .component_size = {16, 16, 16}, + .component_map = {0, 1, 2}, + }; + + TEST_ALIGN(p010, rgb16, bits10_16_6, 0xFFC0llu, 0xFFC00000llu, 0xFFC000000000llu); + + // Test GLSL structure packing + struct pl_var vec1 = pl_var_float(""), + vec2 = pl_var_vec2(""), + vec3 = pl_var_vec3(""), + mat2 = pl_var_mat2(""), + mat3 = pl_var_mat3(""); + + struct pl_var_layout layout; + layout = pl_std140_layout(0, &vec2); + REQUIRE_CMP(layout.offset, ==, 0 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 2 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 2 * sizeof(float), "zu"); + + layout = pl_std140_layout(3 * sizeof(float), &vec3); + REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 3 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 3 * sizeof(float), "zu"); + + layout = pl_std140_layout(2 * sizeof(float), &mat3); + REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 3 * 4 * sizeof(float), "zu"); + + layout = pl_std430_layout(2 * sizeof(float), &mat3); + REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 4 * 3 * sizeof(float), "zu"); + + layout = pl_std140_layout(3 * sizeof(float), &vec1); + REQUIRE_CMP(layout.offset, ==, 3 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, sizeof(float), "zu"); + + struct pl_var vec2a = vec2; + vec2a.dim_a = 50; + + layout = pl_std140_layout(sizeof(float), &vec2a); + REQUIRE_CMP(layout.offset, ==, 4 * sizeof(float), "zu"); + 
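/* Worked example (an editorial sketch, not part of the upstream test) of the
 * std140 vs. std430 array rules that these assertions encode: for an array
 * such as `vec2 data[50]`, std140 rounds the array stride up to vec4
 * alignment (16 bytes), while std430 keeps the element's natural stride
 * (8 bytes).
 *
 *     struct pl_var v = pl_var_vec2("data"); // name is illustrative
 *     v.dim_a = 50;
 *
 *     struct pl_var_layout l140 = pl_std140_layout(sizeof(float), &v);
 *     // l140.offset == 16, l140.stride == 16, l140.size == 50 * 16
 *
 *     struct pl_var_layout l430 = pl_std430_layout(sizeof(float), &v);
 *     // l430.offset == 8,  l430.stride == 8,  l430.size == 50 * 8
 *
 * These byte counts correspond to the 4 * sizeof(float) and 2 * sizeof(float)
 * factors asserted around this point.
 */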
REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 50 * 4 * sizeof(float), "zu"); + + layout = pl_std430_layout(sizeof(float), &vec2a); + REQUIRE_CMP(layout.offset, ==, 2 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 2 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 50 * 2 * sizeof(float), "zu"); + + struct pl_var mat2a = mat2; + mat2a.dim_a = 20; + + layout = pl_std140_layout(5 * sizeof(float), &mat2a); + REQUIRE_CMP(layout.offset, ==, 8 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 4 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 20 * 2 * 4 * sizeof(float), "zu"); + + layout = pl_std430_layout(5 * sizeof(float), &mat2a); + REQUIRE_CMP(layout.offset, ==, 6 * sizeof(float), "zu"); + REQUIRE_CMP(layout.stride, ==, 2 * sizeof(float), "zu"); + REQUIRE_CMP(layout.size, ==, 20 * 2 * 2 * sizeof(float), "zu"); + + for (const struct pl_named_var *nvar = pl_var_glsl_types; nvar->glsl_name; nvar++) { + struct pl_var var = nvar->var; + REQUIRE_CMP(nvar->glsl_name, ==, pl_var_glsl_type_name(var), "s"); + var.dim_a = 100; + REQUIRE_CMP(nvar->glsl_name, ==, pl_var_glsl_type_name(var), "s"); + } +} diff --git a/src/tests/vulkan.c b/src/tests/vulkan.c new file mode 100644 index 0000000..476560a --- /dev/null +++ b/src/tests/vulkan.c @@ -0,0 +1,296 @@ +#include <vulkan/vulkan.h> + +#include "gpu_tests.h" +#include "vulkan/command.h" +#include "vulkan/gpu.h" + +#include <libplacebo/vulkan.h> + +static void vulkan_interop_tests(pl_vulkan pl_vk, + enum pl_handle_type handle_type) +{ + pl_gpu gpu = pl_vk->gpu; + printf("testing vulkan interop for handle type 0x%x\n", handle_type); + + if (gpu->export_caps.buf & handle_type) { + pl_buf buf = pl_buf_create(gpu, pl_buf_params( + .size = 1024, + .export_handle = handle_type, + )); + + REQUIRE(buf); + REQUIRE_HANDLE(buf->shared_mem, handle_type); + REQUIRE_CMP(buf->shared_mem.size, >=, buf->params.size, "zu"); + REQUIRE(pl_buf_export(gpu, buf)); + pl_buf_destroy(gpu, &buf); + } + + pl_fmt fmt = pl_find_fmt(gpu, PL_FMT_UNORM, 1, 0, 0, PL_FMT_CAP_BLITTABLE); + if (!fmt) + return; + + if (gpu->export_caps.sync & handle_type) { + pl_sync sync = pl_sync_create(gpu, handle_type); + pl_tex tex = pl_tex_create(gpu, pl_tex_params( + .w = 32, + .h = 32, + .format = fmt, + .blit_dst = true, + )); + + REQUIRE(sync); + REQUIRE(tex); + + // Note: For testing purposes, we have to fool pl_tex_export into + // thinking this texture is actually exportable. Just hack it in + // horribly. 
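// (The const-cast on the next line is a test-only shortcut: `pl_tex->params`
// is not meant to be mutated after creation. Outside of this test, an
// exportable texture would be requested up front by passing
// `.export_handle` to pl_tex_create(), as the interop API test below does.)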
+ ((struct pl_tex_params *) &tex->params)->export_handle = PL_HANDLE_DMA_BUF; + + REQUIRE(pl_tex_export(gpu, tex, sync)); + + // Re-use our internal helpers to signal this VkSemaphore + struct vk_ctx *vk = PL_PRIV(pl_vk); + struct vk_cmd *cmd = vk_cmd_begin(vk->pool_graphics, NULL); + REQUIRE(cmd); + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_NONE, (pl_vulkan_sem){ sync_vk->signal }); + REQUIRE(vk_cmd_submit(&cmd)); + + // Do something with the image again to "import" it + pl_tex_clear(gpu, tex, (float[4]){0}); + pl_gpu_finish(gpu); + REQUIRE(!pl_tex_poll(gpu, tex, 0)); + + pl_sync_destroy(gpu, &sync); + pl_tex_destroy(gpu, &tex); + } + + // Test interop API + if (gpu->export_caps.tex & handle_type) { + VkSemaphore sem = pl_vulkan_sem_create(gpu, pl_vulkan_sem_params( + .type = VK_SEMAPHORE_TYPE_TIMELINE, + .initial_value = 0, + )); + + pl_tex tex = pl_tex_create(gpu, pl_tex_params( + .w = 32, + .h = 32, + .format = fmt, + .blit_dst = true, + .export_handle = handle_type, + )); + + REQUIRE(sem); + REQUIRE(tex); + + REQUIRE(pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = tex, + .layout = VK_IMAGE_LAYOUT_GENERAL, + .qf = VK_QUEUE_FAMILY_EXTERNAL, + .semaphore = { sem, 1 }, + ))); + + pl_vulkan_release_ex(gpu, pl_vulkan_release_params( + .tex = tex, + .layout = VK_IMAGE_LAYOUT_GENERAL, + .qf = VK_QUEUE_FAMILY_EXTERNAL, + .semaphore = { sem, 1 }, + )); + + pl_tex_clear(gpu, tex, (float[4]){0}); + pl_gpu_finish(gpu); + REQUIRE(!pl_tex_poll(gpu, tex, 0)); + + pl_vulkan_sem_destroy(gpu, &sem); + pl_tex_destroy(gpu, &tex); + } +} + +static void vulkan_swapchain_tests(pl_vulkan vk, VkSurfaceKHR surf) +{ + if (!surf) + return; + + printf("testing vulkan swapchain\n"); + pl_gpu gpu = vk->gpu; + pl_swapchain sw; + sw = pl_vulkan_create_swapchain(vk, pl_vulkan_swapchain_params( + .surface = surf, + )); + REQUIRE(sw); + + // Attempt actually initializing the swapchain + int w = 640, h = 480; + REQUIRE(pl_swapchain_resize(sw, &w, &h)); + + for (int i = 0; i < 10; i++) { + struct pl_swapchain_frame frame; + REQUIRE(pl_swapchain_start_frame(sw, &frame)); + if (frame.fbo->params.blit_dst) + pl_tex_clear(gpu, frame.fbo, (float[4]){0}); + + // TODO: test this with an actual pl_renderer instance + struct pl_frame target; + pl_frame_from_swapchain(&target, &frame); + + REQUIRE(pl_swapchain_submit_frame(sw)); + pl_swapchain_swap_buffers(sw); + + // Try resizing the swapchain in the middle of rendering + if (i == 5) { + w = 320; + h = 240; + REQUIRE(pl_swapchain_resize(sw, &w, &h)); + } + } + + pl_swapchain_destroy(&sw); +} + +int main() +{ + pl_log log = pl_test_logger(); + pl_vk_inst inst = pl_vk_inst_create(log, pl_vk_inst_params( + .debug = true, + .debug_extra = true, + .get_proc_addr = vkGetInstanceProcAddr, + .opt_extensions = (const char *[]){ + VK_KHR_SURFACE_EXTENSION_NAME, + VK_EXT_HEADLESS_SURFACE_EXTENSION_NAME, + }, + .num_opt_extensions = 2, + )); + + if (!inst) + return SKIP; + + PL_VK_LOAD_FUN(inst->instance, EnumeratePhysicalDevices, inst->get_proc_addr); + PL_VK_LOAD_FUN(inst->instance, GetPhysicalDeviceProperties, inst->get_proc_addr); + + uint32_t num = 0; + EnumeratePhysicalDevices(inst->instance, &num, NULL); + if (!num) + return SKIP; + + VkPhysicalDevice *devices = calloc(num, sizeof(*devices)); + if (!devices) + return 1; + EnumeratePhysicalDevices(inst->instance, &num, devices); + + VkSurfaceKHR surf = VK_NULL_HANDLE; + + PL_VK_LOAD_FUN(inst->instance, CreateHeadlessSurfaceEXT, inst->get_proc_addr); + if (CreateHeadlessSurfaceEXT) { + 
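// VK_EXT_headless_surface creates a VkSurfaceKHR that is not backed by any
// window system, which lets the swapchain test run in headless CI
// environments; if the extension is missing, `surf` simply stays
// VK_NULL_HANDLE and the swapchain tests are skipped.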
VkHeadlessSurfaceCreateInfoEXT info = {
+ .sType = VK_STRUCTURE_TYPE_HEADLESS_SURFACE_CREATE_INFO_EXT,
+ };
+
+ VkResult res = CreateHeadlessSurfaceEXT(inst->instance, &info, NULL, &surf);
+ REQUIRE_CMP(res, ==, VK_SUCCESS, "u");
+ }
+
+ // Make sure choosing any device works
+ VkPhysicalDevice dev;
+ dev = pl_vulkan_choose_device(log, pl_vulkan_device_params(
+ .instance = inst->instance,
+ .get_proc_addr = inst->get_proc_addr,
+ .allow_software = true,
+ .surface = surf,
+ ));
+ if (!dev)
+ return SKIP;
+
+ // Test all attached devices
+ for (int i = 0; i < num; i++) {
+ VkPhysicalDeviceProperties props = {0};
+ GetPhysicalDeviceProperties(devices[i], &props);
+#ifndef CI_ALLOW_SW
+ if (props.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU) {
+ printf("Skipping device %d: %s\n", i, props.deviceName);
+ continue;
+ }
+#endif
+ printf("Testing device %d: %s\n", i, props.deviceName);
+
+ // Make sure we can choose this device by name
+ dev = pl_vulkan_choose_device(log, pl_vulkan_device_params(
+ .instance = inst->instance,
+ .get_proc_addr = inst->get_proc_addr,
+ .device_name = props.deviceName,
+ ));
+ REQUIRE_CMP(dev, ==, devices[i], "p");
+
+ struct pl_vulkan_params params = *pl_vulkan_params(
+ .instance = inst->instance,
+ .get_proc_addr = inst->get_proc_addr,
+ .device = devices[i],
+ .queue_count = 8, // test inter-queue stuff
+ .surface = surf,
+ );
+
+ pl_vulkan vk = pl_vulkan_create(log, &params);
+ if (!vk)
+ continue;
+
+ gpu_shader_tests(vk->gpu);
+ vulkan_swapchain_tests(vk, surf);
+
+ // Print heap statistics
+ pl_vk_print_heap(vk->gpu, PL_LOG_DEBUG);
+
+ // Test importing this context via the vulkan interop API
+ pl_vulkan vk2 = pl_vulkan_import(log, pl_vulkan_import_params(
+ .instance = vk->instance,
+ .get_proc_addr = inst->get_proc_addr,
+ .phys_device = vk->phys_device,
+ .device = vk->device,
+
+ .extensions = vk->extensions,
+ .num_extensions = vk->num_extensions,
+ .features = vk->features,
+ .queue_graphics = vk->queue_graphics,
+ .queue_compute = vk->queue_compute,
+ .queue_transfer = vk->queue_transfer,
+ ));
+ REQUIRE(vk2);
+ pl_vulkan_destroy(&vk2);
+
+ // Run these tests last because they disable some validation layers
+#ifdef PL_HAVE_UNIX
+ vulkan_interop_tests(vk, PL_HANDLE_FD);
+ vulkan_interop_tests(vk, PL_HANDLE_DMA_BUF);
+#endif
+#ifdef PL_HAVE_WIN32
+ vulkan_interop_tests(vk, PL_HANDLE_WIN32);
+ vulkan_interop_tests(vk, PL_HANDLE_WIN32_KMT);
+#endif
+ gpu_interop_tests(vk->gpu);
+ pl_vulkan_destroy(&vk);
+
+ // Re-run the same export/import tests with async queues disabled
+ params.async_compute = false;
+ params.async_transfer = false;
+ vk = pl_vulkan_create(log, &params);
+ REQUIRE(vk); // it succeeded the first time
+
+#ifdef PL_HAVE_UNIX
+ vulkan_interop_tests(vk, PL_HANDLE_FD);
+ vulkan_interop_tests(vk, PL_HANDLE_DMA_BUF);
+#endif
+#ifdef PL_HAVE_WIN32
+ vulkan_interop_tests(vk, PL_HANDLE_WIN32);
+ vulkan_interop_tests(vk, PL_HANDLE_WIN32_KMT);
+#endif
+ gpu_interop_tests(vk->gpu);
+ pl_vulkan_destroy(&vk);
+
+ // Reduce log spam after first tested device
+ pl_log_level_update(log, PL_LOG_INFO);
+ }
+
+ if (surf)
+ vkDestroySurfaceKHR(inst->instance, surf, NULL);
+ pl_vk_inst_destroy(&inst);
+ pl_log_destroy(&log);
+ free(devices);
+}
diff --git a/src/tone_mapping.c b/src/tone_mapping.c
new file mode 100644
index 0000000..f08bb58
--- /dev/null
+++ b/src/tone_mapping.c
@@ -0,0 +1,775 @@
+/*
+ * This file is part of libplacebo.
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <math.h> + +#include "common.h" + +#include <libplacebo/tone_mapping.h> + +#define fclampf(x, lo, hi) fminf(fmaxf(x, lo), hi) +static void fix_constants(struct pl_tone_map_constants *c) +{ + const float eps = 1e-6f; + c->knee_adaptation = fclampf(c->knee_adaptation, 0.0f, 1.0f); + c->knee_minimum = fclampf(c->knee_minimum, eps, 0.5f - eps); + c->knee_maximum = fclampf(c->knee_maximum, 0.5f + eps, 1.0f - eps); + c->knee_default = fclampf(c->knee_default, c->knee_minimum, c->knee_maximum); + c->knee_offset = fclampf(c->knee_offset, 0.5f, 2.0f); + c->slope_tuning = fclampf(c->slope_tuning, 0.0f, 10.0f); + c->slope_offset = fclampf(c->slope_offset, 0.0f, 1.0f); + c->spline_contrast = fclampf(c->spline_contrast, 0.0f, 1.5f); + c->reinhard_contrast = fclampf(c->reinhard_contrast, eps, 1.0f - eps); + c->linear_knee = fclampf(c->linear_knee, eps, 1.0f - eps); + c->exposure = fclampf(c->exposure, eps, 10.0f); +} + +static inline bool constants_equal(const struct pl_tone_map_constants *a, + const struct pl_tone_map_constants *b) +{ + pl_static_assert(sizeof(*a) % sizeof(float) == 0); + return !memcmp(a, b, sizeof(*a)); +} + +bool pl_tone_map_params_equal(const struct pl_tone_map_params *a, + const struct pl_tone_map_params *b) +{ + return a->function == b->function && + a->param == b->param && + a->input_scaling == b->input_scaling && + a->output_scaling == b->output_scaling && + a->lut_size == b->lut_size && + a->input_min == b->input_min && + a->input_max == b->input_max && + a->input_avg == b->input_avg && + a->output_min == b->output_min && + a->output_max == b->output_max && + constants_equal(&a->constants, &b->constants) && + pl_hdr_metadata_equal(&a->hdr, &b->hdr); +} + +bool pl_tone_map_params_noop(const struct pl_tone_map_params *p) +{ + float in_min = pl_hdr_rescale(p->input_scaling, PL_HDR_NITS, p->input_min); + float in_max = pl_hdr_rescale(p->input_scaling, PL_HDR_NITS, p->input_max); + float out_min = pl_hdr_rescale(p->output_scaling, PL_HDR_NITS, p->output_min); + float out_max = pl_hdr_rescale(p->output_scaling, PL_HDR_NITS, p->output_max); + bool can_inverse = p->function->map_inverse; + + return fabs(in_min - out_min) < 1e-4 && // no BPC + in_max < out_max + 1e-2 && // no range reduction + (out_max < in_max + 1e-2 || !can_inverse); // no inverse tone-mapping +} + +void pl_tone_map_params_infer(struct pl_tone_map_params *par) +{ + if (!par->function) + par->function = &pl_tone_map_clip; + + if (par->param) { + // Backwards compatibility for older API + if (par->function == &pl_tone_map_st2094_40 || par->function == &pl_tone_map_st2094_10) + par->constants.knee_adaptation = par->param; + if (par->function == &pl_tone_map_bt2390) + par->constants.knee_offset = par->param; + if (par->function == &pl_tone_map_spline) + par->constants.spline_contrast = par->param; + if (par->function == 
&pl_tone_map_reinhard) + par->constants.reinhard_contrast = par->param; + if (par->function == &pl_tone_map_mobius || par->function == &pl_tone_map_gamma) + par->constants.linear_knee = par->param; + if (par->function == &pl_tone_map_linear || par->function == &pl_tone_map_linear_light) + par->constants.exposure = par->param; + } + + fix_constants(&par->constants); + + // Constrain the input peak to be no less than target SDR white + float sdr = pl_hdr_rescale(par->output_scaling, par->input_scaling, par->output_max); + sdr = fminf(sdr, pl_hdr_rescale(PL_HDR_NITS, par->input_scaling, PL_COLOR_SDR_WHITE)); + par->input_max = fmaxf(par->input_max, sdr); + + // Constrain the output peak if function does not support inverse mapping + if (!par->function->map_inverse) + par->output_max = fminf(par->output_max, par->input_max); +} + +// Infer params and rescale to function scaling +static struct pl_tone_map_params fix_params(const struct pl_tone_map_params *params) +{ + struct pl_tone_map_params fixed = *params; + pl_tone_map_params_infer(&fixed); + + const struct pl_tone_map_function *fun = params->function; + fixed.input_scaling = fun->scaling; + fixed.output_scaling = fun->scaling; + fixed.input_min = pl_hdr_rescale(params->input_scaling, fun->scaling, fixed.input_min); + fixed.input_max = pl_hdr_rescale(params->input_scaling, fun->scaling, fixed.input_max); + fixed.input_avg = pl_hdr_rescale(params->input_scaling, fun->scaling, fixed.input_avg); + fixed.output_min = pl_hdr_rescale(params->output_scaling, fun->scaling, fixed.output_min); + fixed.output_max = pl_hdr_rescale(params->output_scaling, fun->scaling, fixed.output_max); + + return fixed; +} + +#define FOREACH_LUT(lut, V) \ + for (float *_iter = lut, *_end = lut + params->lut_size, V; \ + _iter < _end && ( V = *_iter, 1 ); *_iter++ = V) + +static void map_lut(float *lut, const struct pl_tone_map_params *params) +{ + if (params->output_max > params->input_max + 1e-4) { + // Inverse tone-mapping + pl_assert(params->function->map_inverse); + params->function->map_inverse(lut, params); + } else { + // Forward tone-mapping + params->function->map(lut, params); + } +} + +void pl_tone_map_generate(float *out, const struct pl_tone_map_params *params) +{ + struct pl_tone_map_params fixed = fix_params(params); + + // Generate input values evenly spaced in `params->input_scaling` + for (size_t i = 0; i < params->lut_size; i++) { + float x = (float) i / (params->lut_size - 1); + x = PL_MIX(params->input_min, params->input_max, x); + out[i] = pl_hdr_rescale(params->input_scaling, fixed.function->scaling, x); + } + + map_lut(out, &fixed); + + // Sanitize outputs and adapt back to `params->scaling` + for (size_t i = 0; i < params->lut_size; i++) { + float x = PL_CLAMP(out[i], fixed.output_min, fixed.output_max); + out[i] = pl_hdr_rescale(fixed.function->scaling, params->output_scaling, x); + } +} + +float pl_tone_map_sample(float x, const struct pl_tone_map_params *params) +{ + struct pl_tone_map_params fixed = fix_params(params); + fixed.lut_size = 1; + + x = PL_CLAMP(x, params->input_min, params->input_max); + x = pl_hdr_rescale(params->input_scaling, fixed.function->scaling, x); + map_lut(&x, &fixed); + x = PL_CLAMP(x, fixed.output_min, fixed.output_max); + x = pl_hdr_rescale(fixed.function->scaling, params->output_scaling, x); + return x; +} + +// Rescale from input-absolute to input-relative +static inline float rescale_in(float x, const struct pl_tone_map_params *params) +{ + return (x - params->input_min) / (params->input_max - 
params->input_min); +} + +// Rescale from input-absolute to output-relative +static inline float rescale(float x, const struct pl_tone_map_params *params) +{ + return (x - params->input_min) / (params->output_max - params->output_min); +} + +// Rescale from output-relative to output-absolute +static inline float rescale_out(float x, const struct pl_tone_map_params *params) +{ + return x * (params->output_max - params->output_min) + params->output_min; +} + +static inline float bt1886_eotf(float x, float min, float max) +{ + const float lb = powf(min, 1/2.4f); + const float lw = powf(max, 1/2.4f); + return powf((lw - lb) * x + lb, 2.4f); +} + +static inline float bt1886_oetf(float x, float min, float max) +{ + const float lb = powf(min, 1/2.4f); + const float lw = powf(max, 1/2.4f); + return (powf(x, 1/2.4f) - lb) / (lw - lb); +} + +static void noop(float *lut, const struct pl_tone_map_params *params) +{ + return; +} + +const struct pl_tone_map_function pl_tone_map_clip = { + .name = "clip", + .description = "No tone mapping (clip)", + .map = noop, + .map_inverse = noop, +}; + +// Helper function to pick a knee point (for suitable methods) based on the +// HDR10+ brightness metadata and scene brightness average matching. +// +// Inspired by SMPTE ST2094-10, with some modifications +static void st2094_pick_knee(float *out_src_knee, float *out_dst_knee, + const struct pl_tone_map_params *params) +{ + const float src_min = pl_hdr_rescale(params->input_scaling, PL_HDR_PQ, params->input_min); + const float src_max = pl_hdr_rescale(params->input_scaling, PL_HDR_PQ, params->input_max); + const float src_avg = pl_hdr_rescale(params->input_scaling, PL_HDR_PQ, params->input_avg); + const float dst_min = pl_hdr_rescale(params->output_scaling, PL_HDR_PQ, params->output_min); + const float dst_max = pl_hdr_rescale(params->output_scaling, PL_HDR_PQ, params->output_max); + + const float min_knee = params->constants.knee_minimum; + const float max_knee = params->constants.knee_maximum; + const float def_knee = params->constants.knee_default; + const float src_knee_min = PL_MIX(src_min, src_max, min_knee); + const float src_knee_max = PL_MIX(src_min, src_max, max_knee); + const float dst_knee_min = PL_MIX(dst_min, dst_max, min_knee); + const float dst_knee_max = PL_MIX(dst_min, dst_max, max_knee); + + // Choose source knee based on source scene brightness + float src_knee = PL_DEF(src_avg, PL_MIX(src_min, src_max, def_knee)); + src_knee = fclampf(src_knee, src_knee_min, src_knee_max); + + // Choose target adaptation point based on linearly re-scaling source knee + float target = (src_knee - src_min) / (src_max - src_min); + float adapted = PL_MIX(dst_min, dst_max, target); + + // Choose the destnation knee by picking the perceptual adaptation point + // between the source knee and the desired target. This moves the knee + // point, on the vertical axis, closer to the 1:1 (neutral) line. 
+ // + // Adjust the adaptation strength towards 1 based on how close the knee + // point is to its extreme values (min/max knee) + float tuning = 1.0f - pl_smoothstep(max_knee, def_knee, target) * + pl_smoothstep(min_knee, def_knee, target); + float adaptation = PL_MIX(params->constants.knee_adaptation, 1.0f, tuning); + float dst_knee = PL_MIX(src_knee, adapted, adaptation); + dst_knee = fclampf(dst_knee, dst_knee_min, dst_knee_max); + + *out_src_knee = pl_hdr_rescale(PL_HDR_PQ, params->input_scaling, src_knee); + *out_dst_knee = pl_hdr_rescale(PL_HDR_PQ, params->output_scaling, dst_knee); +} + +// Pascal's triangle +static const uint16_t binom[17][17] = { + {1}, + {1,1}, + {1,2,1}, + {1,3,3,1}, + {1,4,6,4,1}, + {1,5,10,10,5,1}, + {1,6,15,20,15,6,1}, + {1,7,21,35,35,21,7,1}, + {1,8,28,56,70,56,28,8,1}, + {1,9,36,84,126,126,84,36,9,1}, + {1,10,45,120,210,252,210,120,45,10,1}, + {1,11,55,165,330,462,462,330,165,55,11,1}, + {1,12,66,220,495,792,924,792,495,220,66,12,1}, + {1,13,78,286,715,1287,1716,1716,1287,715,286,78,13,1}, + {1,14,91,364,1001,2002,3003,3432,3003,2002,1001,364,91,14,1}, + {1,15,105,455,1365,3003,5005,6435,6435,5005,3003,1365,455,105,15,1}, + {1,16,120,560,1820,4368,8008,11440,12870,11440,8008,4368,1820,560,120,16,1}, +}; + +static inline float st2094_intercept(uint8_t N, float Kx, float Ky) +{ + if (Kx <= 0 || Ky >= 1) + return 1.0f / N; + + const float slope = Ky / Kx * (1 - Kx) / (1 - Ky); + return fminf(slope / N, 1.0f); +} + +static void st2094_40(float *lut, const struct pl_tone_map_params *params) +{ + const float D = params->output_max; + + // Allocate space for the adjusted bezier control points, plus endpoints + float P[17], Kx, Ky, T; + uint8_t N; + + if (params->hdr.ootf.num_anchors) { + + // Use bezier curve from metadata + Kx = PL_CLAMP(params->hdr.ootf.knee_x, 0, 1); + Ky = PL_CLAMP(params->hdr.ootf.knee_y, 0, 1); + T = PL_CLAMP(params->hdr.ootf.target_luma, params->input_min, params->input_max); + N = params->hdr.ootf.num_anchors + 1; + pl_assert(N < PL_ARRAY_SIZE(P)); + memcpy(P + 1, params->hdr.ootf.anchors, (N - 1) * sizeof(*P)); + P[0] = 0.0f; + P[N] = 1.0f; + + } else { + + // Missing metadata, default to simple brightness matching + float src_knee, dst_knee; + st2094_pick_knee(&src_knee, &dst_knee, params); + Kx = src_knee / params->input_max; + Ky = dst_knee / params->output_max; + + // Solve spline to match slope at knee intercept + const float slope = Ky / Kx * (1 - Kx) / (1 - Ky); + N = PL_CLAMP((int) ceilf(slope), 2, PL_ARRAY_SIZE(P) - 1); + P[0] = 0.0f; + P[1] = st2094_intercept(N, Kx, Ky); + for (int i = 2; i <= N; i++) + P[i] = 1.0f; + T = D; + + } + + if (D < T) { + + // Output display darker than OOTF target, make brighter + const float Dmin = 0.0f, u = fmaxf(0.0f, (D - Dmin) / (T - Dmin)); + + // Scale down the knee point to make more room for the OOTF + Kx *= u; + Ky *= u; + + // Make the slope of the knee more closely approximate a clip(), + // constrained to avoid exploding P[1] + const float beta = N * Kx / (1 - Kx); + const float Kxy = fminf(Kx * params->input_max / D, beta / (beta + 1)); + Ky = PL_MIX(Kxy, Ky, u); + + for (int p = 2; p <= N; p++) + P[p] = PL_MIX(1.0f, P[p], u); + + // Make the OOTF intercept linear as D -> Dmin + P[1] = PL_MIX(st2094_intercept(N, Kx, Ky), P[1], u); + + } else if (D > T) { + + // Output display brighter than OOTF target, make more linear + pl_assert(params->input_max > T); + const float w = powf(1 - (D - T) / (params->input_max - T), 1.4f); + + // Constrain the slope of the input knee to prevent it from 
+ // exploding and making the picture way too bright + Ky *= T / D; + + // Make the slope of the knee more linear by solving for f(Kx) = Kx + float Kxy = Kx * D / params->input_max; + Ky = PL_MIX(Kxy, Ky, w); + + for (int p = 2; p < N; p++) { + float anchor_lin = (float) p / N; + P[p] = PL_MIX(anchor_lin, P[p], w); + } + + // Make the OOTF intercept linear as D -> input_max + P[1] = PL_MIX(st2094_intercept(N, Kx, Ky), P[1], w); + + } + + pl_assert(Kx >= 0 && Kx <= 1); + pl_assert(Ky >= 0 && Ky <= 1); + + FOREACH_LUT(lut, x) { + x = bt1886_oetf(x, params->input_min, params->input_max); + x = bt1886_eotf(x, 0.0f, 1.0f); + + if (x <= Kx && Kx) { + // Linear section + x *= Ky / Kx; + } else { + // Bezier section + const float t = (x - Kx) / (1 - Kx); + + x = 0; // Bn + for (uint8_t p = 0; p <= N; p++) + x += binom[N][p] * powf(t, p) * powf(1 - t, N - p) * P[p]; + + x = Ky + (1 - Ky) * x; + } + + x = bt1886_oetf(x, 0.0f, 1.0f); + x = bt1886_eotf(x, params->output_min, params->output_max); + } +} + +const struct pl_tone_map_function pl_tone_map_st2094_40 = { + .name = "st2094-40", + .description = "SMPTE ST 2094-40 Annex B", + .param_desc = "Knee point target", + .param_min = 0.00f, + .param_def = 0.70f, + .param_max = 1.00f, + .scaling = PL_HDR_NITS, + .map = st2094_40, +}; + +static void st2094_10(float *lut, const struct pl_tone_map_params *params) +{ + float src_knee, dst_knee; + st2094_pick_knee(&src_knee, &dst_knee, params); + + const float x1 = params->input_min; + const float x3 = params->input_max; + const float x2 = src_knee; + + const float y1 = params->output_min; + const float y3 = params->output_max; + const float y2 = dst_knee; + + const pl_matrix3x3 cmat = {{ + { x2*x3*(y2 - y3), x1*x3*(y3 - y1), x1*x2*(y1 - y2) }, + { x3*y3 - x2*y2, x1*y1 - x3*y3, x2*y2 - x1*y1 }, + { x3 - x2, x1 - x3, x2 - x1 }, + }}; + + float coeffs[3] = { y1, y2, y3 }; + pl_matrix3x3_apply(&cmat, coeffs); + + const float k = 1.0 / (x3*y3*(x1 - x2) + x2*y2*(x3 - x1) + x1*y1*(x2 - x3)); + const float c1 = k * coeffs[0]; + const float c2 = k * coeffs[1]; + const float c3 = k * coeffs[2]; + + FOREACH_LUT(lut, x) + x = (c1 + c2 * x) / (1 + c3 * x); +} + +const struct pl_tone_map_function pl_tone_map_st2094_10 = { + .name = "st2094-10", + .description = "SMPTE ST 2094-10 Annex B.2", + .param_desc = "Knee point target", + .param_min = 0.00f, + .param_def = 0.70f, + .param_max = 1.00f, + .scaling = PL_HDR_NITS, + .map = st2094_10, +}; + +static void bt2390(float *lut, const struct pl_tone_map_params *params) +{ + const float minLum = rescale_in(params->output_min, params); + const float maxLum = rescale_in(params->output_max, params); + const float offset = params->constants.knee_offset; + const float ks = (1 + offset) * maxLum - offset; + const float bp = minLum > 0 ? fminf(1 / minLum, 4) : 4; + const float gain_inv = 1 + minLum / maxLum * powf(1 - maxLum, bp); + const float gain = maxLum < 1 ? 1 / gain_inv : 1; + + FOREACH_LUT(lut, x) { + x = rescale_in(x, params); + + // Piece-wise hermite spline + if (ks < 1) { + float tb = (x - ks) / (1 - ks); + float tb2 = tb * tb; + float tb3 = tb2 * tb; + float pb = (2 * tb3 - 3 * tb2 + 1) * ks + + (tb3 - 2 * tb2 + tb) * (1 - ks) + + (-2 * tb3 + 3 * tb2) * maxLum; + x = x < ks ? 
x : pb; + } + + // Black point adaptation + if (x < 1) { + x += minLum * powf(1 - x, bp); + x = gain * (x - minLum) + minLum; + } + + x = x * (params->input_max - params->input_min) + params->input_min; + } +} + +const struct pl_tone_map_function pl_tone_map_bt2390 = { + .name = "bt2390", + .description = "ITU-R BT.2390 EETF", + .scaling = PL_HDR_PQ, + .param_desc = "Knee offset", + .param_min = 0.50, + .param_def = 1.00, + .param_max = 2.00, + .map = bt2390, +}; + +static void bt2446a(float *lut, const struct pl_tone_map_params *params) +{ + const float phdr = 1 + 32 * powf(params->input_max / 10000, 1/2.4f); + const float psdr = 1 + 32 * powf(params->output_max / 10000, 1/2.4f); + + FOREACH_LUT(lut, x) { + x = powf(rescale_in(x, params), 1/2.4f); + x = logf(1 + (phdr - 1) * x) / logf(phdr); + + if (x <= 0.7399f) { + x = 1.0770f * x; + } else if (x < 0.9909f) { + x = (-1.1510f * x + 2.7811f) * x - 0.6302f; + } else { + x = 0.5f * x + 0.5f; + } + + x = (powf(psdr, x) - 1) / (psdr - 1); + x = bt1886_eotf(x, params->output_min, params->output_max); + } +} + +static void bt2446a_inv(float *lut, const struct pl_tone_map_params *params) +{ + FOREACH_LUT(lut, x) { + x = bt1886_oetf(x, params->input_min, params->input_max); + x *= 255.0; + if (x > 70) { + x = powf(x, (2.8305e-6f * x - 7.4622e-4f) * x + 1.2528f); + } else { + x = powf(x, (1.8712e-5f * x - 2.7334e-3f) * x + 1.3141f); + } + x = powf(x / 1000, 2.4f); + x = rescale_out(x, params); + } +} + +const struct pl_tone_map_function pl_tone_map_bt2446a = { + .name = "bt2446a", + .description = "ITU-R BT.2446 Method A", + .scaling = PL_HDR_NITS, + .map = bt2446a, + .map_inverse = bt2446a_inv, +}; + +static void spline(float *lut, const struct pl_tone_map_params *params) +{ + float src_pivot, dst_pivot; + st2094_pick_knee(&src_pivot, &dst_pivot, params); + + // Solve for linear knee (Pa = 0) + float slope = (dst_pivot - params->output_min) / + (src_pivot - params->input_min); + + // Tune the slope at the knee point slightly: raise it to a user-provided + // gamma exponent, multiplied by an extra tuning coefficient designed to + // make the slope closer to 1.0 when the difference in peaks is low, and + // closer to linear when the difference between peaks is high. + float ratio = params->input_max / params->output_max - 1.0f; + ratio = fclampf(params->constants.slope_tuning * ratio, + params->constants.slope_offset, + 1.0f + params->constants.slope_offset); + slope = powf(slope, (1.0f - params->constants.spline_contrast) * ratio); + + // Normalize everything the pivot to make the math easier + const float in_min = params->input_min - src_pivot; + const float in_max = params->input_max - src_pivot; + const float out_min = params->output_min - dst_pivot; + const float out_max = params->output_max - dst_pivot; + + // Solve P of order 2 for: + // P(in_min) = out_min + // P'(0.0) = slope + // P(0.0) = 0.0 + const float Pa = (out_min - slope * in_min) / (in_min * in_min); + const float Pb = slope; + + // Solve Q of order 3 for: + // Q(in_max) = out_max + // Q''(in_max) = 0.0 + // Q(0.0) = 0.0 + // Q'(0.0) = slope + const float t = 2 * in_max * in_max; + const float Qa = (slope * in_max - out_max) / (in_max * t); + const float Qb = -3 * (slope * in_max - out_max) / t; + const float Qc = slope; + + FOREACH_LUT(lut, x) { + x -= src_pivot; + x = x > 0 ? 
((Qa * x + Qb) * x + Qc) * x : (Pa * x + Pb) * x; + x += dst_pivot; + } +} + +const struct pl_tone_map_function pl_tone_map_spline = { + .name = "spline", + .description = "Single-pivot polynomial spline", + .param_desc = "Contrast", + .param_min = 0.00f, + .param_def = 0.50f, + .param_max = 1.50f, + .scaling = PL_HDR_PQ, + .map = spline, + .map_inverse = spline, +}; + +static void reinhard(float *lut, const struct pl_tone_map_params *params) +{ + const float peak = rescale(params->input_max, params), + contrast = params->constants.reinhard_contrast, + offset = (1.0 - contrast) / contrast, + scale = (peak + offset) / peak; + + FOREACH_LUT(lut, x) { + x = rescale(x, params); + x = x / (x + offset); + x *= scale; + x = rescale_out(x, params); + } +} + +const struct pl_tone_map_function pl_tone_map_reinhard = { + .name = "reinhard", + .description = "Reinhard", + .param_desc = "Contrast", + .param_min = 0.001, + .param_def = 0.50, + .param_max = 0.99, + .map = reinhard, +}; + +static void mobius(float *lut, const struct pl_tone_map_params *params) +{ + const float peak = rescale(params->input_max, params), + j = params->constants.linear_knee; + + // Solve for M(j) = j; M(peak) = 1.0; M'(j) = 1.0 + // where M(x) = scale * (x+a)/(x+b) + const float a = -j*j * (peak - 1.0f) / (j*j - 2.0f * j + peak); + const float b = (j*j - 2.0f * j * peak + peak) / + fmaxf(1e-6f, peak - 1.0f); + const float scale = (b*b + 2.0f * b*j + j*j) / (b - a); + + FOREACH_LUT(lut, x) { + x = rescale(x, params); + x = x <= j ? x : scale * (x + a) / (x + b); + x = rescale_out(x, params); + } +} + +const struct pl_tone_map_function pl_tone_map_mobius = { + .name = "mobius", + .description = "Mobius", + .param_desc = "Knee point", + .param_min = 0.00, + .param_def = 0.30, + .param_max = 0.99, + .map = mobius, +}; + +static inline float hable(float x) +{ + const float A = 0.15, B = 0.50, C = 0.10, D = 0.20, E = 0.02, F = 0.30; + return ((x * (A*x + C*B) + D*E) / (x * (A*x + B) + D*F)) - E/F; +} + +static void hable_map(float *lut, const struct pl_tone_map_params *params) +{ + const float peak = params->input_max / params->output_max, + scale = 1.0f / hable(peak); + + FOREACH_LUT(lut, x) { + x = bt1886_oetf(x, params->input_min, params->input_max); + x = bt1886_eotf(x, 0, peak); + x = scale * hable(x); + x = bt1886_oetf(x, 0, 1); + x = bt1886_eotf(x, params->output_min, params->output_max); + } +} + +const struct pl_tone_map_function pl_tone_map_hable = { + .name = "hable", + .description = "Filmic tone-mapping (Hable)", + .map = hable_map, +}; + +static void gamma_map(float *lut, const struct pl_tone_map_params *params) +{ + const float peak = rescale(params->input_max, params), + cutoff = params->constants.linear_knee, + gamma = logf(cutoff) / logf(cutoff / peak); + + FOREACH_LUT(lut, x) { + x = rescale(x, params); + x = x > cutoff ? 
powf(x / peak, gamma) : x; + x = rescale_out(x, params); + } +} + +const struct pl_tone_map_function pl_tone_map_gamma = { + .name = "gamma", + .description = "Gamma function with knee", + .param_desc = "Knee point", + .param_min = 0.001, + .param_def = 0.30, + .param_max = 1.00, + .map = gamma_map, +}; + +static void linear(float *lut, const struct pl_tone_map_params *params) +{ + const float gain = params->constants.exposure; + + FOREACH_LUT(lut, x) { + x = rescale_in(x, params); + x *= gain; + x = rescale_out(x, params); + } +} + +const struct pl_tone_map_function pl_tone_map_linear = { + .name = "linear", + .description = "Perceptually linear stretch", + .param_desc = "Exposure", + .param_min = 0.001, + .param_def = 1.00, + .param_max = 10.0, + .scaling = PL_HDR_PQ, + .map = linear, + .map_inverse = linear, +}; + +const struct pl_tone_map_function pl_tone_map_linear_light = { + .name = "linearlight", + .description = "Linear light stretch", + .param_desc = "Exposure", + .param_min = 0.001, + .param_def = 1.00, + .param_max = 10.0, + .scaling = PL_HDR_NORM, + .map = linear, + .map_inverse = linear, +}; + +const struct pl_tone_map_function * const pl_tone_map_functions[] = { + &pl_tone_map_clip, + &pl_tone_map_st2094_40, + &pl_tone_map_st2094_10, + &pl_tone_map_bt2390, + &pl_tone_map_bt2446a, + &pl_tone_map_spline, + &pl_tone_map_reinhard, + &pl_tone_map_mobius, + &pl_tone_map_hable, + &pl_tone_map_gamma, + &pl_tone_map_linear, + &pl_tone_map_linear_light, + NULL +}; + +const int pl_num_tone_map_functions = PL_ARRAY_SIZE(pl_tone_map_functions) - 1; + +const struct pl_tone_map_function *pl_find_tone_map_function(const char *name) +{ + for (int i = 0; i < pl_num_tone_map_functions; i++) { + if (strcmp(name, pl_tone_map_functions[i]->name) == 0) + return pl_tone_map_functions[i]; + } + + return NULL; +} diff --git a/src/ucrt_math.def b/src/ucrt_math.def new file mode 100644 index 0000000..f7d000d --- /dev/null +++ b/src/ucrt_math.def @@ -0,0 +1,292 @@ +LIBRARY api-ms-win-crt-math-l1-1-0 +EXPORTS +_Cbuild +_Cmulcc +_Cmulcr +_FCbuild +_FCmulcc +_FCmulcr +_LCbuild +_LCmulcc +_LCmulcr +__setusermatherr +_cabs +_chgsign +_chgsignf +_copysign +_copysignf +_d_int +_dclass +_dexp +_dlog +_dnorm +_dpcomp +_dpoly +_dscale +_dsign +_dsin +_dtest +_dunscale +_except1 +_fd_int +_fdclass +_fdexp +_fdlog +_fdnorm +_fdopen +_fdpcomp +_fdpoly +_fdscale +_fdsign +_fdsin +_fdtest +_fdunscale +_finite +_finitef +_fpclass +_fpclassf +_get_FMA3_enable +_hypot +_hypotf +_isnan +_isnanf +_j0 +_j1 +_jn +_ld_int +_ldclass +_ldexp +_ldlog +_ldpcomp +_ldpoly +_ldscale +_ldsign +_ldsin +_ldtest +_ldunscale +_logb +_logbf +_nextafter +_nextafterf +_scalb +_scalbf +_set_FMA3_enable +_y0 +_y1 +_yn +acos +acosf +acosh +acoshf +acoshl +asin +asinf +asinh +asinhf +asinhl +atan +atan2 +atan2f +atanf +atanh +atanhf +atanhl +cabs +cabsf +cabsl +cacos +cacosf +cacosh +cacoshf +cacoshl +cacosl +carg +cargf +cargl +casin +casinf +casinh +casinhf +casinhl +casinl +catan +catanf +catanh +catanhf +catanhl +catanl +cbrt +cbrtf +cbrtl +ccos +ccosf +ccosh +ccoshf +ccoshl +ccosl +ceil +ceilf +cexp +cexpf +cexpl +cimag +cimagf +cimagl +clog +clog10 +clog10f +clog10l +clogf +clogl +conj +conjf +conjl +copysign +copysignf +copysignl +cos +cosf +cosh +coshf +cpow +cpowf +cpowl +cproj +cprojf +cprojl +creal +crealf +creall +csin +csinf +csinh +csinhf +csinhl +csinl +csqrt +csqrtf +csqrtl +ctan +ctanf +ctanh +ctanhf +ctanhl +ctanl +erf +erfc +erfcf +erfcl +erff +erfl +exp +exp2 +exp2f +exp2l +expf +expm1 +expm1f +expm1l +fabs +fdim +fdimf 
+fdiml +floor +floorf +fma +fmaf +fmal +fmax +fmaxf +fmaxl +fmin +fminf +fminl +fmod +fmodf +frexp +hypot +ilogb +ilogbf +ilogbl +ldexp +lgamma +lgammaf +lgammal +llrint +llrintf +llrintl +llround +llroundf +llroundl +log +log10 +log10f +log1p +log1pf +log1pl +log2 +log2f +log2l +logb +logbf +logbl +logf +lrint +lrintf +lrintl +lround +lroundf +lroundl +modf +modff +nan +nanf +nanl +nearbyint +nearbyintf +nearbyintl +nextafter +nextafterf +nextafterl +nexttoward +nexttowardf +nexttowardl +norm +normf +norml +pow +powf +remainder +remainderf +remainderl +remquo +remquof +remquol +rint +rintf +rintl +round +roundf +roundl +scalbln +scalblnf +scalblnl +scalbn +scalbnf +scalbnl +sin +sinf +sinh +sinhf +sqrt +sqrtf +tan +tanf +tanh +tanhf +tgamma +tgammaf +tgammal +trunc +truncf +truncl diff --git a/src/utils/dolbyvision.c b/src/utils/dolbyvision.c new file mode 100644 index 0000000..3798532 --- /dev/null +++ b/src/utils/dolbyvision.c @@ -0,0 +1,63 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "common.h" +#include <libplacebo/utils/dolbyvision.h> + +#ifdef PL_HAVE_LIBDOVI +#include <libplacebo/tone_mapping.h> +#include <libdovi/rpu_parser.h> +#endif + +void pl_hdr_metadata_from_dovi_rpu(struct pl_hdr_metadata *out, + const uint8_t *buf, size_t size) +{ +#ifdef PL_HAVE_LIBDOVI + if (buf && size) { + DoviRpuOpaque *rpu = + dovi_parse_unspec62_nalu(buf, size); + const DoviRpuDataHeader *header = dovi_rpu_get_header(rpu); + + if (header && header->vdr_dm_metadata_present_flag) { + // Profile 4 reshaping isn't done as it is a dual layer format. + // However there are still unknowns on its EOTF, so it cannot be enabled. + // + // For profile 7, the brightness metadata can still be used as most + // titles are going to have accurate metadata<->image brightness, + // with the exception of some titles that require the enhancement layer + // to be processed to restore the intended brightness, which would then + // match the metadata values. + if (header->guessed_profile == 4) { + goto done; + } + + const DoviVdrDmData *vdr_dm_data = dovi_rpu_get_vdr_dm_data(rpu); + if (vdr_dm_data->dm_data.level1) { + const DoviExtMetadataBlockLevel1 *l1 = vdr_dm_data->dm_data.level1; + out->max_pq_y = l1->max_pq / 4095.0f; + out->avg_pq_y = l1->avg_pq / 4095.0f; + } + + dovi_rpu_free_vdr_dm_data(vdr_dm_data); + } + + done: + dovi_rpu_free_header(header); + dovi_rpu_free(rpu); + } +#endif +} diff --git a/src/utils/frame_queue.c b/src/utils/frame_queue.c new file mode 100644 index 0000000..0155983 --- /dev/null +++ b/src/utils/frame_queue.c @@ -0,0 +1,1030 @@ +/* + * This file is part of libplacebo. 
+ * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <errno.h> +#include <math.h> + +#include "common.h" +#include "log.h" +#include "pl_thread.h" + +#include <libplacebo/utils/frame_queue.h> + +struct cache_entry { + pl_tex tex[4]; +}; + +struct entry { + pl_rc_t rc; + double pts; + struct cache_entry cache; + struct pl_source_frame src; + struct pl_frame frame; + uint64_t signature; + bool mapped; + bool ok; + + // for interlaced frames + enum pl_field field; + struct entry *primary; + struct entry *prev, *next; + bool dirty; +}; + +// Hard limits for vsync timing validity +#define MIN_FPS 10 +#define MAX_FPS 400 + +// Limits for FPS estimation state +#define MAX_SAMPLES 32 +#define MIN_SAMPLES 4 + +// Stickiness to prevent `interpolation_threshold` oscillation +#define THRESHOLD_MAX_RATIO 0.3 +#define THRESHOLD_FRAMES 5 + +// Maximum number of not-yet-mapped frames to allow queueing in advance +#define PREFETCH_FRAMES 2 + +struct pool { + float samples[MAX_SAMPLES]; + float estimate; + float sum; + int idx; + int num; + int total; +}; + +struct pl_queue_t { + pl_gpu gpu; + pl_log log; + + // For multi-threading, we use two locks. The `lock_weak` guards the queue + // state itself. The `lock_strong` has a bigger scope and should be held + // for the duration of any functions that expect the queue state to + // remain more or less valid (with the exception of adding new members). + // + // In particular, `pl_queue_reset` and `pl_queue_update` will take + // the strong lock, while `pl_queue_push_*` will only take the weak + // lock. 
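/* A sketch of the threading pattern this two-lock design supports
 * (illustrative; decode_frame() is a hypothetical helper, and the
 * pl_queue_update() call is abridged rather than quoted from the header):
 *
 *     // decoder / demuxer thread: weak lock only, may block until there
 *     // is room in the queue
 *     struct pl_source_frame src;
 *     while (decode_frame(&src))
 *         pl_queue_push_block(queue, UINT64_MAX, &src);
 *     pl_queue_push(queue, NULL); // signal EOF
 *
 *     // render thread, once per vsync: takes the strong lock
 *     pl_queue_update(queue, &mix, ...);
 *
 *     // seek/flush, from either thread: takes both locks
 *     pl_queue_reset(queue);
 */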
+ pl_mutex lock_strong; + pl_mutex lock_weak; + pl_cond wakeup; + + // Frame queue and state + PL_ARRAY(struct entry *) queue; + uint64_t signature; + int threshold_frames; + bool want_frame; + bool eof; + + // Average vsync/frame fps estimation state + struct pool vps, fps; + float reported_vps; + float reported_fps; + double prev_pts; + + // Storage for temporary arrays + PL_ARRAY(uint64_t) tmp_sig; + PL_ARRAY(float) tmp_ts; + PL_ARRAY(const struct pl_frame *) tmp_frame; + + // Queue of GPU objects to reuse + PL_ARRAY(struct cache_entry) cache; +}; + +pl_queue pl_queue_create(pl_gpu gpu) +{ + pl_queue p = pl_alloc_ptr(NULL, p); + *p = (struct pl_queue_t) { + .gpu = gpu, + .log = gpu->log, + }; + + pl_mutex_init(&p->lock_strong); + pl_mutex_init(&p->lock_weak); + int ret = pl_cond_init(&p->wakeup); + if (ret) { + PL_ERR(p, "Failed to init conditional variable: %d", ret); + return NULL; + } + return p; +} + +static void recycle_cache(pl_queue p, struct cache_entry *cache, bool recycle) +{ + bool has_textures = false; + for (int i = 0; i < PL_ARRAY_SIZE(cache->tex); i++) { + if (!cache->tex[i]) + continue; + + has_textures = true; + if (recycle) { + pl_tex_invalidate(p->gpu, cache->tex[i]); + } else { + pl_tex_destroy(p->gpu, &cache->tex[i]); + } + } + + if (recycle && has_textures) + PL_ARRAY_APPEND(p, p->cache, *cache); + + memset(cache, 0, sizeof(*cache)); // sanity +} + +static void entry_deref(pl_queue p, struct entry **pentry, bool recycle) +{ + struct entry *entry = *pentry; + *pentry = NULL; + if (!entry || !pl_rc_deref(&entry->rc)) + return; + + if (!entry->mapped && entry->src.discard) { + PL_TRACE(p, "Discarding unused frame id %"PRIu64" with PTS %f", + entry->signature, entry->src.pts); + entry->src.discard(&entry->src); + } + + if (entry->mapped && entry->ok && entry->src.unmap) { + PL_TRACE(p, "Unmapping frame id %"PRIu64" with PTS %f", + entry->signature, entry->src.pts); + entry->src.unmap(p->gpu, &entry->frame, &entry->src); + } + + recycle_cache(p, &entry->cache, recycle); + pl_free(entry); +} + +static struct entry *entry_ref(struct entry *entry) +{ + pl_rc_ref(&entry->rc); + return entry; +} + +static void entry_cull(pl_queue p, struct entry *entry, bool recycle) +{ + // Forcibly clean up references to prev/next frames, even if `entry` has + // remaining refs pointing at it. This is to prevent cyclic references. 
+ entry_deref(p, &entry->primary, recycle); + entry_deref(p, &entry->prev, recycle); + entry_deref(p, &entry->next, recycle); + entry_deref(p, &entry, recycle); +} + +void pl_queue_destroy(pl_queue *queue) +{ + pl_queue p = *queue; + if (!p) + return; + + for (int n = 0; n < p->queue.num; n++) + entry_cull(p, p->queue.elem[n], false); + for (int n = 0; n < p->cache.num; n++) { + for (int i = 0; i < PL_ARRAY_SIZE(p->cache.elem[n].tex); i++) + pl_tex_destroy(p->gpu, &p->cache.elem[n].tex[i]); + } + + pl_cond_destroy(&p->wakeup); + pl_mutex_destroy(&p->lock_weak); + pl_mutex_destroy(&p->lock_strong); + pl_free(p); + *queue = NULL; +} + +void pl_queue_reset(pl_queue p) +{ + pl_mutex_lock(&p->lock_strong); + pl_mutex_lock(&p->lock_weak); + + for (int i = 0; i < p->queue.num; i++) + entry_cull(p, p->queue.elem[i], false); + + *p = (struct pl_queue_t) { + .gpu = p->gpu, + .log = p->log, + + // Reuse lock objects + .lock_strong = p->lock_strong, + .lock_weak = p->lock_weak, + .wakeup = p->wakeup, + + // Explicitly preserve allocations + .queue.elem = p->queue.elem, + .tmp_sig.elem = p->tmp_sig.elem, + .tmp_ts.elem = p->tmp_ts.elem, + .tmp_frame.elem = p->tmp_frame.elem, + + // Reuse GPU object cache entirely + .cache = p->cache, + }; + + pl_cond_signal(&p->wakeup); + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); +} + +static inline float delta(float old, float new) +{ + return fabsf((new - old) / PL_MIN(new, old)); +} + +static inline void default_estimate(struct pool *pool, float val) +{ + if (!pool->estimate && isnormal(val) && val > 0.0) + pool->estimate = val; +} + +static inline void update_estimate(struct pool *pool, float cur) +{ + if (pool->num) { + static const float max_delta = 0.3; + if (delta(pool->sum / pool->num, cur) > max_delta) { + pool->sum = 0.0; + pool->num = pool->idx = 0; + } + } + + if (pool->num++ == MAX_SAMPLES) { + pool->sum -= pool->samples[pool->idx]; + pool->num--; + } + + pool->sum += pool->samples[pool->idx] = cur; + pool->idx = (pool->idx + 1) % MAX_SAMPLES; + pool->total++; + + if (pool->total < MIN_SAMPLES || pool->num >= MIN_SAMPLES) + pool->estimate = pool->sum / pool->num; +} + +static void queue_push(pl_queue p, const struct pl_source_frame *src) +{ + if (p->eof && !src) + return; // ignore duplicate EOF + + if (p->eof && src) { + PL_INFO(p, "Received frame after EOF signaled... discarding frame!"); + if (src->discard) + src->discard(src); + return; + } + + pl_cond_signal(&p->wakeup); + + if (!src) { + PL_TRACE(p, "Received EOF, draining frame queue..."); + p->eof = true; + p->want_frame = false; + return; + } + + // Update FPS estimates if possible/reasonable + default_estimate(&p->fps, src->first_field ? 
src->duration / 2 : src->duration); + if (p->queue.num) { + double last_pts = p->queue.elem[p->queue.num - 1]->pts; + float delta = src->pts - last_pts; + if (delta <= 0.0f) { + PL_DEBUG(p, "Non monotonically increasing PTS %f -> %f", last_pts, src->pts); + } else if (p->fps.estimate && delta > 10.0 * p->fps.estimate) { + PL_DEBUG(p, "Discontinuous source PTS jump %f -> %f", last_pts, src->pts); + } else { + update_estimate(&p->fps, delta); + } + } else if (src->pts != 0) { + PL_DEBUG(p, "First frame received with non-zero PTS %f", src->pts); + } + + struct entry *entry = pl_alloc_ptr(NULL, entry); + *entry = (struct entry) { + .signature = p->signature++, + .pts = src->pts, + .src = *src, + }; + pl_rc_init(&entry->rc); + PL_ARRAY_POP(p->cache, &entry->cache); + PL_TRACE(p, "Added new frame id %"PRIu64" with PTS %f", + entry->signature, entry->pts); + + // Insert new entry into the correct spot in the queue, sorted by PTS + for (int i = p->queue.num;; i--) { + if (i == 0 || p->queue.elem[i - 1]->pts <= entry->pts) { + if (src->first_field == PL_FIELD_NONE) { + // Progressive + PL_ARRAY_INSERT_AT(p, p->queue, i, entry); + break; + } else { + // Interlaced + struct entry *prev = i > 0 ? p->queue.elem[i - 1] : NULL; + struct entry *next = i < p->queue.num ? p->queue.elem[i] : NULL; + struct entry *entry2 = pl_zalloc_ptr(NULL, entry2); + pl_rc_init(&entry2->rc); + if (next) { + entry2->pts = (entry->pts + next->pts) / 2; + } else if (src->duration) { + entry2->pts = entry->pts + src->duration / 2; + } else if (p->fps.estimate) { + entry2->pts = entry->pts + p->fps.estimate; + } else { + PL_ERR(p, "Frame with PTS %f specified as interlaced, but " + "no FPS information known yet! Please specify a " + "valid `pl_source_frame.duration`. Treating as " + "progressive...", src->pts); + PL_ARRAY_INSERT_AT(p, p->queue, i, entry); + pl_free(entry2); + break; + } + + entry->field = src->first_field; + entry2->primary = entry_ref(entry); + entry2->field = pl_field_other(entry->field); + entry2->signature = p->signature++; + + PL_TRACE(p, "Added second field id %"PRIu64" with PTS %f", + entry2->signature, entry2->pts); + + // Link previous/next frames + if (prev) { + entry->prev = entry_ref(PL_DEF(prev->primary, prev)); + entry2->prev = entry_ref(PL_DEF(prev->primary, prev)); + // Retroactively re-link the previous frames that should + // be referencing this frame + for (int j = i - 1; j >= 0; --j) { + struct entry *e = p->queue.elem[j]; + if (e != prev && e != prev->primary) + break; + entry_deref(p, &e->next, true); + e->next = entry_ref(entry); + if (e->dirty) { // reset signature to signal change + e->signature = p->signature++; + e->dirty = false; + } + } + } + + if (next) { + entry->next = entry_ref(PL_DEF(next->primary, next)); + entry2->next = entry_ref(PL_DEF(next->primary, next)); + for (int j = i; j < p->queue.num; j++) { + struct entry *e = p->queue.elem[j]; + if (e != next && e != next->primary) + break; + entry_deref(p, &e->prev, true); + e->prev = entry_ref(entry); + if (e->dirty) { + e->signature = p->signature++; + e->dirty = false; + } + } + } + + PL_ARRAY_INSERT_AT(p, p->queue, i, entry); + PL_ARRAY_INSERT_AT(p, p->queue, i+1, entry2); + break; + } + } + } + + p->want_frame = false; +} + +void pl_queue_push(pl_queue p, const struct pl_source_frame *frame) +{ + pl_mutex_lock(&p->lock_weak); + queue_push(p, frame); + pl_mutex_unlock(&p->lock_weak); +} + +static inline bool entry_mapped(struct entry *entry) +{ + return entry->mapped || (entry->primary && entry->primary->mapped); +} + 
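/* Worked example of the prefetch heuristic implemented below (illustrative
 * numbers): with a 60 Hz display (vps.estimate = 1/60 s) and 120 fps source
 * material (fps.estimate = 1/120 s), ceilf(vps / fps) - 1 = 1, so up to
 * PREFETCH_FRAMES + 1 = 3 not-yet-mapped frames are wanted in advance. For
 * sources at or below the display rate the extra term is 0, and the default
 * of PREFETCH_FRAMES (2) applies.
 */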
+static bool queue_has_room(pl_queue p) +{ + if (p->want_frame) + return true; + + int wanted_frames = PREFETCH_FRAMES; + if (p->fps.estimate && p->vps.estimate && p->vps.estimate <= 1.0f / MIN_FPS) + wanted_frames += ceilf(p->vps.estimate / p->fps.estimate) - 1; + + // Examine the queue tail + for (int i = p->queue.num - 1; i >= 0; i--) { + if (entry_mapped(p->queue.elem[i])) + return true; + if (p->queue.num - i >= wanted_frames) + return false; + } + + return true; +} + +bool pl_queue_push_block(pl_queue p, uint64_t timeout, + const struct pl_source_frame *frame) +{ + pl_mutex_lock(&p->lock_weak); + if (!timeout || !frame || p->eof) + goto skip_blocking; + + while (!queue_has_room(p) && !p->eof) { + if (pl_cond_timedwait(&p->wakeup, &p->lock_weak, timeout) == ETIMEDOUT) { + pl_mutex_unlock(&p->lock_weak); + return false; + } + } + +skip_blocking: + + queue_push(p, frame); + pl_mutex_unlock(&p->lock_weak); + return true; +} + +static void report_estimates(pl_queue p) +{ + if (p->fps.total >= MIN_SAMPLES && p->vps.total >= MIN_SAMPLES) { + if (p->reported_fps && p->reported_vps) { + // Only re-report the estimates if they've changed considerably + // from the previously reported values + static const float report_delta = 0.3f; + float delta_fps = delta(p->reported_fps, p->fps.estimate); + float delta_vps = delta(p->reported_vps, p->vps.estimate); + if (delta_fps < report_delta && delta_vps < report_delta) + return; + } + + PL_INFO(p, "Estimated source FPS: %.3f, display FPS: %.3f", + 1.0 / p->fps.estimate, 1.0 / p->vps.estimate); + + p->reported_fps = p->fps.estimate; + p->reported_vps = p->vps.estimate; + } +} + +// note: may add more than one frame, since it releases the lock +static enum pl_queue_status get_frame(pl_queue p, const struct pl_queue_params *params) +{ + if (p->eof) + return PL_QUEUE_EOF; + + if (!params->get_frame) { + if (!params->timeout) + return PL_QUEUE_MORE; + + p->want_frame = true; + pl_cond_signal(&p->wakeup); + + while (p->want_frame) { + if (pl_cond_timedwait(&p->wakeup, &p->lock_weak, params->timeout) == ETIMEDOUT) + return PL_QUEUE_MORE; + } + + return p->eof ? PL_QUEUE_EOF : PL_QUEUE_OK; + } + + // Don't hold the weak mutex while calling into `get_frame`, to allow + // `pl_queue_push` to run concurrently while we're waiting for frames + pl_mutex_unlock(&p->lock_weak); + + struct pl_source_frame src; + enum pl_queue_status ret; + switch ((ret = params->get_frame(&src, params))) { + case PL_QUEUE_OK: + pl_queue_push(p, &src); + break; + case PL_QUEUE_EOF: + pl_queue_push(p, NULL); + break; + case PL_QUEUE_MORE: + case PL_QUEUE_ERR: + break; + } + + pl_mutex_lock(&p->lock_weak); + return ret; +} + +static inline bool map_frame(pl_queue p, struct entry *entry) +{ + if (!entry->mapped) { + PL_TRACE(p, "Mapping frame id %"PRIu64" with PTS %f", + entry->signature, entry->pts); + entry->mapped = true; + entry->ok = entry->src.map(p->gpu, entry->cache.tex, + &entry->src, &entry->frame); + if (!entry->ok) + PL_ERR(p, "Failed mapping frame id %"PRIu64" with PTS %f", + entry->signature, entry->pts); + } + + return entry->ok; +} + +static bool map_entry(pl_queue p, struct entry *entry) +{ + bool ok = map_frame(p, entry->primary ? 
entry->primary : entry); + if (entry->prev) + ok &= map_frame(p, entry->prev); + if (entry->next) + ok &= map_frame(p, entry->next); + if (!ok) + return false; + + if (entry->primary) + entry->frame = entry->primary->frame; + + if (entry->field) { + entry->frame.field = entry->field; + entry->frame.first_field = PL_DEF(entry->primary, entry)->src.first_field; + entry->frame.prev = entry->prev ? &entry->prev->frame : NULL; + entry->frame.next = entry->next ? &entry->next->frame : NULL; + entry->dirty = true; + } + + return true; +} + +static bool entry_complete(struct entry *entry) +{ + return entry->field ? !!entry->next : true; +} + +// Advance the queue as needed to make sure idx 0 is the last frame before +// `pts`, and idx 1 is the first frame after `pts` (unless this is the last). +// +// Returns PL_QUEUE_OK only if idx 0 is still legal under ZOH semantics. +static enum pl_queue_status advance(pl_queue p, double pts, + const struct pl_queue_params *params) +{ + // Cull all frames except the last frame before `pts` + int culled = 0; + for (int i = 1; i < p->queue.num; i++) { + if (p->queue.elem[i]->pts <= pts) { + entry_cull(p, p->queue.elem[i - 1], true); + culled++; + } + } + PL_ARRAY_REMOVE_RANGE(p->queue, 0, culled); + + // Keep adding new frames until we find one in the future, or EOF + enum pl_queue_status ret = PL_QUEUE_OK; + while (p->queue.num < 2) { + switch ((ret = get_frame(p, params))) { + case PL_QUEUE_ERR: + return ret; + case PL_QUEUE_EOF: + if (!p->queue.num) + return ret; + goto done; + case PL_QUEUE_MORE: + case PL_QUEUE_OK: + while (p->queue.num > 1 && p->queue.elem[1]->pts <= pts) { + entry_cull(p, p->queue.elem[0], true); + PL_ARRAY_REMOVE_AT(p->queue, 0); + } + if (ret == PL_QUEUE_MORE) + return ret; + continue; + } + } + + if (!entry_complete(p->queue.elem[1])) { + switch (get_frame(p, params)) { + case PL_QUEUE_ERR: + return PL_QUEUE_ERR; + case PL_QUEUE_MORE: + ret = PL_QUEUE_MORE; + // fall through + case PL_QUEUE_EOF: + case PL_QUEUE_OK: + goto done; + } + } + +done: + if (p->eof && p->queue.num == 1) { + if (p->queue.elem[0]->pts == 0.0 || !p->fps.estimate) { + // If the last frame has PTS 0.0, or we have no FPS estimate, then + // this is probably a single-frame file, in which case we want to + // extend the ZOH to infinity, rather than returning. Not a perfect + // heuristic, but w/e + return PL_QUEUE_OK; + } + + // Last frame is held for an extra `p->fps.estimate` duration, + // afterwards this function just returns EOF. 
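+ // For example, with a ~24 fps source (fps.estimate ~= 0.042) whose final
+ // frame has PTS 10.0, any request with pts below ~10.042 keeps returning
+ // that frame, while later requests cull it and report PL_QUEUE_EOF.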
+ if (pts < p->queue.elem[0]->pts + p->fps.estimate) { + ret = PL_QUEUE_OK; + } else { + entry_cull(p, p->queue.elem[0], true); + p->queue.num = 0; + return PL_QUEUE_EOF; + } + } + + pl_assert(p->queue.num); + return ret; +} + +static inline enum pl_queue_status point(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + if (!p->queue.num) { + *mix = (struct pl_frame_mix) {0}; + return PL_QUEUE_MORE; + } + + // Find closest frame (nearest neighbour semantics) + struct entry *entry = p->queue.elem[0]; + if (entry->pts > params->pts) { // first frame not visible yet + *mix = (struct pl_frame_mix) {0}; + return PL_QUEUE_OK; + } + + double best = fabs(entry->pts - params->pts); + for (int i = 1; i < p->queue.num; i++) { + double dist = fabs(p->queue.elem[i]->pts - params->pts); + if (dist < best) { + entry = p->queue.elem[i]; + best = dist; + continue; + } else { + break; + } + } + + if (!map_entry(p, entry)) + return PL_QUEUE_ERR; + + // Return a mix containing only this single frame + p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0; + PL_ARRAY_APPEND(p, p->tmp_sig, entry->signature); + PL_ARRAY_APPEND(p, p->tmp_frame, &entry->frame); + PL_ARRAY_APPEND(p, p->tmp_ts, 0.0); + *mix = (struct pl_frame_mix) { + .num_frames = 1, + .frames = p->tmp_frame.elem, + .signatures = p->tmp_sig.elem, + .timestamps = p->tmp_ts.elem, + .vsync_duration = 1.0, + }; + + PL_TRACE(p, "Showing single frame id %"PRIu64" with PTS %f for target PTS %f", + entry->signature, entry->pts, params->pts); + + report_estimates(p); + return PL_QUEUE_OK; +} + +// Present a single frame as appropriate for `pts` +static enum pl_queue_status nearest(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + enum pl_queue_status ret; + switch ((ret = advance(p, params->pts, params))) { + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + case PL_QUEUE_OK: + case PL_QUEUE_MORE: + if (mix && point(p, mix, params) == PL_QUEUE_ERR) + return PL_QUEUE_ERR; + return ret; + } + + pl_unreachable(); +} + +// Special case of `interpolate` for radius = 0, in which case we need exactly +// the previous frame and the following frame +static enum pl_queue_status oversample(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + enum pl_queue_status ret; + switch ((ret = advance(p, params->pts, params))) { + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + case PL_QUEUE_OK: + break; + case PL_QUEUE_MORE: + if (!p->queue.num) { + if (mix) + *mix = (struct pl_frame_mix) {0}; + return ret; + } + break; + } + + if (!mix) + return PL_QUEUE_OK; + + // Can't oversample with only a single frame, fall back to point sampling + if (p->queue.num < 2 || p->queue.elem[0]->pts > params->pts) { + if (point(p, mix, params) != PL_QUEUE_OK) + return PL_QUEUE_ERR; + return ret; + } + + struct entry *entries[2] = { p->queue.elem[0], p->queue.elem[1] }; + pl_assert(entries[0]->pts <= params->pts); + pl_assert(entries[1]->pts >= params->pts); + + // Returning a mix containing both of these two frames + p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0; + for (int i = 0; i < 2; i++) { + if (!map_entry(p, entries[i])) + return PL_QUEUE_ERR; + float ts = (entries[i]->pts - params->pts) / p->fps.estimate; + PL_ARRAY_APPEND(p, p->tmp_sig, entries[i]->signature); + PL_ARRAY_APPEND(p, p->tmp_frame, &entries[i]->frame); + PL_ARRAY_APPEND(p, p->tmp_ts, ts); + } + + *mix = (struct pl_frame_mix) { + .num_frames = 2, + .frames = p->tmp_frame.elem, + .signatures = p->tmp_sig.elem, + 
.timestamps = p->tmp_ts.elem, + .vsync_duration = p->vps.estimate / p->fps.estimate, + }; + + PL_TRACE(p, "Oversampling 2 frames for target PTS %f:", params->pts); + for (int i = 0; i < mix->num_frames; i++) + PL_TRACE(p, " id %"PRIu64" ts %f", mix->signatures[i], mix->timestamps[i]); + + report_estimates(p); + return ret; +} + +// Present a mixture of frames, relative to the vsync ratio +static enum pl_queue_status interpolate(pl_queue p, struct pl_frame_mix *mix, + const struct pl_queue_params *params) +{ + // No FPS estimate available, possibly source contains only a single frame, + // or this is the first frame to be rendered. Fall back to point sampling. + if (!p->fps.estimate) + return nearest(p, mix, params); + + // Silently disable interpolation if the ratio dips lower than the + // configured threshold + float ratio = fabs(p->fps.estimate / p->vps.estimate - 1.0); + if (ratio < params->interpolation_threshold) { + if (!p->threshold_frames) { + PL_INFO(p, "Detected fps ratio %.4f below threshold %.4f, " + "disabling interpolation", + ratio, params->interpolation_threshold); + } + + p->threshold_frames = THRESHOLD_FRAMES + 1; + return nearest(p, mix, params); + } else if (ratio < THRESHOLD_MAX_RATIO && p->threshold_frames > 1) { + p->threshold_frames--; + return nearest(p, mix, params); + } else { + if (p->threshold_frames) { + PL_INFO(p, "Detected fps ratio %.4f exceeds threshold %.4f, " + "re-enabling interpolation", + ratio, params->interpolation_threshold); + } + p->threshold_frames = 0; + } + + // No radius information, special case in which we only need the previous + // and next frames. + if (!params->radius) + return oversample(p, mix, params); + + pl_assert(p->fps.estimate && p->vps.estimate); + float radius = params->radius * fmaxf(1.0f, p->vps.estimate / p->fps.estimate); + double min_pts = params->pts - radius * p->fps.estimate, + max_pts = params->pts + radius * p->fps.estimate; + + enum pl_queue_status ret; + switch ((ret = advance(p, min_pts, params))) { + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + case PL_QUEUE_MORE: + goto done; + case PL_QUEUE_OK: + break; + } + + // Keep adding new frames until we've covered the range we care about + pl_assert(p->queue.num); + while (p->queue.elem[p->queue.num - 1]->pts < max_pts) { + switch ((ret = get_frame(p, params))) { + case PL_QUEUE_ERR: + return ret; + case PL_QUEUE_MORE: + goto done; + case PL_QUEUE_EOF:; + // Don't forward EOF until we've held the last frame for the + // desired ZOH hold duration + double last_pts = p->queue.elem[p->queue.num - 1]->pts; + if (last_pts && params->pts >= last_pts + p->fps.estimate) + return ret; + ret = PL_QUEUE_OK; + goto done; + case PL_QUEUE_OK: + continue; + } + } + + if (!entry_complete(p->queue.elem[p->queue.num - 1])) { + switch ((ret = get_frame(p, params))) { + case PL_QUEUE_MORE: + case PL_QUEUE_OK: + break; + case PL_QUEUE_ERR: + case PL_QUEUE_EOF: + return ret; + } + } + +done: ; + + if (!mix) + return PL_QUEUE_OK; + + // Construct a mix object representing the current queue state, starting at + // the last frame before `min_pts` to make sure there's a fallback frame + // available for ZOH semantics. 
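+ // For example, with radius = 2 and ~24 fps content on a ~60 Hz display,
+ // the resulting mix typically covers about five source frames, with
+ // timestamps (in units of the source frame duration) spanning roughly
+ // -2..+2 and a vsync_duration of roughly 0.4.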
+ p->tmp_sig.num = p->tmp_ts.num = p->tmp_frame.num = 0; + for (int i = 0; i < p->queue.num; i++) { + struct entry *entry = p->queue.elem[i]; + if (entry->pts > max_pts) + break; + if (!map_entry(p, entry)) + return PL_QUEUE_ERR; + float ts = (entry->pts - params->pts) / p->fps.estimate; + PL_ARRAY_APPEND(p, p->tmp_sig, entry->signature); + PL_ARRAY_APPEND(p, p->tmp_frame, &entry->frame); + PL_ARRAY_APPEND(p, p->tmp_ts, ts); + } + + *mix = (struct pl_frame_mix) { + .num_frames = p->tmp_frame.num, + .frames = p->tmp_frame.elem, + .signatures = p->tmp_sig.elem, + .timestamps = p->tmp_ts.elem, + .vsync_duration = p->vps.estimate / p->fps.estimate, + }; + + PL_TRACE(p, "Showing mix of %d frames for target PTS %f:", + mix->num_frames, params->pts); + for (int i = 0; i < mix->num_frames; i++) + PL_TRACE(p, " id %"PRIu64" ts %f", mix->signatures[i], mix->timestamps[i]); + + report_estimates(p); + return ret; +} + +static bool prefill(pl_queue p, const struct pl_queue_params *params) +{ + int min_frames = 2 * ceilf(params->radius); + if (p->fps.estimate && p->vps.estimate && p->vps.estimate <= 1.0f / MIN_FPS) + min_frames *= ceilf(p->vps.estimate / p->fps.estimate); + min_frames = PL_MAX(min_frames, PREFETCH_FRAMES); + + while (p->queue.num < min_frames) { + switch (get_frame(p, params)) { + case PL_QUEUE_ERR: + return false; + case PL_QUEUE_EOF: + case PL_QUEUE_MORE: + return true; + case PL_QUEUE_OK: + continue; + } + } + + // In the most likely case, the first few frames will all be required. So + // force-map them all to initialize GPU state on initial rendering. This is + // better than the alternative of missing the cache later, when timing is + // more relevant. + for (int i = 0; i < min_frames; i++) { + if (!map_entry(p, p->queue.elem[i])) + return false; + } + + return true; +} + +enum pl_queue_status pl_queue_update(pl_queue p, struct pl_frame_mix *out_mix, + const struct pl_queue_params *params) +{ + pl_mutex_lock(&p->lock_strong); + pl_mutex_lock(&p->lock_weak); + default_estimate(&p->vps, params->vsync_duration); + + float delta = params->pts - p->prev_pts; + if (delta < 0.0f) { + + // This is a backwards PTS jump. This is something we can handle + // semi-gracefully, but only if we haven't culled past the current + // frame yet. + if (p->queue.num && p->queue.elem[0]->pts > params->pts) { + PL_ERR(p, "Requested PTS %f is lower than the oldest frame " + "PTS %f. This is not supported, PTS must be monotonically " + "increasing! Please use `pl_queue_reset` to reset the frame " + "queue on discontinuous PTS jumps.", + params->pts, p->queue.elem[0]->pts); + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); + return PL_QUEUE_ERR; + } + + } else if (delta > 1.0f) { + + // A jump of more than a second is probably the result of a + // discontinuous jump after a suspend. To prevent this from exploding + // the FPS estimate, treat this as a new frame. 
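+ // (The interval is simply dropped rather than fed into the vsync
+ // duration estimator.)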
+ PL_TRACE(p, "Discontinuous target PTS jump %f -> %f, ignoring...", + p->prev_pts, params->pts); + + } else if (delta > 0) { + + update_estimate(&p->vps, params->pts - p->prev_pts); + + } + + p->prev_pts = params->pts; + + // As a special case, prefill the queue if this is the first frame + if (!params->pts && !p->queue.num) { + if (!prefill(p, params)) { + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); + return PL_QUEUE_ERR; + } + } + + // Ignore unrealistically high or low FPS, common near start of playback + static const float max_vsync = 1.0 / MIN_FPS; + static const float min_vsync = 1.0 / MAX_FPS; + bool estimation_ok = p->vps.estimate > min_vsync && p->vps.estimate < max_vsync; + enum pl_queue_status ret; + + if (estimation_ok || params->vsync_duration > 0) { + // We know the vsync duration, so construct an interpolation mix + ret = interpolate(p, out_mix, params); + } else { + // We don't know the vsync duration (yet), so just point-sample + ret = nearest(p, out_mix, params); + } + + pl_cond_signal(&p->wakeup); + pl_mutex_unlock(&p->lock_weak); + pl_mutex_unlock(&p->lock_strong); + return ret; +} + +float pl_queue_estimate_fps(pl_queue p) +{ + pl_mutex_lock(&p->lock_weak); + float estimate = p->fps.estimate; + pl_mutex_unlock(&p->lock_weak); + return estimate ? 1.0f / estimate : 0.0f; +} + +float pl_queue_estimate_vps(pl_queue p) +{ + pl_mutex_lock(&p->lock_weak); + float estimate = p->vps.estimate; + pl_mutex_unlock(&p->lock_weak); + return estimate ? 1.0f / estimate : 0.0f; +} + +int pl_queue_num_frames(pl_queue p) +{ + pl_mutex_lock(&p->lock_weak); + int count = p->queue.num; + pl_mutex_unlock(&p->lock_weak); + return count; +} + +bool pl_queue_peek(pl_queue p, int idx, struct pl_source_frame *out) +{ + pl_mutex_lock(&p->lock_weak); + bool ok = idx >= 0 && idx < p->queue.num; + if (ok) + *out = p->queue.elem[idx]->src; + pl_mutex_unlock(&p->lock_weak); + return ok; +} diff --git a/src/utils/upload.c b/src/utils/upload.c new file mode 100644 index 0000000..75bd4bb --- /dev/null +++ b/src/utils/upload.c @@ -0,0 +1,382 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "log.h" +#include "common.h" +#include "gpu.h" + +#include <libplacebo/utils/upload.h> + +#define MAX_COMPS 4 + +struct comp { + int order; // e.g. 
0, 1, 2, 3 for RGBA + int size; // size in bits + int shift; // bit-shift / offset in bits +}; + +static int compare_comp(const void *pa, const void *pb) +{ + const struct comp *a = pa, *b = pb; + + // Move all of the components with a size of 0 to the end, so they can + // be ignored outright + if (a->size && !b->size) + return -1; + if (b->size && !a->size) + return 1; + + // Otherwise, just compare based on the shift + return PL_CMP(a->shift, b->shift); +} + +void pl_plane_data_from_comps(struct pl_plane_data *data, int size[4], + int shift[4]) +{ + struct comp comps[MAX_COMPS]; + for (int i = 0; i < PL_ARRAY_SIZE(comps); i++) { + comps[i].order = i; + comps[i].size = size[i]; + comps[i].shift = shift[i]; + } + + // Sort the components by shift + qsort(comps, MAX_COMPS, sizeof(struct comp), compare_comp); + + // Generate the resulting component size/pad/map + int offset = 0; + for (int i = 0; i < MAX_COMPS; i++) { + if (comps[i].size) { + assert(comps[i].shift >= offset); + data->component_size[i] = comps[i].size; + data->component_pad[i] = comps[i].shift - offset; + data->component_map[i] = comps[i].order; + offset += data->component_size[i] + data->component_pad[i]; + } else { + // Clear the superfluous entries for sanity + data->component_size[i] = 0; + data->component_pad[i] = 0; + data->component_map[i] = 0; + } + } +} + +void pl_plane_data_from_mask(struct pl_plane_data *data, uint64_t mask[4]) +{ + int size[4]; + int shift[4]; + + for (int i = 0; i < PL_ARRAY_SIZE(size); i++) { + size[i] = __builtin_popcountll(mask[i]); + shift[i] = PL_MAX(0, __builtin_ffsll(mask[i]) - 1); + + // Sanity checking + uint64_t mask_reconstructed = (1LLU << size[i]) - 1; + mask_reconstructed <<= shift[i]; + pl_assert(mask_reconstructed == mask[i]); + } + + pl_plane_data_from_comps(data, size, shift); +} + +bool pl_plane_data_align(struct pl_plane_data *data, struct pl_bit_encoding *out_bits) +{ + struct pl_plane_data aligned = *data; + struct pl_bit_encoding bits = {0}; + + int offset = 0; + +#define SET_TEST(var, value) \ + do { \ + if (offset == 0) { \ + (var) = (value); \ + } else if ((var) != (value)) { \ + goto misaligned; \ + } \ + } while (0) + + for (int i = 0; i < MAX_COMPS; i++) { + if (!aligned.component_size[i]) + break; + + // Can't meaningfully align alpha channel, so just skip it. This is a + // limitation of the fact that `pl_bit_encoding` only applies to the + // main color channels, and changing this would be very nontrivial. + if (aligned.component_map[i] == PL_CHANNEL_A) + continue; + + // Color depth is the original component size, before alignment + SET_TEST(bits.color_depth, aligned.component_size[i]); + + // Try consuming padding of the current component to align down. This + // corresponds to an extra bit shift to the left. + int comp_start = offset + aligned.component_pad[i]; + int left_delta = comp_start - PL_ALIGN2(comp_start - 7, 8); + left_delta = PL_MIN(left_delta, aligned.component_pad[i]); + aligned.component_pad[i] -= left_delta; + aligned.component_size[i] += left_delta; + SET_TEST(bits.bit_shift, left_delta); + + // Try consuming padding of the next component to align up. This + // corresponds to simply ignoring some extra 0s on the end. 
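+ // (Taken together, a P010-style layout -- a 10-bit value stored in the
+ // top bits of a 16-bit word, i.e. 6 bits of padding followed by 10 bits
+ // of data -- resolves to color_depth = 10, sample_depth = 16 and
+ // bit_shift = 6.)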
+ int comp_end = comp_start + aligned.component_size[i] - left_delta; + int right_delta = PL_ALIGN2(comp_end, 8) - comp_end; + if (i+1 == MAX_COMPS || !aligned.component_size[i+1]) { + // This is the last component, so we can be greedy + aligned.component_size[i] += right_delta; + } else { + right_delta = PL_MIN(right_delta, aligned.component_pad[i+1]); + aligned.component_pad[i+1] -= right_delta; + aligned.component_size[i] += right_delta; + } + + // Sample depth is the new total component size, including padding + SET_TEST(bits.sample_depth, aligned.component_size[i]); + + offset += aligned.component_pad[i] + aligned.component_size[i]; + } + + // Easy sanity check, to make sure that we don't exceed the known stride + if (aligned.pixel_stride && offset > aligned.pixel_stride * 8) + goto misaligned; + + *data = aligned; + if (out_bits) + *out_bits = bits; + return true; + +misaligned: + // Can't properly align anything, so just do a no-op + if (out_bits) + *out_bits = (struct pl_bit_encoding) {0}; + return false; +} + +pl_fmt pl_plane_find_fmt(pl_gpu gpu, int out_map[4], const struct pl_plane_data *data) +{ + int dummy[4] = {0}; + out_map = PL_DEF(out_map, dummy); + + // Endian swapping requires compute shaders (currently) + if (data->swapped && !gpu->limits.max_ssbo_size) + return NULL; + + // Count the number of components and initialize out_map + int num = 0; + for (int i = 0; i < PL_ARRAY_SIZE(data->component_size); i++) { + out_map[i] = -1; + if (data->component_size[i]) + num = i+1; + } + + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt fmt = gpu->formats[n]; + if (fmt->opaque || fmt->num_components < num) + continue; + if (fmt->type != data->type || fmt->texel_size != data->pixel_stride) + continue; + if (!(fmt->caps & PL_FMT_CAP_SAMPLEABLE)) + continue; + + int idx = 0; + + // Try mapping all pl_plane_data components to texture components + for (int i = 0; i < num; i++) { + // If there's padding we have to map it to an unused physical + // component first + int pad = data->component_pad[i]; + if (pad && (idx >= 4 || fmt->host_bits[idx++] != pad)) + goto next_fmt; + + // Otherwise, try and match this component + int size = data->component_size[i]; + if (size && (idx >= 4 || fmt->host_bits[idx] != size)) + goto next_fmt; + out_map[idx++] = data->component_map[i]; + } + + // Reject misaligned formats, check this last to only log such errors + // if this is the only thing preventing a format from being used, as + // this is likely an issue in the API usage. + if (data->row_stride % fmt->texel_align) { + PL_WARN(gpu, "Rejecting texture format '%s' due to misalignment: " + "Row stride %zu is not a clean multiple of texel size %zu! " + "This is likely an API usage bug.", + fmt->name, data->row_stride, fmt->texel_align); + continue; + } + + return fmt; + +next_fmt: ; // acts as `continue` + } + + return NULL; +} + +bool pl_upload_plane(pl_gpu gpu, struct pl_plane *out_plane, + pl_tex *tex, const struct pl_plane_data *data) +{ + pl_assert(!data->buf ^ !data->pixels); // exactly one + + int out_map[4]; + pl_fmt fmt = pl_plane_find_fmt(gpu, out_map, data); + if (!fmt) { + PL_ERR(gpu, "Failed picking any compatible texture format for a plane!"); + return false; + + // TODO: try soft-converting to a supported format using e.g zimg? 
+ } + + bool ok = pl_tex_recreate(gpu, tex, pl_tex_params( + .w = data->width, + .h = data->height, + .format = fmt, + .sampleable = true, + .host_writable = true, + .blit_src = fmt->caps & PL_FMT_CAP_BLITTABLE, + )); + + if (!ok) { + PL_ERR(gpu, "Failed initializing plane texture!"); + return false; + } + + if (out_plane) { + out_plane->texture = *tex; + out_plane->components = 0; + for (int i = 0; i < PL_ARRAY_SIZE(out_map); i++) { + out_plane->component_mapping[i] = out_map[i]; + if (out_map[i] >= 0) + out_plane->components = i+1; + } + } + + struct pl_tex_transfer_params params = { + .tex = *tex, + .rc.x1 = data->width, // set these for `pl_tex_transfer_size` + .rc.y1 = data->height, + .rc.z1 = 1, + .row_pitch = PL_DEF(data->row_stride, data->width * fmt->texel_size), + .ptr = (void *) data->pixels, + .buf = data->buf, + .buf_offset = data->buf_offset, + .callback = data->callback, + .priv = data->priv, + }; + + pl_buf swapbuf = NULL; + if (data->swapped) { + const size_t aligned = PL_ALIGN2(pl_tex_transfer_size(¶ms), 4); + swapbuf = pl_buf_create(gpu, pl_buf_params( + .size = aligned, + .storable = true, + .initial_data = params.ptr, + + // Note: This may over-read from `ptr` if `ptr` is not aligned to a + // word boundary, but the extra texels will be ignored by + // `pl_tex_upload` so this UB should be a non-issue in practice. + )); + if (!swapbuf) { + PL_ERR(gpu, "Failed creating endian swapping buffer!"); + return false; + } + + struct pl_buf_copy_swap_params swap_params = { + .src = swapbuf, + .dst = swapbuf, + .size = aligned, + .wordsize = fmt->texel_size / fmt->num_components, + }; + + bool can_reuse = params.buf && params.buf->params.storable && + params.buf_offset % 4 == 0 && + params.buf_offset + aligned <= params.buf->params.size; + + if (params.ptr) { + // Data is already uploaded (no-op), can swap in-place + } else if (can_reuse) { + // We can sample directly from the source buffer + swap_params.src = params.buf; + swap_params.src_offset = params.buf_offset; + } else { + // We sadly need to do a second memcpy + assert(params.buf); + PL_TRACE(gpu, "Double-slow path! 
pl_buf_copy -> pl_buf_copy_swap..."); + pl_buf_copy(gpu, swapbuf, 0, params.buf, params.buf_offset, + PL_MIN(aligned, params.buf->params.size - params.buf_offset)); + } + + if (!pl_buf_copy_swap(gpu, &swap_params)) { + PL_ERR(gpu, "Failed swapping endianness!"); + pl_buf_destroy(gpu, &swapbuf); + return false; + } + + params.ptr = NULL; + params.buf = swapbuf; + params.buf_offset = 0; + } + + ok = pl_tex_upload(gpu, ¶ms); + pl_buf_destroy(gpu, &swapbuf); + return ok; +} + +bool pl_recreate_plane(pl_gpu gpu, struct pl_plane *out_plane, + pl_tex *tex, const struct pl_plane_data *data) +{ + if (data->swapped) { + PL_ERR(gpu, "Cannot call pl_recreate_plane on non-native endian plane " + "data, this is only supported for `pl_upload_plane`!"); + return false; + } + + int out_map[4]; + pl_fmt fmt = pl_plane_find_fmt(gpu, out_map, data); + if (!fmt) { + PL_ERR(gpu, "Failed picking any compatible texture format for a plane!"); + return false; + } + + bool ok = pl_tex_recreate(gpu, tex, pl_tex_params( + .w = data->width, + .h = data->height, + .format = fmt, + .renderable = true, + .host_readable = fmt->caps & PL_FMT_CAP_HOST_READABLE, + .blit_dst = fmt->caps & PL_FMT_CAP_BLITTABLE, + .storable = fmt->caps & PL_FMT_CAP_STORABLE, + )); + + if (!ok) { + PL_ERR(gpu, "Failed initializing plane texture!"); + return false; + } + + if (out_plane) { + out_plane->texture = *tex; + out_plane->components = 0; + for (int i = 0; i < PL_ARRAY_SIZE(out_map); i++) { + out_plane->component_mapping[i] = out_map[i]; + if (out_map[i] >= 0) + out_plane->components = i+1; + } + } + + return true; +} diff --git a/src/version.h.in b/src/version.h.in new file mode 100644 index 0000000..22bdee8 --- /dev/null +++ b/src/version.h.in @@ -0,0 +1 @@ +#define BUILD_VERSION "@buildver@" diff --git a/src/vulkan/command.c b/src/vulkan/command.c new file mode 100644 index 0000000..5020aff --- /dev/null +++ b/src/vulkan/command.c @@ -0,0 +1,571 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "command.h" +#include "utils.h" + +// returns VK_SUCCESS (completed), VK_TIMEOUT (not yet completed) or an error +static VkResult vk_cmd_poll(struct vk_cmd *cmd, uint64_t timeout) +{ + struct vk_ctx *vk = cmd->pool->vk; + return vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, + .semaphoreCount = 1, + .pSemaphores = &cmd->sync.sem, + .pValues = &cmd->sync.value, + }, timeout); +} + +static void flush_callbacks(struct vk_ctx *vk) +{ + while (vk->num_pending_callbacks) { + const struct vk_callback *cb = vk->pending_callbacks++; + vk->num_pending_callbacks--; + cb->run(cb->priv, cb->arg); + } +} + +static void vk_cmd_reset(struct vk_cmd *cmd) +{ + struct vk_ctx *vk = cmd->pool->vk; + + // Flush possible callbacks left over from a previous command still in the + // process of being reset, whose callback triggered this command being + // reset. + flush_callbacks(vk); + vk->pending_callbacks = cmd->callbacks.elem; + vk->num_pending_callbacks = cmd->callbacks.num; + flush_callbacks(vk); + + cmd->callbacks.num = 0; + cmd->deps.num = 0; + cmd->sigs.num = 0; +} + +static void vk_cmd_destroy(struct vk_cmd *cmd) +{ + if (!cmd) + return; + + struct vk_ctx *vk = cmd->pool->vk; + vk_cmd_poll(cmd, UINT64_MAX); + vk_cmd_reset(cmd); + vk->DestroySemaphore(vk->dev, cmd->sync.sem, PL_VK_ALLOC); + vk->FreeCommandBuffers(vk->dev, cmd->pool->pool, 1, &cmd->buf); + + pl_free(cmd); +} + +static struct vk_cmd *vk_cmd_create(struct vk_cmdpool *pool) +{ + struct vk_ctx *vk = pool->vk; + struct vk_cmd *cmd = pl_zalloc_ptr(NULL, cmd); + cmd->pool = pool; + + VkCommandBufferAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = pool->pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1, + }; + + VK(vk->AllocateCommandBuffers(vk->dev, &ainfo, &cmd->buf)); + + static const VkSemaphoreTypeCreateInfo stinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, + .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE, + .initialValue = 0, + }; + + static const VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &stinfo, + }; + + VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &cmd->sync.sem)); + PL_VK_NAME(SEMAPHORE, cmd->sync.sem, "cmd"); + + return cmd; + +error: + vk_cmd_destroy(cmd); + vk->failed = true; + return NULL; +} + +void vk_dev_callback(struct vk_ctx *vk, vk_cb callback, + const void *priv, const void *arg) +{ + pl_mutex_lock(&vk->lock); + if (vk->cmds_pending.num > 0) { + struct vk_cmd *last_cmd = vk->cmds_pending.elem[vk->cmds_pending.num - 1]; + vk_cmd_callback(last_cmd, callback, priv, arg); + } else { + // The device was already idle, so we can just immediately call it + callback((void *) priv, (void *) arg); + } + pl_mutex_unlock(&vk->lock); +} + +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, + const void *priv, const void *arg) +{ + PL_ARRAY_APPEND(cmd, cmd->callbacks, (struct vk_callback) { + .run = callback, + .priv = (void *) priv, + .arg = (void *) arg, + }); +} + +void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep) +{ + PL_ARRAY_APPEND(cmd, cmd->deps, (VkSemaphoreSubmitInfo) { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = dep.sem, + .value = dep.value, + .stageMask = stage, + }); +} + +void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig) +{ + VkSemaphoreSubmitInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + 
.semaphore = sig.sem, + .value = sig.value, + .stageMask = stage, + }; + + // Try updating existing semaphore signal operations in-place + for (int i = 0; i < cmd->sigs.num; i++) { + if (cmd->sigs.elem[i].semaphore == sig.sem) { + pl_assert(sig.value > cmd->sigs.elem[i].value); + cmd->sigs.elem[i] = sinfo; + return; + } + } + + PL_ARRAY_APPEND(cmd, cmd->sigs, sinfo); +} + +#define SET(FLAG, CHECK) \ + if (flags2 & (CHECK)) \ + flags |= FLAG + +static VkAccessFlags lower_access2(VkAccessFlags2 flags2) +{ + VkAccessFlags flags = flags2 & VK_ACCESS_FLAG_BITS_MAX_ENUM; + SET(VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_READ_BIT); + SET(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT); + return flags; +} + +static VkPipelineStageFlags lower_stage2(VkPipelineStageFlags2 flags2) +{ + VkPipelineStageFlags flags = flags2 & VK_PIPELINE_STAGE_FLAG_BITS_MAX_ENUM; + SET(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_2_COPY_BIT | + VK_PIPELINE_STAGE_2_RESOLVE_BIT | + VK_PIPELINE_STAGE_2_BLIT_BIT | + VK_PIPELINE_STAGE_2_CLEAR_BIT); + SET(VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT); + return flags; +} + +#undef SET + +void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info) +{ + struct vk_ctx *vk = cmd->pool->vk; + if (vk->CmdPipelineBarrier2KHR) { + vk->CmdPipelineBarrier2KHR(cmd->buf, info); + return; + } + + pl_assert(!info->pNext); + pl_assert(info->memoryBarrierCount == 0); + pl_assert(info->bufferMemoryBarrierCount + info->imageMemoryBarrierCount == 1); + + if (info->bufferMemoryBarrierCount) { + + const VkBufferMemoryBarrier2 *barr2 = info->pBufferMemoryBarriers; + const VkBufferMemoryBarrier barr = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = barr2->pNext, + .srcAccessMask = lower_access2(barr2->srcAccessMask), + .dstAccessMask = lower_access2(barr2->dstAccessMask), + .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex, + .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex, + .buffer = barr2->buffer, + .offset = barr2->offset, + .size = barr2->size, + }; + + vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask), + lower_stage2(barr2->dstStageMask), + info->dependencyFlags, + 0, NULL, 1, &barr, 0, NULL); + + } else { + + const VkImageMemoryBarrier2 *barr2 = info->pImageMemoryBarriers; + const VkImageMemoryBarrier barr = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = barr2->pNext, + .srcAccessMask = lower_access2(barr2->srcAccessMask), + .dstAccessMask = lower_access2(barr2->dstAccessMask), + .oldLayout = barr2->oldLayout, + .newLayout = barr2->newLayout, + .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex, + .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex, + .image = barr2->image, + .subresourceRange = barr2->subresourceRange, + }; + + vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask), + lower_stage2(barr2->dstStageMask), + info->dependencyFlags, + 0, NULL, 0, NULL, 1, &barr); + } +} + +struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem, + VkPipelineStageFlags2 stage, + VkAccessFlags2 access, bool is_trans) +{ + bool is_write = (access & vk_access_write) || is_trans; + + // Writes need to be synchronized against the last *read* (which is + // transitively synchronized against the last write), reads only + // need to be synchronized against the last write. 
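+ // (In hazard terms: a new write must order against prior reads and
+ // writes (WAR/WAW), whereas a new read only needs to order against the
+ // last write (RAW); read-after-read needs no synchronization at all.)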
+ struct vk_sync_scope last = sem->write; + if (is_write && sem->read.access) + last = sem->read; + + if (last.queue != cmd->queue) { + if (!is_write && sem->read.queue == cmd->queue) { + // No semaphore needed in this case because the implicit submission + // order execution dependencies already transitively imply a wait + // for the previous write + } else if (last.sync.sem) { + // Image barrier still needs to depend on this stage for implicit + // ordering guarantees to apply properly + vk_cmd_dep(cmd, stage, last.sync); + last.stage = stage; + } + + // Last access is on different queue, so no pipeline barrier needed + last.access = 0; + } + + if (!is_write && sem->read.queue == cmd->queue && + (sem->read.stage & stage) == stage && + (sem->read.access & access) == access) + { + // A past pipeline barrier already covers this access transitively, so + // we don't need to emit another pipeline barrier at all + last.access = 0; + } + + if (is_write) { + sem->write = (struct vk_sync_scope) { + .sync = cmd->sync, + .queue = cmd->queue, + .stage = stage, + .access = access, + }; + + sem->read = (struct vk_sync_scope) { + .sync = cmd->sync, + .queue = cmd->queue, + // no stage or access scope, because no reads happened yet + }; + } else if (sem->read.queue == cmd->queue) { + // Coalesce multiple same-queue reads into a single access scope + sem->read.sync = cmd->sync; + sem->read.stage |= stage; + sem->read.access |= access; + } else { + sem->read = (struct vk_sync_scope) { + .sync = cmd->sync, + .queue = cmd->queue, + .stage = stage, + .access = access, + }; + } + + // We never need to include pipeline barriers for reads, only writes + last.access &= vk_access_write; + return last; +} + +struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum, + VkQueueFamilyProperties props) +{ + struct vk_cmdpool *pool = pl_alloc_ptr(NULL, pool); + *pool = (struct vk_cmdpool) { + .vk = vk, + .props = props, + .qf = qf, + .queues = pl_calloc(pool, qnum, sizeof(VkQueue)), + .num_queues = qnum, + }; + + for (int n = 0; n < qnum; n++) + vk->GetDeviceQueue(vk->dev, qf, n, &pool->queues[n]); + + VkCommandPoolCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | + VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = qf, + }; + + VK(vk->CreateCommandPool(vk->dev, &cinfo, PL_VK_ALLOC, &pool->pool)); + return pool; + +error: + vk_cmdpool_destroy(pool); + vk->failed = true; + return NULL; +} + +void vk_cmdpool_destroy(struct vk_cmdpool *pool) +{ + if (!pool) + return; + + for (int i = 0; i < pool->cmds.num; i++) + vk_cmd_destroy(pool->cmds.elem[i]); + + struct vk_ctx *vk = pool->vk; + vk->DestroyCommandPool(vk->dev, pool->pool, PL_VK_ALLOC); + pl_free(pool); +} + +struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag) +{ + struct vk_ctx *vk = pool->vk; + + // Garbage collect the cmdpool first, to increase the chances of getting + // an already-available command buffer. 
+ vk_poll_commands(vk, 0); + + struct vk_cmd *cmd = NULL; + pl_mutex_lock(&vk->lock); + if (!PL_ARRAY_POP(pool->cmds, &cmd)) { + cmd = vk_cmd_create(pool); + if (!cmd) { + pl_mutex_unlock(&vk->lock); + goto error; + } + } + + cmd->qindex = pool->idx_queues; + cmd->queue = pool->queues[cmd->qindex]; + pl_mutex_unlock(&vk->lock); + + VkCommandBufferBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + }; + + VK(vk->BeginCommandBuffer(cmd->buf, &binfo)); + + debug_tag = PL_DEF(debug_tag, "vk_cmd"); + PL_VK_NAME_HANDLE(COMMAND_BUFFER, cmd->buf, debug_tag); + PL_VK_NAME(SEMAPHORE, cmd->sync.sem, debug_tag); + + cmd->sync.value++; + vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, cmd->sync); + return cmd; + +error: + // Something has to be seriously messed up if we get to this point + vk_cmd_destroy(cmd); + vk->failed = true; + return NULL; +} + +static VkResult vk_queue_submit2(struct vk_ctx *vk, VkQueue queue, + const VkSubmitInfo2 *info2, VkFence fence) +{ + if (vk->QueueSubmit2KHR) + return vk->QueueSubmit2KHR(queue, 1, info2, fence); + + const uint32_t num_deps = info2->waitSemaphoreInfoCount; + const uint32_t num_sigs = info2->signalSemaphoreInfoCount; + const uint32_t num_cmds = info2->commandBufferInfoCount; + + void *tmp = pl_tmp(NULL); + VkSemaphore *deps = pl_calloc_ptr(tmp, num_deps, deps); + VkPipelineStageFlags *masks = pl_calloc_ptr(tmp, num_deps, masks); + uint64_t *depvals = pl_calloc_ptr(tmp, num_deps, depvals); + VkSemaphore *sigs = pl_calloc_ptr(tmp, num_sigs, sigs); + uint64_t *sigvals = pl_calloc_ptr(tmp, num_sigs, sigvals); + VkCommandBuffer *cmds = pl_calloc_ptr(tmp, num_cmds, cmds); + + for (int i = 0; i < num_deps; i++) { + deps[i] = info2->pWaitSemaphoreInfos[i].semaphore; + masks[i] = info2->pWaitSemaphoreInfos[i].stageMask; + depvals[i] = info2->pWaitSemaphoreInfos[i].value; + } + for (int i = 0; i < num_sigs; i++) { + sigs[i] = info2->pSignalSemaphoreInfos[i].semaphore; + sigvals[i] = info2->pSignalSemaphoreInfos[i].value; + } + for (int i = 0; i < num_cmds; i++) + cmds[i] = info2->pCommandBufferInfos[i].commandBuffer; + + const VkTimelineSemaphoreSubmitInfo tinfo = { + .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO, + .pNext = info2->pNext, + .waitSemaphoreValueCount = num_deps, + .pWaitSemaphoreValues = depvals, + .signalSemaphoreValueCount = num_sigs, + .pSignalSemaphoreValues = sigvals, + }; + + const VkSubmitInfo info = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .pNext = &tinfo, + .waitSemaphoreCount = num_deps, + .pWaitSemaphores = deps, + .pWaitDstStageMask = masks, + .commandBufferCount = num_cmds, + .pCommandBuffers = cmds, + .signalSemaphoreCount = num_sigs, + .pSignalSemaphores = sigs, + }; + + VkResult res = vk->QueueSubmit(queue, 1, &info, fence); + pl_free(tmp); + return res; +} + +bool vk_cmd_submit(struct vk_cmd **pcmd) +{ + struct vk_cmd *cmd = *pcmd; + if (!cmd) + return true; + + *pcmd = NULL; + struct vk_cmdpool *pool = cmd->pool; + struct vk_ctx *vk = pool->vk; + + VK(vk->EndCommandBuffer(cmd->buf)); + + VkSubmitInfo2 sinfo = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2, + .waitSemaphoreInfoCount = cmd->deps.num, + .pWaitSemaphoreInfos = cmd->deps.elem, + .signalSemaphoreInfoCount = cmd->sigs.num, + .pSignalSemaphoreInfos = cmd->sigs.elem, + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &(VkCommandBufferSubmitInfo) { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = cmd->buf, + }, + }; + + if (pl_msg_test(vk->log, 
PL_LOG_TRACE)) { + PL_TRACE(vk, "Submitting command %p on queue %p (QF %d):", + (void *) cmd->buf, (void *) cmd->queue, pool->qf); + for (int n = 0; n < cmd->deps.num; n++) { + PL_TRACE(vk, " waits on semaphore 0x%"PRIx64" = %"PRIu64, + (uint64_t) cmd->deps.elem[n].semaphore, cmd->deps.elem[n].value); + } + for (int n = 0; n < cmd->sigs.num; n++) { + PL_TRACE(vk, " signals semaphore 0x%"PRIx64" = %"PRIu64, + (uint64_t) cmd->sigs.elem[n].semaphore, cmd->sigs.elem[n].value); + } + if (cmd->callbacks.num) + PL_TRACE(vk, " signals %d callbacks", cmd->callbacks.num); + } + + vk->lock_queue(vk->queue_ctx, pool->qf, cmd->qindex); + VkResult res = vk_queue_submit2(vk, cmd->queue, &sinfo, VK_NULL_HANDLE); + vk->unlock_queue(vk->queue_ctx, pool->qf, cmd->qindex); + PL_VK_ASSERT(res, "vkQueueSubmit2"); + + pl_mutex_lock(&vk->lock); + PL_ARRAY_APPEND(vk->alloc, vk->cmds_pending, cmd); + pl_mutex_unlock(&vk->lock); + return true; + +error: + vk_cmd_reset(cmd); + pl_mutex_lock(&vk->lock); + PL_ARRAY_APPEND(pool, pool->cmds, cmd); + pl_mutex_unlock(&vk->lock); + vk->failed = true; + return false; +} + +bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout) +{ + bool ret = false; + pl_mutex_lock(&vk->lock); + + while (vk->cmds_pending.num) { + struct vk_cmd *cmd = vk->cmds_pending.elem[0]; + struct vk_cmdpool *pool = cmd->pool; + pl_mutex_unlock(&vk->lock); // don't hold mutex while blocking + if (vk_cmd_poll(cmd, timeout) == VK_TIMEOUT) + return ret; + pl_mutex_lock(&vk->lock); + if (!vk->cmds_pending.num || vk->cmds_pending.elem[0] != cmd) + continue; // another thread modified this state while blocking + + PL_TRACE(vk, "VkSemaphore signalled: 0x%"PRIx64" = %"PRIu64, + (uint64_t) cmd->sync.sem, cmd->sync.value); + PL_ARRAY_REMOVE_AT(vk->cmds_pending, 0); // remove before callbacks + vk_cmd_reset(cmd); + PL_ARRAY_APPEND(pool, pool->cmds, cmd); + ret = true; + + // If we've successfully spent some time waiting for at least one + // command, disable the timeout. This has the dual purpose of both + // making sure we don't over-wait due to repeat timeout application, + // but also makes sure we don't block on future commands if we've + // already spend time waiting for one. + timeout = 0; + } + + pl_mutex_unlock(&vk->lock); + return ret; +} + +void vk_rotate_queues(struct vk_ctx *vk) +{ + pl_mutex_lock(&vk->lock); + + // Rotate the queues to ensure good parallelism across frames + for (int i = 0; i < vk->pools.num; i++) { + struct vk_cmdpool *pool = vk->pools.elem[i]; + pool->idx_queues = (pool->idx_queues + 1) % pool->num_queues; + PL_TRACE(vk, "QF %d: %d/%d", pool->qf, pool->idx_queues, pool->num_queues); + } + + pl_mutex_unlock(&vk->lock); +} + +void vk_wait_idle(struct vk_ctx *vk) +{ + while (vk_poll_commands(vk, UINT64_MAX)) ; +} diff --git a/src/vulkan/command.h b/src/vulkan/command.h new file mode 100644 index 0000000..4c70482 --- /dev/null +++ b/src/vulkan/command.h @@ -0,0 +1,142 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once +#include "common.h" + +// Since lots of vulkan operations need to be done lazily once the affected +// resources are no longer in use, provide an abstraction for tracking these. +// In practice, these are only checked and run when submitting new commands, so +// the actual execution may be delayed by a frame. +typedef void (*vk_cb)(void *p, void *arg); + +struct vk_callback { + vk_cb run; + void *priv; + void *arg; +}; + +// Associate a callback with the completion of all currently pending commands. +// This will essentially run once the device is completely idle. +void vk_dev_callback(struct vk_ctx *vk, vk_cb callback, + const void *priv, const void *arg); + +// Helper wrapper around command buffers that also track dependencies, +// callbacks and synchronization primitives +// +// Thread-safety: Unsafe +struct vk_cmd { + struct vk_cmdpool *pool; // pool it was allocated from + pl_vulkan_sem sync; // pending execution, tied to lifetime of device + VkQueue queue; // the submission queue (for recording/pending) + int qindex; // the index of `queue` in `pool` + VkCommandBuffer buf; // the command buffer itself + // Command dependencies and signals. Not owned by the vk_cmd. + PL_ARRAY(VkSemaphoreSubmitInfo) deps; + PL_ARRAY(VkSemaphoreSubmitInfo) sigs; + // "Callbacks" to fire once a command completes. These are used for + // multiple purposes, ranging from resource deallocation to fencing. + PL_ARRAY(struct vk_callback) callbacks; +}; + +// Associate a callback with the completion of the current command. This +// function will be run once the command completes, or shortly thereafter. +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, + const void *priv, const void *arg); + +// Associate a raw dependency for the current command. This semaphore must +// signal by the corresponding stage before the command may execute. +void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep); + +// Associate a raw signal with the current command. This semaphore will signal +// after the given stage completes. +void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig); + +// Compatibility wrappers for vkCmdPipelineBarrier2 (works with pre-1.3) +void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info); + +// Synchronization scope +struct vk_sync_scope { + pl_vulkan_sem sync; // semaphore of last access + VkQueue queue; // source queue of last access + VkPipelineStageFlags2 stage;// stage bitmask of last access + VkAccessFlags2 access; // access type bitmask +}; + +// Synchronization primitive +struct vk_sem { + struct vk_sync_scope read, write; +}; + +// Updates the `vk_sem` state for a given access. If `is_trans` is set, this +// access is treated as a write (since it alters the resource's state). +// +// Returns a struct describing the previous access to a resource. A pipeline +// barrier is only required if the previous access scope is nonzero. 
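+//
+// For illustration (a sketch, not a prescribed pattern): a caller recording a
+// transfer write against some resource `res` (a placeholder) guarded by a
+// vk_sem might do:
+//
+//     struct vk_sync_scope last = vk_sem_barrier(cmd, &res->sem,
+//             VK_PIPELINE_STAGE_2_COPY_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT,
+//             false);
+//     if (last.access) {
+//         // record a VkMemoryBarrier2 / VkImageMemoryBarrier2 whose
+//         // srcStageMask/srcAccessMask are taken from `last`, via
+//         // vk_cmd_barrier(), before recording the write itself
+//     }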
+struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem, + VkPipelineStageFlags2 stage, + VkAccessFlags2 access, bool is_trans); + +// Command pool / queue family hybrid abstraction +struct vk_cmdpool { + struct vk_ctx *vk; + VkQueueFamilyProperties props; + int qf; // queue family index + VkCommandPool pool; + VkQueue *queues; + int num_queues; + int idx_queues; + // Command buffers associated with this queue. These are available for + // re-recording + PL_ARRAY(struct vk_cmd *) cmds; +}; + +// Set up a vk_cmdpool corresponding to a queue family. `qnum` may be less than +// `props.queueCount`, to restrict the number of queues in this queue family. +struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum, + VkQueueFamilyProperties props); + +void vk_cmdpool_destroy(struct vk_cmdpool *pool); + +// Fetch a command buffer from a command pool and begin recording to it. +// Returns NULL on failure. +struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag); + +// Finish recording a command buffer and submit it for execution. This function +// takes over ownership of **cmd, and sets *cmd to NULL in doing so. +bool vk_cmd_submit(struct vk_cmd **cmd); + +// Block until some commands complete executing. This is the only function that +// actually processes the callbacks. Will wait at most `timeout` nanoseconds +// for the completion of any command. The timeout may also be passed as 0, in +// which case this function will not block, but only poll for completed +// commands. Returns whether any forward progress was made. +// +// This does *not* flush any queued commands, forgetting to do so may result +// in infinite loops if waiting for the completion of callbacks that were +// never flushed! +bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout); + +// Rotate through queues in each command pool. Call this once per frame, after +// submitting all of the command buffers for that frame. Calling this more +// often than that is possible but bad for performance. +void vk_rotate_queues(struct vk_ctx *vk); + +// Wait until all commands are complete, i.e. the device is idle. This is +// basically equivalent to calling `vk_poll_commands` with a timeout of +// UINT64_MAX until it returns `false`. +void vk_wait_idle(struct vk_ctx *vk); diff --git a/src/vulkan/common.h b/src/vulkan/common.h new file mode 100644 index 0000000..31b309e --- /dev/null +++ b/src/vulkan/common.h @@ -0,0 +1,234 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#define VK_NO_PROTOTYPES +#define VK_ENABLE_BETA_EXTENSIONS // for VK_KHR_portability_subset +#define VK_USE_PLATFORM_METAL_EXT + +#include "../common.h" +#include "../log.h" +#include "../pl_thread.h" + +#include <libplacebo/vulkan.h> + +#ifdef PL_HAVE_WIN32 +#include <windows.h> +#include <vulkan/vulkan_win32.h> +#endif + +// Vulkan allows the optional use of a custom allocator. We don't need one but +// mark this parameter with a better name in case we ever decide to change this +// in the future. (And to make the code more readable) +#define PL_VK_ALLOC NULL + +// Type of a vulkan function that needs to be loaded +#define PL_VK_FUN(name) PFN_vk##name name + +// Load a vulkan instance-level extension function directly (on the stack) +#define PL_VK_LOAD_FUN(inst, name, get_addr) \ + PL_VK_FUN(name) = (PFN_vk##name) get_addr(inst, "vk" #name); + +#ifndef VK_VENDOR_ID_NVIDIA +#define VK_VENDOR_ID_NVIDIA 0x10DE +#endif + +// Shared struct used to hold vulkan context information +struct vk_ctx { + pl_mutex lock; + pl_vulkan vulkan; + void *alloc; // host allocations bound to the lifetime of this vk_ctx + struct vk_malloc *ma; // VRAM malloc layer + pl_vk_inst internal_instance; + pl_log log; + VkInstance inst; + VkPhysicalDevice physd; + VkPhysicalDeviceProperties props; + VkPhysicalDeviceFeatures2 features; + uint32_t api_ver; // device API version + VkDevice dev; + bool imported; // device was not created by us + + // Generic error flag for catching "failed" devices + bool failed; + + // Enabled extensions + PL_ARRAY(const char *) exts; + + // Command pools (one per queue family) + PL_ARRAY(struct vk_cmdpool *) pools; + + // Pointers into `pools` (always set) + struct vk_cmdpool *pool_graphics; + struct vk_cmdpool *pool_compute; + struct vk_cmdpool *pool_transfer; + + // Queue locking functions + PL_ARRAY(PL_ARRAY(pl_mutex)) queue_locks; + void (*lock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx); + void (*unlock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx); + void *queue_ctx; + + // Pending commands. 
These are shared for the entire mpvk_ctx to ensure + // submission and callbacks are FIFO + PL_ARRAY(struct vk_cmd *) cmds_pending; // submitted but not completed + + // Pending callbacks that still need to be drained before processing + // callbacks for the next command (in case commands are recursively being + // polled from another callback) + const struct vk_callback *pending_callbacks; + int num_pending_callbacks; + + // Instance-level function pointers + PL_VK_FUN(CreateDevice); + PL_VK_FUN(EnumerateDeviceExtensionProperties); + PL_VK_FUN(GetDeviceProcAddr); + PL_VK_FUN(GetInstanceProcAddr); + PL_VK_FUN(GetPhysicalDeviceExternalBufferProperties); + PL_VK_FUN(GetPhysicalDeviceExternalSemaphoreProperties); + PL_VK_FUN(GetPhysicalDeviceFeatures2KHR); + PL_VK_FUN(GetPhysicalDeviceFormatProperties); + PL_VK_FUN(GetPhysicalDeviceFormatProperties2KHR); + PL_VK_FUN(GetPhysicalDeviceImageFormatProperties2KHR); + PL_VK_FUN(GetPhysicalDeviceMemoryProperties); + PL_VK_FUN(GetPhysicalDeviceProperties); + PL_VK_FUN(GetPhysicalDeviceProperties2); + PL_VK_FUN(GetPhysicalDeviceQueueFamilyProperties); + PL_VK_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR); + PL_VK_FUN(GetPhysicalDeviceSurfaceFormatsKHR); + PL_VK_FUN(GetPhysicalDeviceSurfacePresentModesKHR); + PL_VK_FUN(GetPhysicalDeviceSurfaceSupportKHR); + + // Device-level function pointers + PL_VK_FUN(AcquireNextImageKHR); + PL_VK_FUN(AllocateCommandBuffers); + PL_VK_FUN(AllocateDescriptorSets); + PL_VK_FUN(AllocateMemory); + PL_VK_FUN(BeginCommandBuffer); + PL_VK_FUN(BindBufferMemory); + PL_VK_FUN(BindImageMemory); + PL_VK_FUN(CmdBeginDebugUtilsLabelEXT); + PL_VK_FUN(CmdBeginRenderPass); + PL_VK_FUN(CmdBindDescriptorSets); + PL_VK_FUN(CmdBindIndexBuffer); + PL_VK_FUN(CmdBindPipeline); + PL_VK_FUN(CmdBindVertexBuffers); + PL_VK_FUN(CmdBlitImage); + PL_VK_FUN(CmdClearColorImage); + PL_VK_FUN(CmdCopyBuffer); + PL_VK_FUN(CmdCopyBufferToImage); + PL_VK_FUN(CmdCopyImage); + PL_VK_FUN(CmdCopyImageToBuffer); + PL_VK_FUN(CmdDispatch); + PL_VK_FUN(CmdDraw); + PL_VK_FUN(CmdDrawIndexed); + PL_VK_FUN(CmdEndDebugUtilsLabelEXT); + PL_VK_FUN(CmdEndRenderPass); + PL_VK_FUN(CmdPipelineBarrier); + PL_VK_FUN(CmdPipelineBarrier2KHR); + PL_VK_FUN(CmdPushConstants); + PL_VK_FUN(CmdPushDescriptorSetKHR); + PL_VK_FUN(CmdResetQueryPool); + PL_VK_FUN(CmdSetScissor); + PL_VK_FUN(CmdSetViewport); + PL_VK_FUN(CmdUpdateBuffer); + PL_VK_FUN(CmdWriteTimestamp); + PL_VK_FUN(CreateBuffer); + PL_VK_FUN(CreateBufferView); + PL_VK_FUN(CreateCommandPool); + PL_VK_FUN(CreateComputePipelines); + PL_VK_FUN(CreateDebugReportCallbackEXT); + PL_VK_FUN(CreateDescriptorPool); + PL_VK_FUN(CreateDescriptorSetLayout); + PL_VK_FUN(CreateFence); + PL_VK_FUN(CreateFramebuffer); + PL_VK_FUN(CreateGraphicsPipelines); + PL_VK_FUN(CreateImage); + PL_VK_FUN(CreateImageView); + PL_VK_FUN(CreatePipelineCache); + PL_VK_FUN(CreatePipelineLayout); + PL_VK_FUN(CreateQueryPool); + PL_VK_FUN(CreateRenderPass); + PL_VK_FUN(CreateSampler); + PL_VK_FUN(CreateSemaphore); + PL_VK_FUN(CreateShaderModule); + PL_VK_FUN(CreateSwapchainKHR); + PL_VK_FUN(DestroyBuffer); + PL_VK_FUN(DestroyBufferView); + PL_VK_FUN(DestroyCommandPool); + PL_VK_FUN(DestroyDebugReportCallbackEXT); + PL_VK_FUN(DestroyDescriptorPool); + PL_VK_FUN(DestroyDescriptorSetLayout); + PL_VK_FUN(DestroyDevice); + PL_VK_FUN(DestroyFence); + PL_VK_FUN(DestroyFramebuffer); + PL_VK_FUN(DestroyImage); + PL_VK_FUN(DestroyImageView); + PL_VK_FUN(DestroyInstance); + PL_VK_FUN(DestroyPipeline); + PL_VK_FUN(DestroyPipelineCache); + PL_VK_FUN(DestroyPipelineLayout); 
+ PL_VK_FUN(DestroyQueryPool); + PL_VK_FUN(DestroyRenderPass); + PL_VK_FUN(DestroySampler); + PL_VK_FUN(DestroySemaphore); + PL_VK_FUN(DestroyShaderModule); + PL_VK_FUN(DestroySwapchainKHR); + PL_VK_FUN(DeviceWaitIdle); + PL_VK_FUN(EndCommandBuffer); + PL_VK_FUN(FlushMappedMemoryRanges); + PL_VK_FUN(FreeCommandBuffers); + PL_VK_FUN(FreeMemory); + PL_VK_FUN(GetBufferMemoryRequirements); + PL_VK_FUN(GetDeviceQueue); + PL_VK_FUN(GetImageDrmFormatModifierPropertiesEXT); + PL_VK_FUN(GetImageMemoryRequirements2); + PL_VK_FUN(GetImageSubresourceLayout); + PL_VK_FUN(GetMemoryFdKHR); + PL_VK_FUN(GetMemoryFdPropertiesKHR); + PL_VK_FUN(GetMemoryHostPointerPropertiesEXT); + PL_VK_FUN(GetPipelineCacheData); + PL_VK_FUN(GetQueryPoolResults); + PL_VK_FUN(GetSemaphoreFdKHR); + PL_VK_FUN(GetSwapchainImagesKHR); + PL_VK_FUN(InvalidateMappedMemoryRanges); + PL_VK_FUN(MapMemory); + PL_VK_FUN(QueuePresentKHR); + PL_VK_FUN(QueueSubmit); + PL_VK_FUN(QueueSubmit2KHR); + PL_VK_FUN(QueueWaitIdle); + PL_VK_FUN(ResetFences); + PL_VK_FUN(ResetQueryPool); + PL_VK_FUN(SetDebugUtilsObjectNameEXT); + PL_VK_FUN(SetHdrMetadataEXT); + PL_VK_FUN(UpdateDescriptorSets); + PL_VK_FUN(WaitForFences); + PL_VK_FUN(WaitSemaphores); + +#ifdef PL_HAVE_WIN32 + PL_VK_FUN(GetMemoryWin32HandleKHR); + PL_VK_FUN(GetSemaphoreWin32HandleKHR); +#endif + +#ifdef VK_EXT_metal_objects + PL_VK_FUN(ExportMetalObjectsEXT); +#endif +#ifdef VK_EXT_full_screen_exclusive + PL_VK_FUN(AcquireFullScreenExclusiveModeEXT); +#endif +}; diff --git a/src/vulkan/context.c b/src/vulkan/context.c new file mode 100644 index 0000000..ad8a859 --- /dev/null +++ b/src/vulkan/context.c @@ -0,0 +1,1704 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" +#include "command.h" +#include "utils.h" +#include "gpu.h" + +#ifdef PL_HAVE_VK_PROC_ADDR +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetInstanceProcAddr( + VkInstance instance, + const char* pName); +#endif + +const struct pl_vk_inst_params pl_vk_inst_default_params = {0}; + +struct vk_fun { + const char *name; + size_t offset; + bool device_level; +}; + +struct vk_ext { + const char *name; + const struct vk_fun *funs; +}; + +#define PL_VK_INST_FUN(N) \ + { .name = "vk" #N, \ + .offset = offsetof(struct vk_ctx, N), \ + } + +#define PL_VK_DEV_FUN(N) \ + { .name = "vk" #N, \ + .offset = offsetof(struct vk_ctx, N), \ + .device_level = true, \ + } + +// Table of optional vulkan instance extensions +static const char *vk_instance_extensions[] = { + VK_KHR_SURFACE_EXTENSION_NAME, + VK_EXT_SWAPCHAIN_COLOR_SPACE_EXTENSION_NAME, + VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME, + VK_KHR_GET_SURFACE_CAPABILITIES_2_EXTENSION_NAME, +}; + +// List of mandatory instance-level function pointers, including functions +// associated with mandatory instance extensions +static const struct vk_fun vk_inst_funs[] = { + PL_VK_INST_FUN(CreateDevice), + PL_VK_INST_FUN(EnumerateDeviceExtensionProperties), + PL_VK_INST_FUN(GetDeviceProcAddr), + PL_VK_INST_FUN(GetPhysicalDeviceExternalBufferProperties), + PL_VK_INST_FUN(GetPhysicalDeviceExternalSemaphoreProperties), + PL_VK_INST_FUN(GetPhysicalDeviceFeatures2KHR), + PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties), + PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties2KHR), + PL_VK_INST_FUN(GetPhysicalDeviceImageFormatProperties2KHR), + PL_VK_INST_FUN(GetPhysicalDeviceMemoryProperties), + PL_VK_INST_FUN(GetPhysicalDeviceProperties), + PL_VK_INST_FUN(GetPhysicalDeviceProperties2), + PL_VK_INST_FUN(GetPhysicalDeviceQueueFamilyProperties), + + // These are not actually mandatory, but they're universal enough that we + // just load them unconditionally (in lieu of not having proper support for + // loading arbitrary instance extensions). Their use is generally guarded + // behind various VkSurfaceKHR values already being provided by the API + // user (implying this extension is loaded). 
+ PL_VK_INST_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR), + PL_VK_INST_FUN(GetPhysicalDeviceSurfaceFormatsKHR), + PL_VK_INST_FUN(GetPhysicalDeviceSurfacePresentModesKHR), + PL_VK_INST_FUN(GetPhysicalDeviceSurfaceSupportKHR), +}; + +// Table of vulkan device extensions and functions they load, including +// functions exported by dependent instance-level extensions +static const struct vk_ext vk_device_extensions[] = { + { + .name = VK_KHR_SWAPCHAIN_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(AcquireNextImageKHR), + PL_VK_DEV_FUN(CreateSwapchainKHR), + PL_VK_DEV_FUN(DestroySwapchainKHR), + PL_VK_DEV_FUN(GetSwapchainImagesKHR), + PL_VK_DEV_FUN(QueuePresentKHR), + {0} + }, + }, { + .name = VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(CmdPushDescriptorSetKHR), + {0} + }, + }, { + .name = VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryFdKHR), + {0} + }, + }, { + .name = VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryFdPropertiesKHR), + {0} + }, +#ifdef PL_HAVE_WIN32 + }, { + .name = VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryWin32HandleKHR), + {0} + }, +#endif + }, { + .name = VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryHostPointerPropertiesEXT), + {0} + }, + }, { + .name = VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetSemaphoreFdKHR), + {0} + }, +#ifdef PL_HAVE_WIN32 + }, { + .name = VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetSemaphoreWin32HandleKHR), + {0} + }, +#endif + }, { + .name = VK_EXT_PCI_BUS_INFO_EXTENSION_NAME, + }, { + .name = VK_EXT_HDR_METADATA_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(SetHdrMetadataEXT), + {0} + }, + }, { + .name = VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetImageDrmFormatModifierPropertiesEXT), + {0} + }, +#ifdef VK_KHR_portability_subset + }, { + .name = VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, +#endif +#ifdef VK_EXT_metal_objects + }, { + .name = VK_EXT_METAL_OBJECTS_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(ExportMetalObjectsEXT), + {0} + }, +#endif +#ifdef VK_EXT_full_screen_exclusive + }, { + .name = VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(AcquireFullScreenExclusiveModeEXT), + {0} + }, +#endif + }, { + .name = VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(CmdPipelineBarrier2KHR), + PL_VK_DEV_FUN(QueueSubmit2KHR), + {0} + }, + }, +}; + +// Make sure to keep this in sync with the above! 
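// [Editorial note: illustrative sketch, not part of the upstream patch.]
// API users who create the VkDevice themselves (e.g. for pl_vulkan_import())
// can use this list as a starting point, filtering it against the extensions
// the physical device actually reports, roughly along these lines:
//
//     VkDeviceCreateInfo dinfo = {
//         .sType                   = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
//         .ppEnabledExtensionNames = pl_vulkan_recommended_extensions,
//         .enabledExtensionCount   = (uint32_t) pl_vulkan_num_recommended_extensions,
//         // ... queue create infos, features, etc.
//     };
//
// The list of extensions the device actually ends up created with should then
// be passed back through pl_vulkan_import_params.extensions, so that the
// matching function pointers get loaded (see pl_vulkan_import() below).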
+const char * const pl_vulkan_recommended_extensions[] = { + VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, + VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME, + VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME, + VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME, +#ifdef PL_HAVE_WIN32 + VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME, +#endif + VK_EXT_PCI_BUS_INFO_EXTENSION_NAME, + VK_EXT_HDR_METADATA_EXTENSION_NAME, + VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME, +#ifdef VK_KHR_portability_subset + VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, +#endif +#ifdef VK_EXT_metal_objects + VK_EXT_METAL_OBJECTS_EXTENSION_NAME, +#endif +#ifdef VK_EXT_full_screen_exclusive + VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME, +#endif + VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME, +}; + +const int pl_vulkan_num_recommended_extensions = + PL_ARRAY_SIZE(pl_vulkan_recommended_extensions); + +// +1 because VK_KHR_swapchain is not automatically pulled in +static_assert(PL_ARRAY_SIZE(pl_vulkan_recommended_extensions) + 1 == + PL_ARRAY_SIZE(vk_device_extensions), + "pl_vulkan_recommended_extensions out of sync with " + "vk_device_extensions?"); + +// Recommended features; keep in sync with libavutil vulkan hwcontext +static const VkPhysicalDeviceVulkan13Features recommended_vk13 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES, + .computeFullSubgroups = true, + .maintenance4 = true, + .shaderZeroInitializeWorkgroupMemory = true, + .synchronization2 = true, +}; + +static const VkPhysicalDeviceVulkan12Features recommended_vk12 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + .pNext = (void *) &recommended_vk13, + .bufferDeviceAddress = true, + .storagePushConstant8 = true, + .shaderInt8 = true, + .shaderFloat16 = true, + .shaderSharedInt64Atomics = true, + .storageBuffer8BitAccess = true, + .uniformAndStorageBuffer8BitAccess = true, + .vulkanMemoryModel = true, + .vulkanMemoryModelDeviceScope = true, +}; + +static const VkPhysicalDeviceVulkan11Features recommended_vk11 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, + .pNext = (void *) &recommended_vk12, + .samplerYcbcrConversion = true, + .storagePushConstant16 = true, +}; + +const VkPhysicalDeviceFeatures2 pl_vulkan_recommended_features = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, + .pNext = (void *) &recommended_vk11, + .features = { + .shaderImageGatherExtended = true, + .shaderStorageImageReadWithoutFormat = true, + .shaderStorageImageWriteWithoutFormat = true, + + // Needed for GPU-assisted validation, but not harmful to enable + .fragmentStoresAndAtomics = true, + .vertexPipelineStoresAndAtomics = true, + .shaderInt64 = true, + } +}; + +// Required features +static const VkPhysicalDeviceVulkan12Features required_vk12 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + .hostQueryReset = true, + .timelineSemaphore = true, +}; + +static const VkPhysicalDeviceVulkan11Features required_vk11 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, + .pNext = (void *) &required_vk12, +}; + +const VkPhysicalDeviceFeatures2 pl_vulkan_required_features = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, + .pNext = (void *) &required_vk11, +}; + +static bool check_required_features(struct vk_ctx *vk) +{ + #define CHECK_FEATURE(maj, min, feat) do { \ + const VkPhysicalDeviceVulkan##maj##min##Features *f; \ + f = vk_find_struct(&vk->features, \ + 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_##maj##_##min##_FEATURES); \ + if (!f || !f->feat) { \ + PL_ERR(vk, "Missing device feature: " #feat); \ + return false; \ + } \ + } while (0) + + CHECK_FEATURE(1, 2, hostQueryReset); + CHECK_FEATURE(1, 2, timelineSemaphore); + + #undef CHECK_FEATURE + return true; +} + + +// List of mandatory device-level functions +// +// Note: Also includes VK_EXT_debug_utils functions, even though they aren't +// mandatory, simply because we load that extension in a special way. +static const struct vk_fun vk_dev_funs[] = { + PL_VK_DEV_FUN(AllocateCommandBuffers), + PL_VK_DEV_FUN(AllocateDescriptorSets), + PL_VK_DEV_FUN(AllocateMemory), + PL_VK_DEV_FUN(BeginCommandBuffer), + PL_VK_DEV_FUN(BindBufferMemory), + PL_VK_DEV_FUN(BindImageMemory), + PL_VK_DEV_FUN(CmdBeginDebugUtilsLabelEXT), + PL_VK_DEV_FUN(CmdBeginRenderPass), + PL_VK_DEV_FUN(CmdBindDescriptorSets), + PL_VK_DEV_FUN(CmdBindIndexBuffer), + PL_VK_DEV_FUN(CmdBindPipeline), + PL_VK_DEV_FUN(CmdBindVertexBuffers), + PL_VK_DEV_FUN(CmdBlitImage), + PL_VK_DEV_FUN(CmdClearColorImage), + PL_VK_DEV_FUN(CmdCopyBuffer), + PL_VK_DEV_FUN(CmdCopyBufferToImage), + PL_VK_DEV_FUN(CmdCopyImage), + PL_VK_DEV_FUN(CmdCopyImageToBuffer), + PL_VK_DEV_FUN(CmdDispatch), + PL_VK_DEV_FUN(CmdDraw), + PL_VK_DEV_FUN(CmdDrawIndexed), + PL_VK_DEV_FUN(CmdEndDebugUtilsLabelEXT), + PL_VK_DEV_FUN(CmdEndRenderPass), + PL_VK_DEV_FUN(CmdPipelineBarrier), + PL_VK_DEV_FUN(CmdPushConstants), + PL_VK_DEV_FUN(CmdResetQueryPool), + PL_VK_DEV_FUN(CmdSetScissor), + PL_VK_DEV_FUN(CmdSetViewport), + PL_VK_DEV_FUN(CmdUpdateBuffer), + PL_VK_DEV_FUN(CmdWriteTimestamp), + PL_VK_DEV_FUN(CreateBuffer), + PL_VK_DEV_FUN(CreateBufferView), + PL_VK_DEV_FUN(CreateCommandPool), + PL_VK_DEV_FUN(CreateComputePipelines), + PL_VK_DEV_FUN(CreateDescriptorPool), + PL_VK_DEV_FUN(CreateDescriptorSetLayout), + PL_VK_DEV_FUN(CreateFence), + PL_VK_DEV_FUN(CreateFramebuffer), + PL_VK_DEV_FUN(CreateGraphicsPipelines), + PL_VK_DEV_FUN(CreateImage), + PL_VK_DEV_FUN(CreateImageView), + PL_VK_DEV_FUN(CreatePipelineCache), + PL_VK_DEV_FUN(CreatePipelineLayout), + PL_VK_DEV_FUN(CreateQueryPool), + PL_VK_DEV_FUN(CreateRenderPass), + PL_VK_DEV_FUN(CreateSampler), + PL_VK_DEV_FUN(CreateSemaphore), + PL_VK_DEV_FUN(CreateShaderModule), + PL_VK_DEV_FUN(DestroyBuffer), + PL_VK_DEV_FUN(DestroyBufferView), + PL_VK_DEV_FUN(DestroyCommandPool), + PL_VK_DEV_FUN(DestroyDescriptorPool), + PL_VK_DEV_FUN(DestroyDescriptorSetLayout), + PL_VK_DEV_FUN(DestroyDevice), + PL_VK_DEV_FUN(DestroyFence), + PL_VK_DEV_FUN(DestroyFramebuffer), + PL_VK_DEV_FUN(DestroyImage), + PL_VK_DEV_FUN(DestroyImageView), + PL_VK_DEV_FUN(DestroyInstance), + PL_VK_DEV_FUN(DestroyPipeline), + PL_VK_DEV_FUN(DestroyPipelineCache), + PL_VK_DEV_FUN(DestroyPipelineLayout), + PL_VK_DEV_FUN(DestroyQueryPool), + PL_VK_DEV_FUN(DestroyRenderPass), + PL_VK_DEV_FUN(DestroySampler), + PL_VK_DEV_FUN(DestroySemaphore), + PL_VK_DEV_FUN(DestroyShaderModule), + PL_VK_DEV_FUN(DeviceWaitIdle), + PL_VK_DEV_FUN(EndCommandBuffer), + PL_VK_DEV_FUN(FlushMappedMemoryRanges), + PL_VK_DEV_FUN(FreeCommandBuffers), + PL_VK_DEV_FUN(FreeMemory), + PL_VK_DEV_FUN(GetBufferMemoryRequirements), + PL_VK_DEV_FUN(GetDeviceQueue), + PL_VK_DEV_FUN(GetImageMemoryRequirements2), + PL_VK_DEV_FUN(GetImageSubresourceLayout), + PL_VK_DEV_FUN(GetPipelineCacheData), + PL_VK_DEV_FUN(GetQueryPoolResults), + PL_VK_DEV_FUN(InvalidateMappedMemoryRanges), + PL_VK_DEV_FUN(MapMemory), + PL_VK_DEV_FUN(QueueSubmit), + PL_VK_DEV_FUN(QueueWaitIdle), + PL_VK_DEV_FUN(ResetFences), + 
PL_VK_DEV_FUN(ResetQueryPool), + PL_VK_DEV_FUN(SetDebugUtilsObjectNameEXT), + PL_VK_DEV_FUN(UpdateDescriptorSets), + PL_VK_DEV_FUN(WaitForFences), + PL_VK_DEV_FUN(WaitSemaphores), +}; + +static void load_vk_fun(struct vk_ctx *vk, const struct vk_fun *fun) +{ + PFN_vkVoidFunction *pfn = (void *) ((uintptr_t) vk + (ptrdiff_t) fun->offset); + + if (fun->device_level) { + *pfn = vk->GetDeviceProcAddr(vk->dev, fun->name); + } else { + *pfn = vk->GetInstanceProcAddr(vk->inst, fun->name); + }; + + if (!*pfn) { + // Some functions get their extension suffix stripped when promoted + // to core. As a very simple work-around to this, try loading the + // function a second time with the reserved suffixes stripped. + static const char *ext_suffixes[] = { "KHR", "EXT" }; + pl_str fun_name = pl_str0(fun->name); + char buf[64]; + + for (int i = 0; i < PL_ARRAY_SIZE(ext_suffixes); i++) { + if (!pl_str_eatend0(&fun_name, ext_suffixes[i])) + continue; + + pl_assert(sizeof(buf) > fun_name.len); + snprintf(buf, sizeof(buf), "%.*s", PL_STR_FMT(fun_name)); + if (fun->device_level) { + *pfn = vk->GetDeviceProcAddr(vk->dev, buf); + } else { + *pfn = vk->GetInstanceProcAddr(vk->inst, buf); + } + return; + } + } +} + +// Private struct for pl_vk_inst +struct priv { + VkDebugUtilsMessengerEXT debug_utils_cb; +}; + +void pl_vk_inst_destroy(pl_vk_inst *inst_ptr) +{ + pl_vk_inst inst = *inst_ptr; + if (!inst) + return; + + struct priv *p = PL_PRIV(inst); + if (p->debug_utils_cb) { + PL_VK_LOAD_FUN(inst->instance, DestroyDebugUtilsMessengerEXT, inst->get_proc_addr); + DestroyDebugUtilsMessengerEXT(inst->instance, p->debug_utils_cb, PL_VK_ALLOC); + } + + PL_VK_LOAD_FUN(inst->instance, DestroyInstance, inst->get_proc_addr); + DestroyInstance(inst->instance, PL_VK_ALLOC); + pl_free_ptr((void **) inst_ptr); +} + +static VkBool32 VKAPI_PTR vk_dbg_utils_cb(VkDebugUtilsMessageSeverityFlagBitsEXT sev, + VkDebugUtilsMessageTypeFlagsEXT msgType, + const VkDebugUtilsMessengerCallbackDataEXT *data, + void *priv) +{ + pl_log log = priv; + + // Ignore errors for messages that we consider false positives + switch (data->messageIdNumber) { + case 0x7cd0911d: // VUID-VkSwapchainCreateInfoKHR-imageExtent-01274 + case 0x8928392f: // UNASSIGNED-BestPractices-NonSuccess-Result + case 0xdc18ad6b: // UNASSIGNED-BestPractices-vkAllocateMemory-small-allocation + case 0xb3d4346b: // UNASSIGNED-BestPractices-vkBindMemory-small-dedicated-allocation + case 0x6cfe18a5: // UNASSIGNED-BestPractices-SemaphoreCount + case 0x48a09f6c: // UNASSIGNED-BestPractices-pipeline-stage-flags + // profile chain expectations + case 0x30f4ac70: // VUID-VkImageCreateInfo-pNext-06811 + return false; + + case 0x5f379b89: // UNASSIGNED-BestPractices-Error-Result + if (strstr(data->pMessage, "VK_ERROR_FORMAT_NOT_SUPPORTED")) + return false; + break; + + case 0xf6a37cfa: // VUID-vkGetImageSubresourceLayout-format-04461 + // Work around https://github.com/KhronosGroup/Vulkan-Docs/issues/2109 + return false; + } + + enum pl_log_level lev; + switch (sev) { + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT: lev = PL_LOG_ERR; break; + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT: lev = PL_LOG_WARN; break; + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT: lev = PL_LOG_DEBUG; break; + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT: lev = PL_LOG_TRACE; break; + default: lev = PL_LOG_INFO; break; + } + + pl_msg(log, lev, "vk %s", data->pMessage); + + for (int i = 0; i < data->queueLabelCount; i++) + pl_msg(log, lev, " during %s", 
data->pQueueLabels[i].pLabelName); + for (int i = 0; i < data->cmdBufLabelCount; i++) + pl_msg(log, lev, " inside %s", data->pCmdBufLabels[i].pLabelName); + for (int i = 0; i < data->objectCount; i++) { + const VkDebugUtilsObjectNameInfoEXT *obj = &data->pObjects[i]; + pl_msg(log, lev, " using %s: %s (0x%llx)", + vk_obj_type(obj->objectType), + obj->pObjectName ? obj->pObjectName : "anon", + (unsigned long long) obj->objectHandle); + } + + // The return value of this function determines whether the call will + // be explicitly aborted (to prevent GPU errors) or not. In this case, + // we generally want this to be on for the validation errors, but nothing + // else (e.g. performance warnings) + bool is_error = (sev & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) && + (msgType & VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT); + + if (is_error) { + pl_log_stack_trace(log, lev); + pl_debug_abort(); + return true; + } + + return false; +} + +static PFN_vkGetInstanceProcAddr get_proc_addr_fallback(pl_log log, + PFN_vkGetInstanceProcAddr get_proc_addr) +{ + if (get_proc_addr) + return get_proc_addr; + +#ifdef PL_HAVE_VK_PROC_ADDR + return vkGetInstanceProcAddr; +#else + pl_fatal(log, "No `vkGetInstanceProcAddr` function provided, and " + "libplacebo built without linking against this function!"); + return NULL; +#endif +} + +#define PRINTF_VER(ver) \ + (int) VK_API_VERSION_MAJOR(ver), \ + (int) VK_API_VERSION_MINOR(ver), \ + (int) VK_API_VERSION_PATCH(ver) + +pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params) +{ + void *tmp = pl_tmp(NULL); + params = PL_DEF(params, &pl_vk_inst_default_params); + VkInstance inst = NULL; + pl_clock_t start; + + PL_ARRAY(const char *) exts = {0}; + + PFN_vkGetInstanceProcAddr get_addr; + if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr))) + goto error; + + // Query instance version support + uint32_t api_ver = VK_API_VERSION_1_0; + PL_VK_LOAD_FUN(NULL, EnumerateInstanceVersion, get_addr); + if (EnumerateInstanceVersion && EnumerateInstanceVersion(&api_ver) != VK_SUCCESS) + goto error; + + pl_debug(log, "Available instance version: %d.%d.%d", PRINTF_VER(api_ver)); + + if (params->max_api_version) { + api_ver = PL_MIN(api_ver, params->max_api_version); + pl_info(log, "Restricting API version to %d.%d.%d... 
new version %d.%d.%d", + PRINTF_VER(params->max_api_version), PRINTF_VER(api_ver)); + } + + if (api_ver < PL_VK_MIN_VERSION) { + pl_fatal(log, "Instance API version %d.%d.%d is lower than the minimum " + "required version of %d.%d.%d, cannot proceed!", + PRINTF_VER(api_ver), PRINTF_VER(PL_VK_MIN_VERSION)); + goto error; + } + + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + .pApplicationInfo = &(VkApplicationInfo) { + .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, + .apiVersion = api_ver, + }, + }; + + // Enumerate all supported layers + start = pl_clock_now(); + PL_VK_LOAD_FUN(NULL, EnumerateInstanceLayerProperties, get_addr); + uint32_t num_layers_avail = 0; + EnumerateInstanceLayerProperties(&num_layers_avail, NULL); + VkLayerProperties *layers_avail = pl_calloc_ptr(tmp, num_layers_avail, layers_avail); + EnumerateInstanceLayerProperties(&num_layers_avail, layers_avail); + pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance layers"); + + pl_debug(log, "Available layers:"); + for (int i = 0; i < num_layers_avail; i++) { + pl_debug(log, " %s (v%d.%d.%d)", layers_avail[i].layerName, + PRINTF_VER(layers_avail[i].specVersion)); + } + + PL_ARRAY(const char *) layers = {0}; + + // Sorted by priority + static const char *debug_layers[] = { + "VK_LAYER_KHRONOS_validation", + "VK_LAYER_LUNARG_standard_validation", + }; + + // This layer has to be initialized first, otherwise all sorts of weirdness + // happens (random segfaults, yum) + bool debug = params->debug; + uint32_t debug_layer = 0; // layer idx of debug layer + uint32_t debug_layer_version = 0; + if (debug) { + for (int i = 0; i < PL_ARRAY_SIZE(debug_layers); i++) { + for (int n = 0; n < num_layers_avail; n++) { + if (strcmp(debug_layers[i], layers_avail[n].layerName) != 0) + continue; + + debug_layer = n; + debug_layer_version = layers_avail[n].specVersion; + pl_info(log, "Enabling debug meta layer: %s (v%d.%d.%d)", + debug_layers[i], PRINTF_VER(debug_layer_version)); + PL_ARRAY_APPEND(tmp, layers, debug_layers[i]); + goto debug_layers_done; + } + } + + // No layer found.. + pl_warn(log, "API debugging requested but no debug meta layers present... 
ignoring"); + debug = false; + } + +debug_layers_done: ; + + for (int i = 0; i < params->num_layers; i++) + PL_ARRAY_APPEND(tmp, layers, params->layers[i]); + + for (int i = 0; i < params->num_opt_layers; i++) { + const char *layer = params->opt_layers[i]; + for (int n = 0; n < num_layers_avail; n++) { + if (strcmp(layer, layers_avail[n].layerName) == 0) { + PL_ARRAY_APPEND(tmp, layers, layer); + break; + } + } + } + + // Enumerate all supported extensions + start = pl_clock_now(); + PL_VK_LOAD_FUN(NULL, EnumerateInstanceExtensionProperties, get_addr); + uint32_t num_exts_avail = 0; + EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, NULL); + VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail); + EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, exts_avail); + + struct { + VkExtensionProperties *exts; + uint32_t num_exts; + } *layer_exts = pl_calloc_ptr(tmp, num_layers_avail, layer_exts); + + // Enumerate extensions from layers + for (int i = 0; i < num_layers_avail; i++) { + VkExtensionProperties **lexts = &layer_exts[i].exts; + uint32_t *num = &layer_exts[i].num_exts; + + EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, NULL); + *lexts = pl_calloc_ptr(tmp, *num, *lexts); + EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, *lexts); + + // Replace all extensions that are already available globally by {0} + for (int j = 0; j < *num; j++) { + for (int k = 0; k < num_exts_avail; k++) { + if (strcmp((*lexts)[j].extensionName, exts_avail[k].extensionName) == 0) + (*lexts)[j] = (VkExtensionProperties) {0}; + } + } + } + + pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance extensions"); + pl_debug(log, "Available instance extensions:"); + for (int i = 0; i < num_exts_avail; i++) + pl_debug(log, " %s", exts_avail[i].extensionName); + for (int i = 0; i < num_layers_avail; i++) { + for (int j = 0; j < layer_exts[i].num_exts; j++) { + if (!layer_exts[i].exts[j].extensionName[0]) + continue; + + pl_debug(log, " %s (via %s)", + layer_exts[i].exts[j].extensionName, + layers_avail[i].layerName); + } + } + + // Add mandatory extensions + PL_ARRAY_APPEND(tmp, exts, VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); + + // Add optional extensions + for (int i = 0; i < PL_ARRAY_SIZE(vk_instance_extensions); i++) { + const char *ext = vk_instance_extensions[i]; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + break; + } + } + } + +#ifdef VK_KHR_portability_enumeration + // Required for macOS ( MoltenVK ) compatibility + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); + info.flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR; + break; + } + } +#endif + + // Add extra user extensions + for (int i = 0; i < params->num_extensions; i++) { + const char *ext = params->extensions[i]; + PL_ARRAY_APPEND(tmp, exts, ext); + + // Enable any additional layers that are required for this extension + for (int n = 0; n < num_layers_avail; n++) { + for (int j = 0; j < layer_exts[n].num_exts; j++) { + if (!layer_exts[n].exts[j].extensionName[0]) + continue; + if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName); + goto next_user_ext; + } + } + } + +next_user_ext: ; + } + + // Add extra optional 
user extensions + for (int i = 0; i < params->num_opt_extensions; i++) { + const char *ext = params->opt_extensions[i]; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + goto next_opt_user_ext; + } + } + + for (int n = 0; n < num_layers_avail; n++) { + for (int j = 0; j < layer_exts[n].num_exts; j++) { + if (!layer_exts[n].exts[j].extensionName[0]) + continue; + if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName); + goto next_opt_user_ext; + } + } + } + +next_opt_user_ext: ; + } + + // If debugging is enabled, load the necessary debug utils extension + if (debug) { + const char * const ext = VK_EXT_DEBUG_UTILS_EXTENSION_NAME; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + goto debug_ext_done; + } + } + + for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) { + if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + goto debug_ext_done; + } + } + + // No extension found + pl_warn(log, "API debug layers enabled but no debug report extension " + "found... ignoring. Debug messages may be spilling to " + "stdout/stderr!"); + debug = false; + } + +debug_ext_done: ; + + // Limit this to 1.3.250+ because of bugs in older versions. + if (debug && params->debug_extra && + debug_layer_version >= VK_MAKE_API_VERSION(0, 1, 3, 259)) + { + // Try enabling as many validation features as possible + static const VkValidationFeatureEnableEXT validation_features[] = { + VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT, + VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT, + VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT, + VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT, + }; + + static const VkValidationFeaturesEXT vinfo = { + .sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT, + .pEnabledValidationFeatures = validation_features, + .enabledValidationFeatureCount = PL_ARRAY_SIZE(validation_features), + }; + + const char * const ext = VK_EXT_VALIDATION_FEATURES_EXTENSION_NAME; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + vk_link_struct(&info, &vinfo); + goto debug_extra_ext_done; + } + } + + for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) { + if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + vk_link_struct(&info, &vinfo); + goto debug_extra_ext_done; + } + } + + pl_warn(log, "GPU-assisted validation enabled but not supported by " + "instance, disabling..."); + } + +debug_extra_ext_done: ; + + info.ppEnabledExtensionNames = exts.elem; + info.enabledExtensionCount = exts.num; + info.ppEnabledLayerNames = layers.elem; + info.enabledLayerCount = layers.num; + + pl_info(log, "Creating vulkan instance%s", exts.num ? 
" with extensions:" : ""); + for (int i = 0; i < exts.num; i++) + pl_info(log, " %s", exts.elem[i]); + + if (layers.num) { + pl_info(log, " and layers:"); + for (int i = 0; i < layers.num; i++) + pl_info(log, " %s", layers.elem[i]); + } + + start = pl_clock_now(); + PL_VK_LOAD_FUN(NULL, CreateInstance, get_addr); + VkResult res = CreateInstance(&info, PL_VK_ALLOC, &inst); + pl_log_cpu_time(log, start, pl_clock_now(), "creating vulkan instance"); + if (res != VK_SUCCESS) { + pl_fatal(log, "Failed creating instance: %s", vk_res_str(res)); + goto error; + } + + struct pl_vk_inst_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct priv); + struct priv *p = PL_PRIV(pl_vk); + *pl_vk = (struct pl_vk_inst_t) { + .instance = inst, + .api_version = api_ver, + .get_proc_addr = get_addr, + .extensions = pl_steal(pl_vk, exts.elem), + .num_extensions = exts.num, + .layers = pl_steal(pl_vk, layers.elem), + .num_layers = layers.num, + }; + + // Set up a debug callback to catch validation messages + if (debug) { + VkDebugUtilsMessengerCreateInfoEXT dinfo = { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, + .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT, + .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, + .pfnUserCallback = vk_dbg_utils_cb, + .pUserData = (void *) log, + }; + + PL_VK_LOAD_FUN(inst, CreateDebugUtilsMessengerEXT, get_addr); + CreateDebugUtilsMessengerEXT(inst, &dinfo, PL_VK_ALLOC, &p->debug_utils_cb); + } + + pl_free(tmp); + return pl_vk; + +error: + pl_fatal(log, "Failed initializing vulkan instance"); + if (inst) { + PL_VK_LOAD_FUN(inst, DestroyInstance, get_addr); + DestroyInstance(inst, PL_VK_ALLOC); + } + pl_free(tmp); + return NULL; +} + +const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS }; + +void pl_vulkan_destroy(pl_vulkan *pl_vk) +{ + if (!*pl_vk) + return; + + struct vk_ctx *vk = PL_PRIV(*pl_vk); + if (vk->dev) { + if ((*pl_vk)->gpu) { + PL_DEBUG(vk, "Waiting for remaining commands..."); + pl_gpu_finish((*pl_vk)->gpu); + pl_assert(vk->cmds_pending.num == 0); + + pl_gpu_destroy((*pl_vk)->gpu); + } + vk_malloc_destroy(&vk->ma); + for (int i = 0; i < vk->pools.num; i++) + vk_cmdpool_destroy(vk->pools.elem[i]); + + if (!vk->imported) + vk->DestroyDevice(vk->dev, PL_VK_ALLOC); + } + + for (int i = 0; i < vk->queue_locks.num; i++) { + for (int n = 0; n < vk->queue_locks.elem[i].num; n++) + pl_mutex_destroy(&vk->queue_locks.elem[i].elem[n]); + } + + pl_vk_inst_destroy(&vk->internal_instance); + pl_mutex_destroy(&vk->lock); + pl_free_ptr((void **) pl_vk); +} + +static bool supports_surf(pl_log log, VkInstance inst, + PFN_vkGetInstanceProcAddr get_addr, + VkPhysicalDevice physd, VkSurfaceKHR surf) +{ + // Hack for the VK macro's logging to work + struct { pl_log log; } *vk = (void *) &log; + + PL_VK_LOAD_FUN(inst, GetPhysicalDeviceQueueFamilyProperties, get_addr); + PL_VK_LOAD_FUN(inst, GetPhysicalDeviceSurfaceSupportKHR, get_addr); + uint32_t qfnum = 0; + GetPhysicalDeviceQueueFamilyProperties(physd, &qfnum, NULL); + + for (int i = 0; i < qfnum; i++) { + VkBool32 sup = false; + VK(GetPhysicalDeviceSurfaceSupportKHR(physd, i, surf, &sup)); + if (sup) + return true; + } + +error: + return false; +} + +VkPhysicalDevice pl_vulkan_choose_device(pl_log log, + const struct 
pl_vulkan_device_params *params) +{ + // Hack for the VK macro's logging to work + struct { pl_log log; } *vk = (void *) &log; + PL_INFO(vk, "Probing for vulkan devices:"); + + pl_assert(params->instance); + VkInstance inst = params->instance; + VkPhysicalDevice dev = VK_NULL_HANDLE; + + PFN_vkGetInstanceProcAddr get_addr; + if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr))) + return NULL; + + PL_VK_LOAD_FUN(inst, EnumeratePhysicalDevices, get_addr); + PL_VK_LOAD_FUN(inst, GetPhysicalDeviceProperties2, get_addr); + pl_assert(GetPhysicalDeviceProperties2); + + pl_clock_t start = pl_clock_now(); + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + VK(EnumeratePhysicalDevices(inst, &num, NULL)); + devices = pl_calloc_ptr(NULL, num, devices); + VK(EnumeratePhysicalDevices(inst, &num, devices)); + pl_log_cpu_time(log, start, pl_clock_now(), "enumerating physical devices"); + + static const struct { const char *name; int priority; } types[] = { + [VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU] = {"discrete", 5}, + [VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU] = {"integrated", 4}, + [VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU] = {"virtual", 3}, + [VK_PHYSICAL_DEVICE_TYPE_CPU] = {"software", 2}, + [VK_PHYSICAL_DEVICE_TYPE_OTHER] = {"other", 1}, + }; + + static const uint8_t nil[VK_UUID_SIZE] = {0}; + bool uuid_set = memcmp(params->device_uuid, nil, VK_UUID_SIZE) != 0; + + int best = -1; + for (int i = 0; i < num; i++) { + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + }; + + VkPhysicalDeviceProperties2 prop = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &id_props, + }; + + GetPhysicalDeviceProperties2(devices[i], &prop); + VkPhysicalDeviceType t = prop.properties.deviceType; + const char *dtype = t < PL_ARRAY_SIZE(types) ? types[t].name : "unknown?"; + PL_INFO(vk, " GPU %d: %s v%d.%d.%d (%s)", i, prop.properties.deviceName, + PRINTF_VER(prop.properties.apiVersion), dtype); + PL_INFO(vk, " uuid: %s", PRINT_UUID(id_props.deviceUUID)); + + if (params->surface) { + if (!supports_surf(log, inst, get_addr, devices[i], params->surface)) { + PL_DEBUG(vk, " -> excluding due to lack of surface support"); + continue; + } + } + + if (uuid_set) { + if (memcmp(id_props.deviceUUID, params->device_uuid, VK_UUID_SIZE) == 0) { + dev = devices[i]; + continue; + } else { + PL_DEBUG(vk, " -> excluding due to UUID mismatch"); + continue; + } + } else if (params->device_name && params->device_name[0] != '\0') { + if (strcmp(params->device_name, prop.properties.deviceName) == 0) { + dev = devices[i]; + continue; + } else { + PL_DEBUG(vk, " -> excluding due to name mismatch"); + continue; + } + } + + if (!params->allow_software && t == VK_PHYSICAL_DEVICE_TYPE_CPU) { + PL_DEBUG(vk, " -> excluding due to !params->allow_software"); + continue; + } + + if (prop.properties.apiVersion < PL_VK_MIN_VERSION) { + PL_DEBUG(vk, " -> excluding due to too low API version"); + continue; + } + + int priority = t < PL_ARRAY_SIZE(types) ? 
types[t].priority : 0;
+        if (priority > best) {
+            dev = devices[i];
+            best = priority;
+        }
+    }
+
+error:
+    pl_free(devices);
+    return dev;
+}
+
+static void lock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+    struct vk_ctx *vk = priv;
+    pl_mutex_lock(&vk->queue_locks.elem[qf].elem[qidx]);
+}
+
+static void unlock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+    struct vk_ctx *vk = priv;
+    pl_mutex_unlock(&vk->queue_locks.elem[qf].elem[qidx]);
+}
+
+static void init_queue_locks(struct vk_ctx *vk, uint32_t qfnum,
+                             const VkQueueFamilyProperties *qfs)
+{
+    vk->queue_locks.elem = pl_calloc_ptr(vk->alloc, qfnum, vk->queue_locks.elem);
+    vk->queue_locks.num = qfnum;
+    for (int i = 0; i < qfnum; i++) {
+        const uint32_t qnum = qfs[i].queueCount;
+        vk->queue_locks.elem[i].elem = pl_calloc(vk->alloc, qnum, sizeof(pl_mutex));
+        vk->queue_locks.elem[i].num = qnum;
+        for (int n = 0; n < qnum; n++)
+            pl_mutex_init(&vk->queue_locks.elem[i].elem[n]);
+    }
+
+    vk->lock_queue = lock_queue_internal;
+    vk->unlock_queue = unlock_queue_internal;
+    vk->queue_ctx = vk;
+}
+
+// Find the most specialized queue supporting a combination of flags. In cases
+// where there are multiple queue families at the same specialization level,
+// this finds the one with the most queues. Returns -1 if no queue was found.
+static int find_qf(VkQueueFamilyProperties *qfs, int qfnum, VkQueueFlags flags)
+{
+    int idx = -1;
+    for (int i = 0; i < qfnum; i++) {
+        if ((qfs[i].queueFlags & flags) != flags)
+            continue;
+
+        // QF is more specialized. Since we don't care about other bits like
+        // SPARSE_BIT, mask the ones we're interested in
+        const VkQueueFlags mask = VK_QUEUE_GRAPHICS_BIT |
+                                  VK_QUEUE_TRANSFER_BIT |
+                                  VK_QUEUE_COMPUTE_BIT;
+
+        if (idx < 0 || (qfs[i].queueFlags & mask) < (qfs[idx].queueFlags & mask))
+            idx = i;
+
+        // QF has more queues (at the same specialization level)
+        if (qfs[i].queueFlags == qfs[idx].queueFlags &&
+            qfs[i].queueCount > qfs[idx].queueCount)
+            idx = i;
+    }
+
+    return idx;
+}
+
+static bool device_init(struct vk_ctx *vk, const struct pl_vulkan_params *params)
+{
+    pl_assert(vk->physd);
+    void *tmp = pl_tmp(NULL);
+
+    // Enumerate the queue families and find suitable families for each task
+    uint32_t qfnum = 0;
+    vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL);
+    VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs);
+    vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs);
+    init_queue_locks(vk, qfnum, qfs);
+
+    PL_DEBUG(vk, "Queue families supported by device:");
+    for (int i = 0; i < qfnum; i++) {
+        PL_DEBUG(vk, "    %d: flags 0x%"PRIx32" num %"PRIu32, i,
+                 qfs[i].queueFlags, qfs[i].queueCount);
+    }
+
+    VkQueueFlagBits gfx_flags = VK_QUEUE_GRAPHICS_BIT;
+    if (!params->async_compute)
+        gfx_flags |= VK_QUEUE_COMPUTE_BIT;
+
+    int idx_gfx = find_qf(qfs, qfnum, gfx_flags);
+    int idx_comp = find_qf(qfs, qfnum, VK_QUEUE_COMPUTE_BIT);
+    int idx_tf = find_qf(qfs, qfnum, VK_QUEUE_TRANSFER_BIT);
+    if (idx_tf < 0)
+        idx_tf = idx_comp;
+
+    if (!params->async_compute)
+        idx_comp = idx_gfx;
+    if (!params->async_transfer)
+        idx_tf = idx_gfx;
+
+    PL_DEBUG(vk, "Using graphics queue %d", idx_gfx);
+    if (idx_tf != idx_gfx)
+        PL_INFO(vk, "Using async transfer (queue %d)", idx_tf);
+    if (idx_comp != idx_gfx)
+        PL_INFO(vk, "Using async compute (queue %d)", idx_comp);
+
+    // Vulkan requires at least one GRAPHICS+COMPUTE queue, so if this fails
+    // something is horribly wrong.
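    // [Editorial note: illustrative example, not part of the upstream patch.]
    // On a hypothetical device exposing
    //     QF 0: GRAPHICS|COMPUTE|TRANSFER, 16 queues
    //     QF 1: COMPUTE|TRANSFER,           8 queues
    //     QF 2: TRANSFER,                   2 queues
    // find_qf() above picks QF 0 for graphics, QF 1 for compute and QF 2 for
    // transfer, since the numerically smallest masked capability set wins.
    // With async_compute/async_transfer disabled, idx_comp and idx_tf are
    // subsequently collapsed back onto the graphics family.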
+    pl_assert(idx_gfx >= 0 && idx_comp >= 0 && idx_tf >= 0);
+
+    // If needed, ensure we can actually present to the surface using this queue
+    if (params->surface) {
+        VkBool32 sup = false;
+        VK(vk->GetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx_gfx,
+                                                  params->surface, &sup));
+        if (!sup) {
+            PL_FATAL(vk, "Queue family does not support surface presentation!");
+            goto error;
+        }
+    }
+
+    // Enumerate all supported extensions
+    pl_clock_t start = pl_clock_now();
+    uint32_t num_exts_avail = 0;
+    VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, NULL));
+    VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail);
+    VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, exts_avail));
+    pl_log_cpu_time(vk->log, start, pl_clock_now(), "enumerating device extensions");
+
+    PL_DEBUG(vk, "Available device extensions:");
+    for (int i = 0; i < num_exts_avail; i++)
+        PL_DEBUG(vk, "    %s", exts_avail[i].extensionName);
+
+    // Add all extensions we need
+    if (params->surface)
+        PL_ARRAY_APPEND(vk->alloc, vk->exts, VK_KHR_SWAPCHAIN_EXTENSION_NAME);
+
+    // Keep track of all optional function pointers associated with extensions
+    PL_ARRAY(const struct vk_fun *) ext_funs = {0};
+
+    // Add all optional device-level extensions
+    for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) {
+        const struct vk_ext *ext = &vk_device_extensions[i];
+        uint32_t core_ver = vk_ext_promoted_ver(ext->name);
+        if (core_ver && vk->api_ver >= core_ver) {
+            // Extension is already implicitly enabled by the API version
+            for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+                PL_ARRAY_APPEND(tmp, ext_funs, f);
+            continue;
+        }
+
+        for (int n = 0; n < num_exts_avail; n++) {
+            if (strcmp(ext->name, exts_avail[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(vk->alloc, vk->exts, ext->name);
+                for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+                    PL_ARRAY_APPEND(tmp, ext_funs, f);
+                break;
+            }
+        }
+    }
+
+    // Add extra user extensions
+    for (int i = 0; i < params->num_extensions; i++)
+        PL_ARRAY_APPEND(vk->alloc, vk->exts, params->extensions[i]);
+
+    // Add optional extra user extensions
+    for (int i = 0; i < params->num_opt_extensions; i++) {
+        const char *ext = params->opt_extensions[i];
+        for (int n = 0; n < num_exts_avail; n++) {
+            if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(vk->alloc, vk->exts, ext);
+                break;
+            }
+        }
+    }
+
+    VkPhysicalDeviceFeatures2 features = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR
+    };
+
+    vk_features_normalize(tmp, &pl_vulkan_required_features, vk->api_ver, &features);
+    vk_features_normalize(tmp, &pl_vulkan_recommended_features, vk->api_ver, &features);
+    vk_features_normalize(tmp, params->features, vk->api_ver, &features);
+
+    // Explicitly clear the features struct before querying feature support
+    // from the driver. This way, we don't mistakenly mark as supported
+    // features coming from structs the driver doesn't have support for.
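    // [Editorial note: not part of the upstream patch.] Concretely, if the
    // caller requested e.g. shaderFloat16 but the driver reports it as false,
    // the bitwise AND in the filtering loop below clears that flag again, so
    // only features that are both requested and actually supported survive
    // into the final feature chain.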
+ VkPhysicalDeviceFeatures2 *features_sup = vk_chain_memdup(tmp, &features);; + for (VkBaseOutStructure *out = (void *) features_sup; out; out = out->pNext) { + const size_t size = vk_struct_size(out->sType); + memset(&out[1], 0, size - sizeof(out[0])); + } + + vk->GetPhysicalDeviceFeatures2KHR(vk->physd, features_sup); + + // Filter out unsupported features + for (VkBaseOutStructure *f = (VkBaseOutStructure *) &features; f; f = f->pNext) { + const VkBaseInStructure *sup = vk_find_struct(features_sup, f->sType); + VkBool32 *flags = (VkBool32 *) &f[1]; + const VkBool32 *flags_sup = (const VkBool32 *) &sup[1]; + const size_t size = vk_struct_size(f->sType) - sizeof(VkBaseOutStructure); + for (int i = 0; i < size / sizeof(VkBool32); i++) + flags[i] &= flags_sup[i]; + } + + // Construct normalized output chain + vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + vk_features_normalize(vk->alloc, &features, 0, &vk->features); + if (!check_required_features(vk)) { + PL_FATAL(vk, "Vulkan device does not support all required features!"); + goto error; + } + + // Enable all queues at device creation time, to maximize compatibility + // with other API users (e.g. FFmpeg) + PL_ARRAY(VkDeviceQueueCreateInfo) qinfos = {0}; + for (int i = 0; i < qfnum; i++) { + bool use_qf = i == idx_gfx || i == idx_comp || i == idx_tf; + use_qf |= qfs[i].queueFlags & params->extra_queues; + if (!use_qf) + continue; + PL_ARRAY_APPEND(tmp, qinfos, (VkDeviceQueueCreateInfo) { + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .queueFamilyIndex = i, + .queueCount = qfs[i].queueCount, + .pQueuePriorities = pl_calloc(tmp, qfs[i].queueCount, sizeof(float)), + }); + } + + VkDeviceCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .pNext = &features, + .pQueueCreateInfos = qinfos.elem, + .queueCreateInfoCount = qinfos.num, + .ppEnabledExtensionNames = vk->exts.elem, + .enabledExtensionCount = vk->exts.num, + }; + + PL_INFO(vk, "Creating vulkan device%s", vk->exts.num ? 
" with extensions:" : ""); + for (int i = 0; i < vk->exts.num; i++) + PL_INFO(vk, " %s", vk->exts.elem[i]); + + start = pl_clock_now(); + VK(vk->CreateDevice(vk->physd, &dinfo, PL_VK_ALLOC, &vk->dev)); + pl_log_cpu_time(vk->log, start, pl_clock_now(), "creating vulkan device"); + + // Load all mandatory device-level functions + for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++) + load_vk_fun(vk, &vk_dev_funs[i]); + + // Load all of the optional functions from the extensions we enabled + for (int i = 0; i < ext_funs.num; i++) + load_vk_fun(vk, ext_funs.elem[i]); + + // Create the command pools for the queues we care about + const uint32_t qmax = PL_DEF(params->queue_count, UINT32_MAX); + for (int i = 0; i < qfnum; i++) { + if (i != idx_gfx && i != idx_tf && i != idx_comp) + continue; // ignore QFs not used internally + + int qnum = qfs[i].queueCount; + if (qmax < qnum) { + PL_DEBUG(vk, "Restricting QF %d from %d queues to %d", i, qnum, qmax); + qnum = qmax; + } + + struct vk_cmdpool *pool = vk_cmdpool_create(vk, i, qnum, qfs[i]); + if (!pool) + goto error; + PL_ARRAY_APPEND(vk->alloc, vk->pools, pool); + + // Update the pool_* pointers based on the corresponding index + const char *qf_name = NULL; + if (i == idx_tf) { + vk->pool_transfer = pool; + qf_name = "transfer"; + } + if (i == idx_comp) { + vk->pool_compute = pool; + qf_name = "compute"; + } + if (i == idx_gfx) { + vk->pool_graphics = pool; + qf_name = "graphics"; + } + + for (int n = 0; n < pool->num_queues; n++) + PL_VK_NAME_HANDLE(QUEUE, pool->queues[n], qf_name); + } + + pl_free(tmp); + return true; + +error: + PL_FATAL(vk, "Failed creating logical device!"); + pl_free(tmp); + vk->failed = true; + return false; +} + +static void lock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx) +{ + struct vk_ctx *vk = PL_PRIV(pl_vk); + vk->lock_queue(vk->queue_ctx, qf, qidx); +} + +static void unlock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx) +{ + struct vk_ctx *vk = PL_PRIV(pl_vk); + vk->unlock_queue(vk->queue_ctx, qf, qidx); +} + +static bool finalize_context(struct pl_vulkan_t *pl_vk, int max_glsl_version) +{ + struct vk_ctx *vk = PL_PRIV(pl_vk); + + pl_assert(vk->pool_graphics); + pl_assert(vk->pool_compute); + pl_assert(vk->pool_transfer); + + vk->ma = vk_malloc_create(vk); + if (!vk->ma) + return false; + + pl_vk->gpu = pl_gpu_create_vk(vk); + if (!pl_vk->gpu) + return false; + + // Blacklist / restrict features + if (max_glsl_version) { + struct pl_glsl_version *glsl = (struct pl_glsl_version *) &pl_vk->gpu->glsl; + glsl->version = PL_MIN(glsl->version, max_glsl_version); + glsl->version = PL_MAX(glsl->version, 140); // required for GL_KHR_vulkan_glsl + PL_INFO(vk, "Restricting GLSL version to %d... 
new version is %d", + max_glsl_version, glsl->version); + } + + // Expose the resulting vulkan objects + pl_vk->instance = vk->inst; + pl_vk->phys_device = vk->physd; + pl_vk->device = vk->dev; + pl_vk->get_proc_addr = vk->GetInstanceProcAddr; + pl_vk->api_version = vk->api_ver; + pl_vk->extensions = vk->exts.elem; + pl_vk->num_extensions = vk->exts.num; + pl_vk->features = &vk->features; + pl_vk->num_queues = vk->pools.num; + pl_vk->queues = pl_calloc_ptr(vk->alloc, vk->pools.num, pl_vk->queues); + pl_vk->lock_queue = lock_queue; + pl_vk->unlock_queue = unlock_queue; + + for (int i = 0; i < vk->pools.num; i++) { + struct pl_vulkan_queue *queues = (struct pl_vulkan_queue *) pl_vk->queues; + queues[i] = (struct pl_vulkan_queue) { + .index = vk->pools.elem[i]->qf, + .count = vk->pools.elem[i]->num_queues, + }; + + if (vk->pools.elem[i] == vk->pool_graphics) + pl_vk->queue_graphics = queues[i]; + if (vk->pools.elem[i] == vk->pool_compute) + pl_vk->queue_compute = queues[i]; + if (vk->pools.elem[i] == vk->pool_transfer) + pl_vk->queue_transfer = queues[i]; + } + + pl_assert(vk->lock_queue); + pl_assert(vk->unlock_queue); + return true; +} + +pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params) +{ + params = PL_DEF(params, &pl_vulkan_default_params); + struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx); + struct vk_ctx *vk = PL_PRIV(pl_vk); + *vk = (struct vk_ctx) { + .vulkan = pl_vk, + .alloc = pl_vk, + .log = log, + .inst = params->instance, + .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr), + }; + + pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE); + if (!vk->GetInstanceProcAddr) + goto error; + + if (!vk->inst) { + pl_assert(!params->surface); + pl_assert(!params->device); + PL_DEBUG(vk, "No VkInstance provided, creating one..."); + + // Mirror the instance params here to set `get_proc_addr` correctly + struct pl_vk_inst_params iparams; + iparams = *PL_DEF(params->instance_params, &pl_vk_inst_default_params); + iparams.get_proc_addr = params->get_proc_addr; + vk->internal_instance = pl_vk_inst_create(log, &iparams); + if (!vk->internal_instance) + goto error; + vk->inst = vk->internal_instance->instance; + } + + // Directly load all mandatory instance-level function pointers, since + // these will be required for all further device creation logic + for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++) + load_vk_fun(vk, &vk_inst_funs[i]); + + // Choose the physical device + if (params->device) { + PL_DEBUG(vk, "Using specified VkPhysicalDevice"); + vk->physd = params->device; + } else { + struct pl_vulkan_device_params dparams = { + .instance = vk->inst, + .get_proc_addr = params->get_proc_addr, + .surface = params->surface, + .device_name = params->device_name, + .allow_software = params->allow_software, + }; + memcpy(dparams.device_uuid, params->device_uuid, VK_UUID_SIZE); + + vk->physd = pl_vulkan_choose_device(log, &dparams); + if (!vk->physd) { + PL_FATAL(vk, "Found no suitable device, giving up."); + goto error; + } + } + + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + }; + + VkPhysicalDeviceProperties2KHR prop = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &id_props, + }; + + vk->GetPhysicalDeviceProperties2(vk->physd, &prop); + vk->props = prop.properties; + + PL_INFO(vk, "Vulkan device properties:"); + PL_INFO(vk, " Device Name: %s", prop.properties.deviceName); + PL_INFO(vk, " Device ID: %"PRIx32":%"PRIx32, 
prop.properties.vendorID, + prop.properties.deviceID); + PL_INFO(vk, " Device UUID: %s", PRINT_UUID(id_props.deviceUUID)); + PL_INFO(vk, " Driver version: %"PRIx32, prop.properties.driverVersion); + PL_INFO(vk, " API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion)); + + // Needed by device_init + vk->api_ver = prop.properties.apiVersion; + if (params->max_api_version) { + vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version); + PL_INFO(vk, "Restricting API version to %d.%d.%d... new version %d.%d.%d", + PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver)); + } + + if (vk->api_ver < PL_VK_MIN_VERSION) { + PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum " + "required version of %d.%d.%d, cannot proceed!", + PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION)); + goto error; + } + + // Finally, initialize the logical device and the rest of the vk_ctx + if (!device_init(vk, params)) + goto error; + + if (!finalize_context(pl_vk, params->max_glsl_version)) + goto error; + + return pl_vk; + +error: + PL_FATAL(vk, "Failed initializing vulkan device"); + pl_vulkan_destroy((pl_vulkan *) &pl_vk); + return NULL; +} + +pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params) +{ + void *tmp = pl_tmp(NULL); + + struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx); + struct vk_ctx *vk = PL_PRIV(pl_vk); + *vk = (struct vk_ctx) { + .vulkan = pl_vk, + .alloc = pl_vk, + .log = log, + .imported = true, + .inst = params->instance, + .physd = params->phys_device, + .dev = params->device, + .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr), + .lock_queue = params->lock_queue, + .unlock_queue = params->unlock_queue, + .queue_ctx = params->queue_ctx, + }; + + pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE); + if (!vk->GetInstanceProcAddr) + goto error; + + for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++) + load_vk_fun(vk, &vk_inst_funs[i]); + + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + }; + + VkPhysicalDeviceProperties2KHR prop = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &id_props, + }; + + pl_assert(vk->GetPhysicalDeviceProperties2); + vk->GetPhysicalDeviceProperties2(vk->physd, &prop); + vk->props = prop.properties; + + PL_INFO(vk, "Imported vulkan device properties:"); + PL_INFO(vk, " Device Name: %s", prop.properties.deviceName); + PL_INFO(vk, " Device ID: %"PRIx32":%"PRIx32, prop.properties.vendorID, + prop.properties.deviceID); + PL_INFO(vk, " Device UUID: %s", PRINT_UUID(id_props.deviceUUID)); + PL_INFO(vk, " Driver version: %"PRIx32, prop.properties.driverVersion); + PL_INFO(vk, " API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion)); + + vk->api_ver = prop.properties.apiVersion; + if (params->max_api_version) { + vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version); + PL_INFO(vk, "Restricting API version to %d.%d.%d... 
new version %d.%d.%d", + PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver)); + } + + if (vk->api_ver < PL_VK_MIN_VERSION) { + PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum " + "required version of %d.%d.%d, cannot proceed!", + PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION)); + goto error; + } + + vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + vk_features_normalize(vk->alloc, params->features, 0, &vk->features); + if (!check_required_features(vk)) { + PL_FATAL(vk, "Imported Vulkan device was not created with all required " + "features!"); + goto error; + } + + // Load all mandatory device-level functions + for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++) + load_vk_fun(vk, &vk_dev_funs[i]); + + // Load all of the optional functions from the extensions enabled + for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) { + const struct vk_ext *ext = &vk_device_extensions[i]; + uint32_t core_ver = vk_ext_promoted_ver(ext->name); + if (core_ver && vk->api_ver >= core_ver) { + for (const struct vk_fun *f = ext->funs; f && f->name; f++) + load_vk_fun(vk, f); + continue; + } + for (int n = 0; n < params->num_extensions; n++) { + if (strcmp(ext->name, params->extensions[n]) == 0) { + for (const struct vk_fun *f = ext->funs; f && f->name; f++) + load_vk_fun(vk, f); + break; + } + } + } + + uint32_t qfnum = 0; + vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL); + VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs); + vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs); + if (!params->lock_queue) + init_queue_locks(vk, qfnum, qfs); + + // Create the command pools for each unique qf that exists + struct { + const struct pl_vulkan_queue *info; + struct vk_cmdpool **pool; + VkQueueFlagBits flags; // *any* of these flags provide the cap + } qinfos[] = { + { + .info = ¶ms->queue_graphics, + .pool = &vk->pool_graphics, + .flags = VK_QUEUE_GRAPHICS_BIT, + }, { + .info = ¶ms->queue_compute, + .pool = &vk->pool_compute, + .flags = VK_QUEUE_COMPUTE_BIT, + }, { + .info = ¶ms->queue_transfer, + .pool = &vk->pool_transfer, + .flags = VK_QUEUE_TRANSFER_BIT | + VK_QUEUE_GRAPHICS_BIT | + VK_QUEUE_COMPUTE_BIT, + } + }; + + for (int i = 0; i < PL_ARRAY_SIZE(qinfos); i++) { + int qf = qinfos[i].info->index; + struct vk_cmdpool **pool = qinfos[i].pool; + if (!qinfos[i].info->count) + continue; + + // API sanity check + pl_assert(qfs[qf].queueFlags & qinfos[i].flags); + + // See if we already created a pool for this queue family + for (int j = 0; j < i; j++) { + if (qinfos[j].info->count && qinfos[j].info->index == qf) { + *pool = *qinfos[j].pool; + goto next_qf; + } + } + + *pool = vk_cmdpool_create(vk, qf, qinfos[i].info->count, qfs[qf]); + if (!*pool) + goto error; + PL_ARRAY_APPEND(vk->alloc, vk->pools, *pool); + + // Pre-emptively set "lower priority" pools as well + for (int j = i+1; j < PL_ARRAY_SIZE(qinfos); j++) { + if (qfs[qf].queueFlags & qinfos[j].flags) + *qinfos[j].pool = *pool; + } + +next_qf: ; + } + + if (!vk->pool_graphics) { + PL_ERR(vk, "No valid queues provided?"); + goto error; + } + + if (!finalize_context(pl_vk, params->max_glsl_version)) + goto error; + + pl_free(tmp); + return pl_vk; + +error: + PL_FATAL(vk, "Failed importing vulkan device"); + pl_vulkan_destroy((pl_vulkan *) &pl_vk); + pl_free(tmp); + return NULL; +} diff --git a/src/vulkan/formats.c b/src/vulkan/formats.c new file mode 100644 index 0000000..f0eb0fb --- /dev/null +++ b/src/vulkan/formats.c @@ -0,0 +1,616 @@ +/* + * 
This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "formats.h" + +#define FMT(_name, num, size, ftype, bits, idx) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_##ftype, \ + .num_components = num, \ + .component_depth = bits, \ + .internal_size = size, \ + .opaque = false, \ + .texel_size = size, \ + .texel_align = size, \ + .host_bits = bits, \ + .sample_order = idx, \ + } + +#define IDX(...) {__VA_ARGS__} +#define BITS(...) {__VA_ARGS__} + +#define REGFMT(name, num, bits, type) \ + FMT(name, num, (num) * (bits) / 8, type, \ + BITS(bits, bits, bits, bits), \ + IDX(0, 1, 2, 3)) + +#define EMUFMT(_name, in, en, ib, eb, ftype) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_##ftype, \ + .num_components = en, \ + .component_depth = BITS(ib, ib, ib, ib),\ + .internal_size = (in) * (ib) / 8, \ + .opaque = false, \ + .emulated = true, \ + .texel_size = (en) * (eb) / 8, \ + .texel_align = (eb) / 8, \ + .host_bits = BITS(eb, eb, eb, eb),\ + .sample_order = IDX(0, 1, 2, 3), \ + } + +#define PACKED16FMT(_name, num, b) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_UNORM, \ + .num_components = num, \ + .component_depth = BITS(b, b, b, b), \ + .internal_size = (num) * 2, \ + .texel_size = (num) * 2, \ + .texel_align = (num) * 2, \ + .host_bits = BITS(16, 16, 16, 16),\ + .sample_order = IDX(0, 1, 2, 3), \ + } + +#define PLANARFMT(_name, planes, size, bits) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_UNORM, \ + .num_planes = planes, \ + .num_components = 3, \ + .component_depth = {bits, bits, bits}, \ + .internal_size = size, \ + .opaque = true, \ + } + +static const struct vk_format rgb8e = { + .tfmt = VK_FORMAT_R8G8B8A8_UNORM, + .bfmt = VK_FORMAT_R8G8B8_UNORM, + .icomps = 4, + .fmt = EMUFMT("rgb8", 4, 3, 8, 8, UNORM), +}; + +static const struct vk_format rgb16e = { + .tfmt = VK_FORMAT_R16G16B16A16_UNORM, + .bfmt = VK_FORMAT_R16G16B16_UNORM, + .icomps = 4, + .fmt = EMUFMT("rgb16", 4, 3, 16, 16, UNORM), +}; + +static const struct vk_format vk_formats[] = { + // Regular, byte-aligned integer formats + {VK_FORMAT_R8_UNORM, REGFMT("r8", 1, 8, UNORM)}, + {VK_FORMAT_R8G8_UNORM, REGFMT("rg8", 2, 8, UNORM)}, + {VK_FORMAT_R8G8B8_UNORM, REGFMT("rgb8", 3, 8, UNORM), .emufmt = &rgb8e}, + {VK_FORMAT_R8G8B8A8_UNORM, REGFMT("rgba8", 4, 8, UNORM)}, + {VK_FORMAT_R16_UNORM, REGFMT("r16", 1, 16, UNORM)}, + {VK_FORMAT_R16G16_UNORM, REGFMT("rg16", 2, 16, UNORM)}, + {VK_FORMAT_R16G16B16_UNORM, REGFMT("rgb16", 3, 16, UNORM), .emufmt = &rgb16e}, + {VK_FORMAT_R16G16B16A16_UNORM, REGFMT("rgba16", 4, 16, UNORM)}, + + {VK_FORMAT_R8_SNORM, REGFMT("r8s", 1, 8, SNORM)}, + {VK_FORMAT_R8G8_SNORM, REGFMT("rg8s", 2, 8, SNORM)}, + {VK_FORMAT_R8G8B8_SNORM, REGFMT("rgb8s", 3, 8, SNORM)}, + {VK_FORMAT_R8G8B8A8_SNORM, REGFMT("rgba8s", 4, 8, SNORM)}, + {VK_FORMAT_R16_SNORM, REGFMT("r16s", 1, 16, SNORM)}, + 
{VK_FORMAT_R16G16_SNORM, REGFMT("rg16s", 2, 16, SNORM)}, + {VK_FORMAT_R16G16B16_SNORM, REGFMT("rgb16s", 3, 16, SNORM)}, + {VK_FORMAT_R16G16B16A16_SNORM, REGFMT("rgba16s", 4, 16, SNORM)}, + + // Float formats (native formats: hf = half float, df = double float) + {VK_FORMAT_R16_SFLOAT, REGFMT("r16hf", 1, 16, FLOAT)}, + {VK_FORMAT_R16G16_SFLOAT, REGFMT("rg16hf", 2, 16, FLOAT)}, + {VK_FORMAT_R16G16B16_SFLOAT, REGFMT("rgb16hf", 3, 16, FLOAT)}, + {VK_FORMAT_R16G16B16A16_SFLOAT, REGFMT("rgba16hf", 4, 16, FLOAT)}, + {VK_FORMAT_R32_SFLOAT, REGFMT("r32f", 1, 32, FLOAT)}, + {VK_FORMAT_R32G32_SFLOAT, REGFMT("rg32f", 2, 32, FLOAT)}, + {VK_FORMAT_R32G32B32_SFLOAT, REGFMT("rgb32f", 3, 32, FLOAT)}, + {VK_FORMAT_R32G32B32A32_SFLOAT, REGFMT("rgba32f", 4, 32, FLOAT)}, + + // Float formats (emulated upload/download) + {VK_FORMAT_R16_SFLOAT, EMUFMT("r16f", 1, 1, 16, 32, FLOAT)}, + {VK_FORMAT_R16G16_SFLOAT, EMUFMT("rg16f", 2, 2, 16, 32, FLOAT)}, + {VK_FORMAT_R16G16B16_SFLOAT, EMUFMT("rgb16f", 3, 3, 16, 32, FLOAT)}, + {VK_FORMAT_R16G16B16A16_SFLOAT, EMUFMT("rgba16f", 4, 4, 16, 32, FLOAT)}, + + // Integer-sampled formats + {VK_FORMAT_R8_UINT, REGFMT("r8u", 1, 8, UINT)}, + {VK_FORMAT_R8G8_UINT, REGFMT("rg8u", 2, 8, UINT)}, + {VK_FORMAT_R8G8B8_UINT, REGFMT("rgb8u", 3, 8, UINT)}, + {VK_FORMAT_R8G8B8A8_UINT, REGFMT("rgba8u", 4, 8, UINT)}, + {VK_FORMAT_R16_UINT, REGFMT("r16u", 1, 16, UINT)}, + {VK_FORMAT_R16G16_UINT, REGFMT("rg16u", 2, 16, UINT)}, + {VK_FORMAT_R16G16B16_UINT, REGFMT("rgb16u", 3, 16, UINT)}, + {VK_FORMAT_R16G16B16A16_UINT, REGFMT("rgba16u", 4, 16, UINT)}, + {VK_FORMAT_R32_UINT, REGFMT("r32u", 1, 32, UINT)}, + {VK_FORMAT_R32G32_UINT, REGFMT("rg32u", 2, 32, UINT)}, + {VK_FORMAT_R32G32B32_UINT, REGFMT("rgb32u", 3, 32, UINT)}, + {VK_FORMAT_R32G32B32A32_UINT, REGFMT("rgba32u", 4, 32, UINT)}, + + {VK_FORMAT_R8_SINT, REGFMT("r8i", 1, 8, SINT)}, + {VK_FORMAT_R8G8_SINT, REGFMT("rg8i", 2, 8, SINT)}, + {VK_FORMAT_R8G8B8_SINT, REGFMT("rgb8i", 3, 8, SINT)}, + {VK_FORMAT_R8G8B8A8_SINT, REGFMT("rgba8i", 4, 8, SINT)}, + {VK_FORMAT_R16_SINT, REGFMT("r16i", 1, 16, SINT)}, + {VK_FORMAT_R16G16_SINT, REGFMT("rg16i", 2, 16, SINT)}, + {VK_FORMAT_R16G16B16_SINT, REGFMT("rgb16i", 3, 16, SINT)}, + {VK_FORMAT_R16G16B16A16_SINT, REGFMT("rgba16i", 4, 16, SINT)}, + {VK_FORMAT_R32_SINT, REGFMT("r32i", 1, 32, SINT)}, + {VK_FORMAT_R32G32_SINT, REGFMT("rg32i", 2, 32, SINT)}, + {VK_FORMAT_R32G32B32_SINT, REGFMT("rgb32i", 3, 32, SINT)}, + {VK_FORMAT_R32G32B32A32_SINT, REGFMT("rgba32i", 4, 32, SINT)}, + + // "Swapped" component order formats + {VK_FORMAT_B8G8R8_UNORM, FMT("bgr8", 3, 3, UNORM, BITS(8, 8, 8), IDX(2, 1, 0))}, + {VK_FORMAT_B8G8R8A8_UNORM, FMT("bgra8", 4, 4, UNORM, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))}, + + {VK_FORMAT_B8G8R8_UINT, FMT("bgr8u", 3, 3, UINT, BITS(8, 8, 8), IDX(2, 1, 0))}, + {VK_FORMAT_B8G8R8A8_UINT, FMT("bgra8u", 4, 4, UINT, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))}, + + {VK_FORMAT_B8G8R8_SINT, FMT("bgr8i", 3, 3, SINT, BITS(8, 8, 8), IDX(2, 1, 0))}, + {VK_FORMAT_B8G8R8A8_SINT, FMT("bgra8i", 4, 4, SINT, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))}, + + // "Packed" integer formats + // + // Note: These have the component order reversed from what the vulkan name + // implies, because we order our IDX from LSB to MSB (consistent with the + // usual ordering from lowest byte to highest byte, on little endian + // platforms), but Vulkan names them from MSB to LSB. 
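+    // As a concrete illustration of this convention: "rgb10a2" below maps to
+    // VK_FORMAT_A2B10G10R10_UNORM_PACK32, which stores R in the lowest 10
+    // bits and A in the topmost 2 bits, so reading from LSB to MSB yields
+    // R, G, B, A and hence IDX(0, 1, 2, 3).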
+ {VK_FORMAT_R4G4_UNORM_PACK8, FMT("gr4", 2, 1, UNORM, BITS(4, 4), IDX(1, 0))}, + {VK_FORMAT_B4G4R4A4_UNORM_PACK16, FMT("argb4", 4, 2, UNORM, BITS(4, 4, 4, 4), IDX(3, 0, 1, 2))}, + {VK_FORMAT_R4G4B4A4_UNORM_PACK16, FMT("abgr4", 4, 2, UNORM, BITS(4, 4, 4, 4), IDX(3, 2, 1, 0))}, + + {VK_FORMAT_R5G6B5_UNORM_PACK16, FMT("bgr565", 3, 2, UNORM, BITS(5, 6, 5), IDX(2, 1, 0))}, + {VK_FORMAT_B5G6R5_UNORM_PACK16, FMT("rgb565", 3, 2, UNORM, BITS(5, 6, 5), IDX(0, 1, 2))}, + + {VK_FORMAT_R5G5B5A1_UNORM_PACK16, FMT("a1bgr5", 4, 2, UNORM, BITS(1, 5, 5, 5), IDX(3, 2, 1, 0))}, + {VK_FORMAT_B5G5R5A1_UNORM_PACK16, FMT("a1rgb5", 4, 2, UNORM, BITS(1, 5, 5, 5), IDX(3, 0, 1, 2))}, + {VK_FORMAT_A1R5G5B5_UNORM_PACK16, FMT("bgr5a1", 4, 2, UNORM, BITS(5, 5, 5, 1), IDX(2, 1, 0, 3))}, + + {VK_FORMAT_A2B10G10R10_UNORM_PACK32, FMT("rgb10a2", 4, 4, UNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_UNORM_PACK32, FMT("bgr10a2", 4, 4, UNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + {VK_FORMAT_A2B10G10R10_SNORM_PACK32, FMT("rgb10a2s", 4, 4, SNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_SNORM_PACK32, FMT("bgr10a2s", 4, 4, SNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + {VK_FORMAT_A2B10G10R10_UINT_PACK32, FMT("rgb10a2u", 4, 4, UINT, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_UINT_PACK32, FMT("bgr10a2u", 4, 4, UINT, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + {VK_FORMAT_A2B10G10R10_SINT_PACK32, FMT("rgb10a2i", 4, 4, SINT, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_SINT_PACK32, FMT("bgr10a2i", 4, 4, SINT, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + + + // Packed 16 bit formats + {VK_FORMAT_R10X6_UNORM_PACK16, PACKED16FMT("rx10", 1, 10)}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, PACKED16FMT("rxgx10", 2, 10)}, + {VK_FORMAT_R12X4_UNORM_PACK16, PACKED16FMT("rx12", 1, 12)}, + {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, PACKED16FMT("rxgx12", 2, 12)}, + + // FIXME: enabling these requires VK_EXT_rgba10x6_formats or equivalent + // {VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16, PACKED16FMT("rxgxbxax10", 4, 10)}, + // {VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16, PACKED16FMT("rxgxbxax12", 4, 12)}, + + // Planar formats + {VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, PLANARFMT("g8_b8_r8_420", 3, 12, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1}, + {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, PLANARFMT("g8_b8_r8_422", 3, 16, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM, .sx = 1}, + {VK_FORMAT_R8_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, PLANARFMT("g8_b8_r8_444", 3, 24, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM}, + }, + }, + + {VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, PLANARFMT("g16_b16_r16_420", 3, 24, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1}, + {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, PLANARFMT("g16_b16_r16_422", 3, 32, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM, .sx = 1}, + {VK_FORMAT_R16_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, PLANARFMT("g16_b16_r16_444", 3, 48, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM}, + }, + }, + + {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_420", 3, 24, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 
1}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_422", 3, 32, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_444", 3, 48, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16}, + }, + }, + + {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_420", 3, 24, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_422", 3, 32, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_444", 3, 48, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16}, + }, + }, + + {VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, PLANARFMT("g8_br8_420", 2, 12, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8G8_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, PLANARFMT("g8_br8_422", 2, 16, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8G8_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G8_B8R8_2PLANE_444_UNORM, PLANARFMT("g8_br8_444", 2, 24, 8), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8G8_UNORM}, + }, + }, + + {VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, PLANARFMT("g16_br16_420", 2, 24, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16G16_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, PLANARFMT("g16_br16_422", 2, 32, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16G16_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G16_B16R16_2PLANE_444_UNORM, PLANARFMT("g16_br16_444", 2, 48, 16), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16G16_UNORM}, + }, + }, + + {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_420", 2, 24, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_422", 2, 32, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_444", 2, 48, 10), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16}, + }, + }, + + {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_420", 2, 24, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_422", 2, 32, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_444", 2, 48, 12), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + 
{VK_FORMAT_R12X4G12X4_UNORM_2PACK16}, + }, + }, + + {0} +}; + +#undef BITS +#undef IDX +#undef REGFMT +#undef FMT + +void vk_setup_formats(struct pl_gpu_t *gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + PL_ARRAY(pl_fmt) formats = {0}; + + // Texture format emulation requires at least support for texel buffers + bool has_emu = gpu->glsl.compute && gpu->limits.max_buffer_texels; + + for (const struct vk_format *pvk_fmt = vk_formats; pvk_fmt->tfmt; pvk_fmt++) { + const struct vk_format *vk_fmt = pvk_fmt; + + // Skip formats that require a too new version of Vulkan + if (vk_fmt->min_ver > vk->api_ver) + continue; + + // Skip formats with innately emulated representation if unsupported + if (vk_fmt->fmt.emulated && !has_emu) + continue; + + // Suppress some errors/warnings spit out by the format probing code + pl_log_level_cap(vk->log, PL_LOG_INFO); + + bool has_drm_mods = vk->GetImageDrmFormatModifierPropertiesEXT; + VkDrmFormatModifierPropertiesEXT modifiers[16] = {0}; + VkDrmFormatModifierPropertiesListEXT drm_props = { + .sType = VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT, + .drmFormatModifierCount = PL_ARRAY_SIZE(modifiers), + .pDrmFormatModifierProperties = modifiers, + }; + + VkFormatProperties2KHR prop2 = { + .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2, + .pNext = has_drm_mods ? &drm_props : NULL, + }; + + vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2); + + // If wholly unsupported, try falling back to the emulation formats + // for texture operations + VkFormatProperties *prop = &prop2.formatProperties; + while (has_emu && !prop->optimalTilingFeatures && vk_fmt->emufmt) { + vk_fmt = vk_fmt->emufmt; + vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2); + } + + VkFormatFeatureFlags texflags = prop->optimalTilingFeatures; + VkFormatFeatureFlags bufflags = prop->bufferFeatures; + if (vk_fmt->fmt.emulated) { + // Emulated formats might have a different buffer representation + // than their texture representation. If they don't, assume their + // buffer representation is nonsensical (e.g. r16f) + if (vk_fmt->bfmt) { + vk->GetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->bfmt, prop); + bufflags = prop->bufferFeatures; + } else { + bufflags = 0; + } + } else if (vk_fmt->fmt.num_planes) { + // Planar textures cannot be used directly + texflags = bufflags = 0; + } + + pl_log_level_cap(vk->log, PL_LOG_NONE); + + struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, struct pl_fmt_vk); + struct pl_fmt_vk *fmtp = PL_PRIV(fmt); + *fmt = vk_fmt->fmt; + *fmtp = (struct pl_fmt_vk) { + .vk_fmt = vk_fmt + }; + + // Always set the signature to the actual texture format, so we can use + // it to guarantee renderpass compatibility. 
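+        // (For instance, the emulated "r16f" and the native "r16hf" entries
+        // above both resolve to VK_FORMAT_R16_SFLOAT, so they end up with the
+        // same signature despite differing host representations.)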
+ fmt->signature = (uint64_t) vk_fmt->tfmt; + + // For sanity, clear the superfluous fields + for (int i = fmt->num_components; i < 4; i++) { + fmt->component_depth[i] = 0; + fmt->sample_order[i] = 0; + fmt->host_bits[i] = 0; + } + + // We can set this universally + fmt->fourcc = pl_fmt_fourcc(fmt); + + if (has_drm_mods) { + + if (drm_props.drmFormatModifierCount == PL_ARRAY_SIZE(modifiers)) { + PL_WARN(gpu, "DRM modifier list for format %s possibly truncated", + fmt->name); + } + + // Query the list of supported DRM modifiers from the driver + PL_ARRAY(uint64_t) modlist = {0}; + for (int i = 0; i < drm_props.drmFormatModifierCount; i++) { + if (modifiers[i].drmFormatModifierPlaneCount > 1) { + PL_TRACE(gpu, "Ignoring format modifier %s of " + "format %s because its plane count %d > 1", + PRINT_DRM_MOD(modifiers[i].drmFormatModifier), + fmt->name, modifiers[i].drmFormatModifierPlaneCount); + continue; + } + + // Only warn about texture format features relevant to us + const VkFormatFeatureFlags flag_mask = + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT | + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT | + VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | + VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT | + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_BLIT_DST_BIT; + + + VkFormatFeatureFlags flags = modifiers[i].drmFormatModifierTilingFeatures; + if ((flags & flag_mask) != (texflags & flag_mask)) { + PL_DEBUG(gpu, "DRM format modifier %s of format %s " + "supports fewer caps (0x%"PRIx32") than optimal tiling " + "(0x%"PRIx32"), may result in limited capability!", + PRINT_DRM_MOD(modifiers[i].drmFormatModifier), + fmt->name, flags, texflags); + } + + PL_ARRAY_APPEND(fmt, modlist, modifiers[i].drmFormatModifier); + } + + fmt->num_modifiers = modlist.num; + fmt->modifiers = modlist.elem; + + } else if (gpu->export_caps.tex & PL_HANDLE_DMA_BUF) { + + // Hard-code a list of static mods that we're likely to support + static const uint64_t static_mods[2] = { + DRM_FORMAT_MOD_INVALID, + DRM_FORMAT_MOD_LINEAR, + }; + + fmt->num_modifiers = PL_ARRAY_SIZE(static_mods); + fmt->modifiers = static_mods; + + } + + struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bufbits[] = { + {VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT, PL_FMT_CAP_VERTEX}, + {VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_UNIFORM}, + {VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_STORAGE}, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(bufbits); i++) { + if ((bufflags & bufbits[i].flags) == bufbits[i].flags) + fmt->caps |= bufbits[i].caps; + } + + if (fmt->caps) { + fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, "")); + pl_assert(fmt->glsl_type); + } + + struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bits[] = { + {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT, PL_FMT_CAP_BLENDABLE}, + {VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, PL_FMT_CAP_LINEAR}, + {VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT, PL_FMT_CAP_SAMPLEABLE}, + {VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT, PL_FMT_CAP_STORABLE}, + {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT, PL_FMT_CAP_RENDERABLE}, + + // We don't distinguish between the two blit modes for pl_fmt_caps + {VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT, + PL_FMT_CAP_BLITTABLE}, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(bits); i++) { + if ((texflags & bits[i].flags) == bits[i].flags) + fmt->caps |= bits[i].caps; + } + + // For blit emulation via compute shaders + if (!(fmt->caps & PL_FMT_CAP_BLITTABLE) && (fmt->caps & 
PL_FMT_CAP_STORABLE)) { + fmt->caps |= PL_FMT_CAP_BLITTABLE; + fmtp->blit_emulated = true; + } + + // This is technically supported for all textures, but the semantics + // of pl_gpu require it only be listed for non-opaque ones + if (!fmt->opaque) + fmt->caps |= PL_FMT_CAP_HOST_READABLE; + + // Vulkan requires a minimum GLSL version that supports textureGather() + if (fmt->caps & PL_FMT_CAP_SAMPLEABLE) + fmt->gatherable = true; + + // Disable implied capabilities where the dependencies are unavailable + enum pl_fmt_caps storable = PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE; + if (!(fmt->caps & PL_FMT_CAP_SAMPLEABLE)) + fmt->caps &= ~PL_FMT_CAP_LINEAR; + if (!gpu->glsl.compute) + fmt->caps &= ~storable; + + bool has_nofmt = vk->features.features.shaderStorageImageReadWithoutFormat && + vk->features.features.shaderStorageImageWriteWithoutFormat; + + if (fmt->caps & storable) { + int real_comps = PL_DEF(vk_fmt->icomps, fmt->num_components); + fmt->glsl_format = pl_fmt_glsl_format(fmt, real_comps); + if (!fmt->glsl_format && !has_nofmt) { + PL_DEBUG(gpu, "Storable format '%s' has no matching GLSL " + "format qualifier but read/write without format " + "is not supported.. disabling", fmt->name); + fmt->caps &= ~storable; + } + } + + if (fmt->caps & storable) + fmt->caps |= PL_FMT_CAP_READWRITE; + + // Pick sub-plane formats for planar formats + for (int n = 0; n < fmt->num_planes; n++) { + for (int i = 0; i < formats.num; i++) { + if (formats.elem[i]->signature == vk_fmt->pfmt[n].fmt) { + fmt->planes[n].format = formats.elem[i]; + fmt->planes[n].shift_x = vk_fmt->pfmt[n].sx; + fmt->planes[n].shift_y = vk_fmt->pfmt[n].sy; + break; + } + } + + pl_assert(fmt->planes[n].format); + } + + PL_ARRAY_APPEND(gpu, formats, fmt); + } + + gpu->formats = formats.elem; + gpu->num_formats = formats.num; +} diff --git a/src/vulkan/formats.h b/src/vulkan/formats.h new file mode 100644 index 0000000..b1408fd --- /dev/null +++ b/src/vulkan/formats.h @@ -0,0 +1,34 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "common.h" +#include "gpu.h" + +struct vk_format { + VkFormat tfmt; // internal vulkan format enum (textures) + struct pl_fmt_t fmt;// pl_fmt template (features will be auto-detected) + int icomps; // internal component count (or 0 to infer from `fmt`) + VkFormat bfmt; // vulkan format for use as buffers (or 0 to use `tfmt`) + const struct vk_format *emufmt; // alternate format for emulation + uint32_t min_ver; // minimum vulkan API version for this format to exist + struct { VkFormat fmt; int sx, sy; } pfmt[4]; // plane formats (for planar textures) +}; + +// Add all supported formats to the `pl_gpu` format list +void vk_setup_formats(struct pl_gpu_t *gpu); diff --git a/src/vulkan/gpu.c b/src/vulkan/gpu.c new file mode 100644 index 0000000..69aca67 --- /dev/null +++ b/src/vulkan/gpu.c @@ -0,0 +1,924 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "formats.h" +#include "glsl/spirv.h" + +#ifdef PL_HAVE_UNIX +#include <unistd.h> +#endif + +// Gives us enough queries for 8 results +#define QUERY_POOL_SIZE 16 + +struct pl_timer_t { + VkQueryPool qpool; // even=start, odd=stop + int index_write; // next index to write to + int index_read; // next index to read from + uint64_t pending; // bitmask of queries that are still running +}; + +static inline uint64_t timer_bit(int index) +{ + return 1llu << (index / 2); +} + +static void timer_destroy_cb(pl_gpu gpu, pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_assert(!timer->pending); + vk->DestroyQueryPool(vk->dev, timer->qpool, PL_VK_ALLOC); + pl_free(timer); +} + +static pl_timer vk_timer_create(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_timer timer = pl_alloc_ptr(NULL, timer); + *timer = (struct pl_timer_t) {0}; + + struct VkQueryPoolCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = QUERY_POOL_SIZE, + }; + + VK(vk->CreateQueryPool(vk->dev, &qinfo, PL_VK_ALLOC, &timer->qpool)); + return timer; + +error: + timer_destroy_cb(gpu, timer); + return NULL; +} + +static void vk_timer_destroy(pl_gpu gpu, pl_timer timer) +{ + vk_gpu_idle_callback(gpu, (vk_cb) timer_destroy_cb, gpu, timer); +} + +static uint64_t vk_timer_query(pl_gpu gpu, pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + if (timer->index_read == timer->index_write) + return 0; // no more unprocessed results + + vk_poll_commands(vk, 0); + if (timer->pending & timer_bit(timer->index_read)) + return 0; // still waiting for results + + VkResult res; + uint64_t ts[2] = {0}; + res = vk->GetQueryPoolResults(vk->dev, timer->qpool, timer->index_read, 2, + sizeof(ts), &ts[0], sizeof(uint64_t), + VK_QUERY_RESULT_64_BIT); + + switch (res) { + case VK_SUCCESS: + timer->index_read = 
(timer->index_read + 2) % QUERY_POOL_SIZE; + return (ts[1] - ts[0]) * vk->props.limits.timestampPeriod; + case VK_NOT_READY: + return 0; + default: + PL_VK_ASSERT(res, "Retrieving query pool results"); + } + +error: + return 0; +} + +static void timer_begin(pl_gpu gpu, struct vk_cmd *cmd, pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + if (!timer) + return; + + if (!cmd->pool->props.timestampValidBits) { + PL_TRACE(gpu, "QF %d does not support timestamp queries", cmd->pool->qf); + return; + } + + vk_poll_commands(vk, 0); + if (timer->pending & timer_bit(timer->index_write)) + return; // next query is still running, skip this timer + + VkQueueFlags reset_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT; + if (cmd->pool->props.queueFlags & reset_flags) { + // Use direct command buffer resets + vk->CmdResetQueryPool(cmd->buf, timer->qpool, timer->index_write, 2); + } else { + // Use host query reset + vk->ResetQueryPool(vk->dev, timer->qpool, timer->index_write, 2); + } + + vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + timer->qpool, timer->index_write); + + p->cmd_timer = timer; +} + +static inline bool supports_marks(struct vk_cmd *cmd) { + // Spec says debug markers are only available on graphics/compute queues + VkQueueFlags flags = cmd->pool->props.queueFlags; + return flags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT); +} + +struct vk_cmd *_begin_cmd(pl_gpu gpu, enum queue_type type, const char *label, + pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_mutex_lock(&p->recording); + + struct vk_cmdpool *pool; + switch (type) { + case ANY: pool = p->cmd ? p->cmd->pool : vk->pool_graphics; break; + case GRAPHICS: pool = vk->pool_graphics; break; + case COMPUTE: pool = vk->pool_compute; break; + case TRANSFER: pool = vk->pool_transfer; break; + default: pl_unreachable(); + } + + if (!p->cmd || p->cmd->pool != pool) { + vk_cmd_submit(&p->cmd); + p->cmd = vk_cmd_begin(pool, label); + if (!p->cmd) { + pl_mutex_unlock(&p->recording); + return NULL; + } + } + + if (vk->CmdBeginDebugUtilsLabelEXT && supports_marks(p->cmd)) { + vk->CmdBeginDebugUtilsLabelEXT(p->cmd->buf, &(VkDebugUtilsLabelEXT) { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, + .pLabelName = label, + }); + } + + timer_begin(gpu, p->cmd, timer); + return p->cmd; +} + +static void timer_end_cb(void *ptimer, void *pindex) +{ + pl_timer timer = ptimer; + int index = (uintptr_t) pindex; + timer->pending &= ~timer_bit(index); +} + +bool _end_cmd(pl_gpu gpu, struct vk_cmd **pcmd, bool submit) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + bool ret = true; + if (!pcmd) { + if (submit) { + pl_mutex_lock(&p->recording); + ret = vk_cmd_submit(&p->cmd); + pl_mutex_unlock(&p->recording); + } + return ret; + } + + struct vk_cmd *cmd = *pcmd; + pl_assert(p->cmd == cmd); + + if (p->cmd_timer) { + pl_timer timer = p->cmd_timer; + vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + timer->qpool, timer->index_write + 1); + + timer->pending |= timer_bit(timer->index_write); + vk_cmd_callback(cmd, (vk_cb) timer_end_cb, timer, + (void *) (uintptr_t) timer->index_write); + + timer->index_write = (timer->index_write + 2) % QUERY_POOL_SIZE; + if (timer->index_write == timer->index_read) { + // forcibly drop the least recent result to make space + timer->index_read = (timer->index_read + 2) % QUERY_POOL_SIZE; + } + + p->cmd_timer = NULL; + } + + if (vk->CmdEndDebugUtilsLabelEXT && supports_marks(cmd)) + 
vk->CmdEndDebugUtilsLabelEXT(cmd->buf); + + if (submit) + ret = vk_cmd_submit(&p->cmd); + + pl_mutex_unlock(&p->recording); + return ret; +} + +void vk_gpu_idle_callback(pl_gpu gpu, vk_cb cb, const void *priv, const void *arg) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_mutex_lock(&p->recording); + if (p->cmd) { + vk_cmd_callback(p->cmd, cb, priv, arg); + } else { + vk_dev_callback(vk, cb, priv, arg); + } + pl_mutex_unlock(&p->recording); +} + +static void vk_gpu_destroy(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + vk_cmd_submit(&p->cmd); + vk_wait_idle(vk); + + for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) { + for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++) + vk->DestroySampler(vk->dev, p->samplers[s][a], PL_VK_ALLOC); + } + + pl_spirv_destroy(&p->spirv); + pl_mutex_destroy(&p->recording); + pl_free((void *) gpu); +} + +pl_vulkan pl_vulkan_get(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->destroy == vk_gpu_destroy) { + struct pl_vk *p = (struct pl_vk *) impl; + return p->vk->vulkan; + } + + return NULL; +} + +static pl_handle_caps vk_sync_handle_caps(struct vk_ctx *vk) +{ + pl_handle_caps caps = 0; + + for (int i = 0; vk_sync_handle_list[i]; i++) { + enum pl_handle_type type = vk_sync_handle_list[i]; + + VkPhysicalDeviceExternalSemaphoreInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR, + .handleType = vk_sync_handle_type(type), + }; + + VkExternalSemaphoreProperties props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR, + }; + + vk->GetPhysicalDeviceExternalSemaphoreProperties(vk->physd, &info, &props); + VkExternalSemaphoreFeatureFlags flags = props.externalSemaphoreFeatures; + if ((props.compatibleHandleTypes & info.handleType) && + (flags & VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR)) + { + caps |= type; + } + } + + return caps; +} + +static pl_handle_caps vk_tex_handle_caps(struct vk_ctx *vk, bool import) +{ + pl_handle_caps caps = 0; + + for (int i = 0; vk_mem_handle_list[i]; i++) { + enum pl_handle_type handle_type = vk_mem_handle_list[i]; + if (handle_type == PL_HANDLE_DMA_BUF && !vk->GetImageDrmFormatModifierPropertiesEXT) { + PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: no DRM modifiers", + vk_handle_name(vk_mem_handle_type(PL_HANDLE_DMA_BUF)), + (unsigned int) PL_HANDLE_DMA_BUF); + continue; + } + + // Query whether creation of a "basic" dummy texture would work + VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT, + .drmFormatModifier = DRM_FORMAT_MOD_LINEAR, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + + VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR, + .handleType = vk_mem_handle_type(handle_type), + }; + + VkPhysicalDeviceImageFormatInfo2KHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR, + .pNext = &ext_pinfo, + .format = VK_FORMAT_R8_UNORM, + .type = VK_IMAGE_TYPE_2D, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, + }; + + if (handle_type == PL_HANDLE_DMA_BUF) { + vk_link_struct(&pinfo, &drm_pinfo); + pinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; + } + + VkExternalImageFormatPropertiesKHR ext_props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR, + }; + + VkImageFormatProperties2KHR props = 
{ + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR, + .pNext = &ext_props, + }; + + VkResult res; + res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props); + if (res != VK_SUCCESS) { + PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: %s", + vk_handle_name(ext_pinfo.handleType), + (unsigned int) handle_type, + vk_res_str(res)); + continue; + } + + if (vk_external_mem_check(vk, &ext_props.externalMemoryProperties, + handle_type, import)) + { + caps |= handle_type; + } + } + +#ifdef VK_EXT_metal_objects + if (vk->ExportMetalObjectsEXT && import) + caps |= PL_HANDLE_MTL_TEX | PL_HANDLE_IOSURFACE; +#endif + + return caps; +} + +static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = { + [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST, + [PL_TEX_SAMPLE_LINEAR] = VK_FILTER_LINEAR, +}; + +static inline struct pl_spirv_version get_spirv_version(const struct vk_ctx *vk) +{ + if (vk->api_ver >= VK_API_VERSION_1_3) { + const VkPhysicalDeviceMaintenance4Features *device_maintenance4; + device_maintenance4 = vk_find_struct(&vk->features, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES); + + if (device_maintenance4 && device_maintenance4->maintenance4) { + return (struct pl_spirv_version) { + .env_version = VK_API_VERSION_1_3, + .spv_version = PL_SPV_VERSION(1, 6), + }; + } + } + + pl_assert(vk->api_ver >= VK_API_VERSION_1_2); + return (struct pl_spirv_version) { + .env_version = VK_API_VERSION_1_2, + .spv_version = PL_SPV_VERSION(1, 5), + }; +} + +static const struct pl_gpu_fns pl_fns_vk; + +pl_gpu pl_gpu_create_vk(struct vk_ctx *vk) +{ + pl_assert(vk->dev); + + struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_vk); + gpu->log = vk->log; + + struct pl_vk *p = PL_PRIV(gpu); + pl_mutex_init(&p->recording); + p->vk = vk; + p->impl = pl_fns_vk; + p->spirv = pl_spirv_create(vk->log, get_spirv_version(vk)); + if (!p->spirv) + goto error; + + // Query all device properties + VkPhysicalDevicePCIBusInfoPropertiesEXT pci_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT, + }; + + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + .pNext = &pci_props, + }; + + VkPhysicalDevicePushDescriptorPropertiesKHR pushd_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR, + .pNext = &id_props, + }; + + VkPhysicalDeviceSubgroupProperties group_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES, + .pNext = &pushd_props, + }; + + VkPhysicalDeviceExternalMemoryHostPropertiesEXT host_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT, + .pNext = &group_props, + }; + + VkPhysicalDeviceProperties2KHR props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &host_props, + }; + + bool is_portability = false; + +#ifdef VK_KHR_portability_subset + VkPhysicalDevicePortabilitySubsetPropertiesKHR port_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PORTABILITY_SUBSET_PROPERTIES_KHR, + .minVertexInputBindingStrideAlignment = 1, + }; + + for (int i = 0; i < vk->exts.num; i++) { + if (!strcmp(vk->exts.elem[i], VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME)) { + vk_link_struct(&props, &port_props); + is_portability = true; + break; + } + } +#endif + + vk->GetPhysicalDeviceProperties2(vk->physd, &props); + VkPhysicalDeviceLimits limits = props.properties.limits; + + // Determine GLSL features and limits + gpu->glsl = (struct pl_glsl_version) { + .version = 450, + .vulkan = 
true, + .compute = true, + .max_shmem_size = limits.maxComputeSharedMemorySize, + .max_group_threads = limits.maxComputeWorkGroupInvocations, + .max_group_size = { + limits.maxComputeWorkGroupSize[0], + limits.maxComputeWorkGroupSize[1], + limits.maxComputeWorkGroupSize[2], + }, + }; + + VkShaderStageFlags req_stages = VK_SHADER_STAGE_FRAGMENT_BIT | + VK_SHADER_STAGE_COMPUTE_BIT; + VkSubgroupFeatureFlags req_flags = VK_SUBGROUP_FEATURE_BASIC_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | + VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT; + + if ((group_props.supportedStages & req_stages) == req_stages && + (group_props.supportedOperations & req_flags) == req_flags) + { + gpu->glsl.subgroup_size = group_props.subgroupSize; + } + + if (vk->features.features.shaderImageGatherExtended) { + gpu->glsl.min_gather_offset = limits.minTexelGatherOffset; + gpu->glsl.max_gather_offset = limits.maxTexelGatherOffset; + } + + const size_t max_size = vk_malloc_avail(vk->ma, 0); + gpu->limits = (struct pl_gpu_limits) { + // pl_gpu + .thread_safe = true, + .callbacks = true, + // pl_buf + .max_buf_size = max_size, + .max_ubo_size = PL_MIN(limits.maxUniformBufferRange, max_size), + .max_ssbo_size = PL_MIN(limits.maxStorageBufferRange, max_size), + .max_vbo_size = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT), + .max_mapped_size = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT), + .max_buffer_texels = PL_MIN(limits.maxTexelBufferElements, max_size), + .align_host_ptr = host_props.minImportedHostPointerAlignment, + .host_cached = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_CACHED_BIT), + // pl_tex + .max_tex_1d_dim = limits.maxImageDimension1D, + .max_tex_2d_dim = limits.maxImageDimension2D, + .max_tex_3d_dim = limits.maxImageDimension3D, + .blittable_1d_3d = true, + .buf_transfer = true, + .align_tex_xfer_pitch = limits.optimalBufferCopyRowPitchAlignment, + .align_tex_xfer_offset = pl_lcm(limits.optimalBufferCopyOffsetAlignment, 4), + // pl_pass + .max_variable_comps = 0, // vulkan doesn't support these at all + .max_constants = SIZE_MAX, + .array_size_constants = !is_portability, + .max_pushc_size = limits.maxPushConstantsSize, +#ifdef VK_KHR_portability_subset + .align_vertex_stride = port_props.minVertexInputBindingStrideAlignment, +#else + .align_vertex_stride = 1, +#endif + .max_dispatch = { + limits.maxComputeWorkGroupCount[0], + limits.maxComputeWorkGroupCount[1], + limits.maxComputeWorkGroupCount[2], + }, + .fragment_queues = vk->pool_graphics->num_queues, + .compute_queues = vk->pool_compute->num_queues, + }; + + gpu->export_caps.buf = vk_malloc_handle_caps(vk->ma, false); + gpu->import_caps.buf = vk_malloc_handle_caps(vk->ma, true); + gpu->export_caps.tex = vk_tex_handle_caps(vk, false); + gpu->import_caps.tex = vk_tex_handle_caps(vk, true); + gpu->export_caps.sync = vk_sync_handle_caps(vk); + gpu->import_caps.sync = 0; // Not supported yet + + if (pl_gpu_supports_interop(gpu)) { + pl_static_assert(sizeof(gpu->uuid) == VK_UUID_SIZE); + memcpy(gpu->uuid, id_props.deviceUUID, sizeof(gpu->uuid)); + + gpu->pci.domain = pci_props.pciDomain; + gpu->pci.bus = pci_props.pciBus; + gpu->pci.device = pci_props.pciDevice; + gpu->pci.function = pci_props.pciFunction; + } + + if (vk->CmdPushDescriptorSetKHR) + p->max_push_descriptors = pushd_props.maxPushDescriptors; + + vk_setup_formats(gpu); + + // Compute the correct minimum texture alignment + p->min_texel_alignment = 1; + for (int i = 0; i < gpu->num_formats; i++) { + if 
(gpu->formats[i]->emulated || gpu->formats[i]->opaque) + continue; + size_t texel_size = gpu->formats[i]->texel_size; + p->min_texel_alignment = pl_lcm(p->min_texel_alignment, texel_size); + } + PL_DEBUG(gpu, "Minimum texel alignment: %zu", p->min_texel_alignment); + + // Initialize the samplers + for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) { + for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++) { + static const VkSamplerAddressMode modes[PL_TEX_ADDRESS_MODE_COUNT] = { + [PL_TEX_ADDRESS_CLAMP] = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + [PL_TEX_ADDRESS_REPEAT] = VK_SAMPLER_ADDRESS_MODE_REPEAT, + [PL_TEX_ADDRESS_MIRROR] = VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT, + }; + + VkSamplerCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .magFilter = filters[s], + .minFilter = filters[s], + .addressModeU = modes[a], + .addressModeV = modes[a], + .addressModeW = modes[a], + .maxAnisotropy = 1.0, + }; + + VK(vk->CreateSampler(vk->dev, &sinfo, PL_VK_ALLOC, &p->samplers[s][a])); + } + } + + return pl_gpu_finalize(gpu); + +error: + vk_gpu_destroy(gpu); + return NULL; +} + +static void vk_sync_destroy(pl_gpu gpu, pl_sync sync) +{ + if (!sync) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + +#ifdef PL_HAVE_UNIX + if (sync->handle_type == PL_HANDLE_FD) { + if (sync->wait_handle.fd > -1) + close(sync->wait_handle.fd); + if (sync->signal_handle.fd > -1) + close(sync->signal_handle.fd); + } +#endif +#ifdef PL_HAVE_WIN32 + if (sync->handle_type == PL_HANDLE_WIN32) { + if (sync->wait_handle.handle != NULL) + CloseHandle(sync->wait_handle.handle); + if (sync->signal_handle.handle != NULL) + CloseHandle(sync->signal_handle.handle); + } + // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed. 
+#endif + + vk->DestroySemaphore(vk->dev, sync_vk->wait, PL_VK_ALLOC); + vk->DestroySemaphore(vk->dev, sync_vk->signal, PL_VK_ALLOC); + + pl_free((void *) sync); +} + +void vk_sync_deref(pl_gpu gpu, pl_sync sync) +{ + if (!sync) + return; + + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + if (pl_rc_deref(&sync_vk->rc)) + vk_sync_destroy(gpu, sync); +} + +static pl_sync vk_sync_create(pl_gpu gpu, enum pl_handle_type handle_type) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + struct pl_sync_t *sync = pl_zalloc_obj(NULL, sync, struct pl_sync_vk); + sync->handle_type = handle_type; + + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + pl_rc_init(&sync_vk->rc); + + VkExportSemaphoreCreateInfoKHR einfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR, + .handleTypes = vk_sync_handle_type(handle_type), + }; + + switch (handle_type) { + case PL_HANDLE_FD: + sync->wait_handle.fd = -1; + sync->signal_handle.fd = -1; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + sync->wait_handle.handle = NULL; + sync->signal_handle.handle = NULL; + break; + case PL_HANDLE_DMA_BUF: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + pl_unreachable(); + } + + const VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &einfo, + }; + + VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->wait)); + VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->signal)); + PL_VK_NAME(SEMAPHORE, sync_vk->wait, "sync wait"); + PL_VK_NAME(SEMAPHORE, sync_vk->signal, "sync signal"); + +#ifdef PL_HAVE_UNIX + if (handle_type == PL_HANDLE_FD) { + VkSemaphoreGetFdInfoKHR finfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, + .semaphore = sync_vk->wait, + .handleType = einfo.handleTypes, + }; + + VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->wait_handle.fd)); + + finfo.semaphore = sync_vk->signal; + VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->signal_handle.fd)); + } +#endif + +#ifdef PL_HAVE_WIN32 + if (handle_type == PL_HANDLE_WIN32 || + handle_type == PL_HANDLE_WIN32_KMT) + { + VkSemaphoreGetWin32HandleInfoKHR handle_info = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = sync_vk->wait, + .handleType = einfo.handleTypes, + }; + + VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info, + &sync->wait_handle.handle)); + + handle_info.semaphore = sync_vk->signal; + VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info, + &sync->signal_handle.handle)); + } +#endif + + return sync; + +error: + vk_sync_destroy(gpu, sync); + return NULL; +} + +void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore) +{ + VkSemaphore sem = *semaphore; + if (!sem) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + vk->DestroySemaphore(vk->dev, sem, PL_VK_ALLOC); + *semaphore = VK_NULL_HANDLE; +} + +VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_assert(PL_ISPOT(params->export_handle)); + if ((params->export_handle & gpu->export_caps.sync) != params->export_handle) { + PL_ERR(gpu, "Invalid handle type 0x%"PRIx64" specified for " + "`pl_vulkan_sem_create`!", (uint64_t) params->export_handle); + return VK_NULL_HANDLE; + } + + switch (params->export_handle) { + case PL_HANDLE_FD: + params->out_handle->fd = -1; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + params->out_handle->handle = NULL; + break; + case 
PL_HANDLE_DMA_BUF: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + pl_unreachable(); + } + + const VkExportSemaphoreCreateInfoKHR einfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR, + .handleTypes = vk_sync_handle_type(params->export_handle), + }; + + const VkSemaphoreTypeCreateInfo stinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, + .pNext = params->export_handle ? &einfo : NULL, + .semaphoreType = params->type, + .initialValue = params->initial_value, + }; + + const VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &stinfo, + }; + + VkSemaphore sem = VK_NULL_HANDLE; + VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sem)); + PL_VK_NAME(SEMAPHORE, sem, PL_DEF(params->debug_tag, "pl_vulkan_sem")); + +#ifdef PL_HAVE_UNIX + if (params->export_handle == PL_HANDLE_FD) { + VkSemaphoreGetFdInfoKHR finfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, + .handleType = einfo.handleTypes, + .semaphore = sem, + }; + + VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &params->out_handle->fd)); + } +#endif + +#ifdef PL_HAVE_WIN32 + if (params->export_handle == PL_HANDLE_WIN32 || + params->export_handle == PL_HANDLE_WIN32_KMT) + { + VkSemaphoreGetWin32HandleInfoKHR handle_info = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .handleType = einfo.handleTypes, + .semaphore = sem, + }; + + VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info, + &params->out_handle->handle)); + } +#endif + + return sem; + +error: +#ifdef PL_HAVE_UNIX + if (params->export_handle == PL_HANDLE_FD) { + if (params->out_handle->fd > -1) + close(params->out_handle->fd); + } +#endif +#ifdef PL_HAVE_WIN32 + if (params->export_handle == PL_HANDLE_WIN32) { + if (params->out_handle->handle != NULL) + CloseHandle(params->out_handle->handle); + } + // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed. 
+#endif + vk->DestroySemaphore(vk->dev, sem, PL_VK_ALLOC); + return VK_NULL_HANDLE; +} + +static void vk_gpu_flush(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + CMD_SUBMIT(NULL); + vk_rotate_queues(vk); + vk_malloc_garbage_collect(vk->ma); +} + +static void vk_gpu_finish(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + CMD_SUBMIT(NULL); + vk_wait_idle(vk); +} + +static bool vk_gpu_is_failed(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + return vk->failed; +} + +struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_mutex_lock(&p->recording); + struct vk_cmd *cmd = p->cmd; + p->cmd = NULL; + pl_mutex_unlock(&p->recording); + + struct vk_cmdpool *pool = vk->pool_graphics; + if (!cmd || cmd->pool != pool) { + vk_cmd_submit(&cmd); + cmd = vk_cmd_begin(pool, NULL); + } + + return cmd; +} + +void pl_vk_print_heap(pl_gpu gpu, enum pl_log_level lev) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + vk_malloc_print_stats(vk->ma, lev); +} + +static const struct pl_gpu_fns pl_fns_vk = { + .destroy = vk_gpu_destroy, + .tex_create = vk_tex_create, + .tex_destroy = vk_tex_deref, + .tex_invalidate = vk_tex_invalidate, + .tex_clear_ex = vk_tex_clear_ex, + .tex_blit = vk_tex_blit, + .tex_upload = vk_tex_upload, + .tex_download = vk_tex_download, + .tex_poll = vk_tex_poll, + .tex_export = vk_tex_export, + .buf_create = vk_buf_create, + .buf_destroy = vk_buf_deref, + .buf_write = vk_buf_write, + .buf_read = vk_buf_read, + .buf_copy = vk_buf_copy, + .buf_export = vk_buf_export, + .buf_poll = vk_buf_poll, + .desc_namespace = vk_desc_namespace, + .pass_create = vk_pass_create, + .pass_destroy = vk_pass_destroy, + .pass_run = vk_pass_run, + .sync_create = vk_sync_create, + .sync_destroy = vk_sync_deref, + .timer_create = vk_timer_create, + .timer_destroy = vk_timer_destroy, + .timer_query = vk_timer_query, + .gpu_flush = vk_gpu_flush, + .gpu_finish = vk_gpu_finish, + .gpu_is_failed = vk_gpu_is_failed, +}; diff --git a/src/vulkan/gpu.h b/src/vulkan/gpu.h new file mode 100644 index 0000000..041de13 --- /dev/null +++ b/src/vulkan/gpu.h @@ -0,0 +1,175 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" +#include "command.h" +#include "formats.h" +#include "malloc.h" +#include "utils.h" + +#include "../gpu.h" +#include "../glsl/spirv.h" +#include "../pl_thread.h" + +pl_gpu pl_gpu_create_vk(struct vk_ctx *vk); + +// This function takes the current graphics command and steals it from the +// GPU, so the caller can do custom vk_cmd_ calls on it. The caller should +// submit it as well. 
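+//
+// Rough usage sketch (illustrative only; assumes the caller has access to the
+// internal vk_cmd_* helpers from command.h):
+//
+//     struct vk_cmd *cmd = pl_vk_steal_cmd(gpu);
+//     // ... record custom vkCmd* calls on cmd->buf ...
+//     vk_cmd_submit(&cmd);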
+struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu); + +// Print memory usage statistics +void pl_vk_print_heap(pl_gpu, enum pl_log_level); + +// --- pl_gpu internal structs and helpers + +struct pl_fmt_vk { + const struct vk_format *vk_fmt; + bool blit_emulated; +}; + +enum queue_type { + GRAPHICS, + COMPUTE, + TRANSFER, + ANY, +}; + +struct pl_vk { + struct pl_gpu_fns impl; + struct vk_ctx *vk; + pl_spirv spirv; + + // Some additional cached device limits and features checks + uint32_t max_push_descriptors; + size_t min_texel_alignment; + + // The "currently recording" command. This will be queued and replaced by + // a new command every time we need to "switch" between queue families. + pl_mutex recording; + struct vk_cmd *cmd; + pl_timer cmd_timer; + + // Array of VkSamplers for every combination of sample/address modes + VkSampler samplers[PL_TEX_SAMPLE_MODE_COUNT][PL_TEX_ADDRESS_MODE_COUNT]; + + // To avoid spamming warnings + bool warned_modless; +}; + +struct vk_cmd *_begin_cmd(pl_gpu, enum queue_type, const char *label, pl_timer); +bool _end_cmd(pl_gpu, struct vk_cmd **, bool submit); + +#define CMD_BEGIN(type) _begin_cmd(gpu, type, __func__, NULL) +#define CMD_BEGIN_TIMED(type, timer) _begin_cmd(gpu, type, __func__, timer) +#define CMD_FINISH(cmd) _end_cmd(gpu, cmd, false) +#define CMD_SUBMIT(cmd) _end_cmd(gpu, cmd, true) + +// Helper to fire a callback the next time the `pl_gpu` is in an idle state +// +// Use this instead of `vk_dev_callback` when you need to clean up after +// resources that might possibly still be in use by the `pl_gpu` at the time of +// creating the callback. +void vk_gpu_idle_callback(pl_gpu, vk_cb, const void *priv, const void *arg); + +struct pl_tex_vk { + pl_rc_t rc; + bool external_img; + enum queue_type transfer_queue; + VkImageType type; + VkImage img; + VkImageAspectFlags aspect; + struct vk_memslice mem; + // cached properties + VkFormat img_fmt; + VkImageUsageFlags usage_flags; + // for sampling + VkImageView view; + // for rendering + VkFramebuffer framebuffer; + // for vk_tex_upload/download fallback code + pl_fmt texel_fmt; + // for planar textures (as a convenience) + int num_planes; + struct pl_tex_vk *planes[4]; + + // synchronization and current state (planes only) + struct vk_sem sem; + VkImageLayout layout; + PL_ARRAY(pl_vulkan_sem) ext_deps; // external semaphore, not owned by the pl_tex + pl_sync ext_sync; // indicates an exported image + uint32_t qf; // last queue family to access this texture (for barriers) + bool may_invalidate; + bool held; +}; + +pl_tex vk_tex_create(pl_gpu, const struct pl_tex_params *); +void vk_tex_deref(pl_gpu, pl_tex); +void vk_tex_invalidate(pl_gpu, pl_tex); +void vk_tex_clear_ex(pl_gpu, pl_tex, const union pl_clear_color); +void vk_tex_blit(pl_gpu, const struct pl_tex_blit_params *); +bool vk_tex_upload(pl_gpu, const struct pl_tex_transfer_params *); +bool vk_tex_download(pl_gpu, const struct pl_tex_transfer_params *); +bool vk_tex_poll(pl_gpu, pl_tex, uint64_t timeout); +bool vk_tex_export(pl_gpu, pl_tex, pl_sync); +void vk_tex_barrier(pl_gpu, struct vk_cmd *, pl_tex, VkPipelineStageFlags2, + VkAccessFlags2, VkImageLayout, uint32_t qf); + +struct pl_buf_vk { + pl_rc_t rc; + struct vk_memslice mem; + enum queue_type update_queue; + VkBufferView view; // for texel buffers + + // synchronization and current state + struct vk_sem sem; + bool exported; + bool needs_flush; +}; + +pl_buf vk_buf_create(pl_gpu, const struct pl_buf_params *); +void vk_buf_deref(pl_gpu, pl_buf); +void vk_buf_write(pl_gpu, pl_buf, size_t 
offset, const void *src, size_t size); +bool vk_buf_read(pl_gpu, pl_buf, size_t offset, void *dst, size_t size); +void vk_buf_copy(pl_gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size); +bool vk_buf_export(pl_gpu, pl_buf); +bool vk_buf_poll(pl_gpu, pl_buf, uint64_t timeout); + +// Helper to ease buffer barrier creation. (`offset` is relative to pl_buf) +void vk_buf_barrier(pl_gpu, struct vk_cmd *, pl_buf, VkPipelineStageFlags2, + VkAccessFlags2, size_t offset, size_t size, bool export); + +// Flush visible writes to a buffer made by the API +void vk_buf_flush(pl_gpu, struct vk_cmd *, pl_buf, size_t offset, size_t size); + +struct pl_pass_vk; + +int vk_desc_namespace(pl_gpu, enum pl_desc_type); +pl_pass vk_pass_create(pl_gpu, const struct pl_pass_params *); +void vk_pass_destroy(pl_gpu, pl_pass); +void vk_pass_run(pl_gpu, const struct pl_pass_run_params *); + +struct pl_sync_vk { + pl_rc_t rc; + VkSemaphore wait; + VkSemaphore signal; +}; + +void vk_sync_deref(pl_gpu, pl_sync); diff --git a/src/vulkan/gpu_buf.c b/src/vulkan/gpu_buf.c new file mode 100644 index 0000000..2f317bc --- /dev/null +++ b/src/vulkan/gpu_buf.c @@ -0,0 +1,470 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" + +void vk_buf_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf, + VkPipelineStageFlags2 stage, VkAccessFlags2 access, + size_t offset, size_t size, bool export) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_assert(!export || !buf_vk->exported); // can't re-export exported buffers + pl_rc_ref(&buf_vk->rc); + + bool needs_flush = buf_vk->needs_flush || buf->params.host_mapped || + buf->params.import_handle == PL_HANDLE_HOST_PTR; + bool noncoherent = buf_vk->mem.data && !buf_vk->mem.coherent; + if (needs_flush && noncoherent) { + VK(vk->FlushMappedMemoryRanges(vk->dev, 1, &(struct VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = buf_vk->mem.vkmem, + .offset = buf_vk->mem.map_offset, + .size = buf_vk->mem.map_size, + })); + + // Just ignore errors, not much we can do about them other than + // logging them and moving on... + error: ; + } + + struct vk_sync_scope last; + last = vk_sem_barrier(cmd, &buf_vk->sem, stage, access, export); + + // CONCURRENT buffers require transitioning to/from IGNORED, EXCLUSIVE + // buffers require transitioning to/from the concrete QF index + uint32_t qf = vk->pools.num > 1 ? VK_QUEUE_FAMILY_IGNORED : cmd->pool->qf; + uint32_t src_qf = buf_vk->exported ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf; + uint32_t dst_qf = export ? 
VK_QUEUE_FAMILY_EXTERNAL_KHR : qf; + + if (last.access || src_qf != dst_qf) { + vk_cmd_barrier(cmd, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = last.stage, + .srcAccessMask = last.access, + .dstStageMask = stage, + .dstAccessMask = access, + .srcQueueFamilyIndex = src_qf, + .dstQueueFamilyIndex = dst_qf, + .buffer = buf_vk->mem.buf, + .offset = buf_vk->mem.offset + offset, + .size = size, + }, + }); + } + + buf_vk->needs_flush = false; + buf_vk->exported = export; + vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, gpu, buf); +} + +void vk_buf_deref(pl_gpu gpu, pl_buf buf) +{ + if (!buf) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + if (pl_rc_deref(&buf_vk->rc)) { + vk->DestroyBufferView(vk->dev, buf_vk->view, PL_VK_ALLOC); + vk_malloc_free(vk->ma, &buf_vk->mem); + pl_free((void *) buf); + } +} + +pl_buf vk_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_vk); + buf->params = *params; + buf->params.initial_data = NULL; + + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_rc_init(&buf_vk->rc); + + struct vk_malloc_params mparams = { + .reqs = { + .size = PL_ALIGN2(params->size, 4), // for vk_buf_write + .memoryTypeBits = UINT32_MAX, + .alignment = 1, + }, + // these are always set, because `vk_buf_copy` can always be used + .buf_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .export_handle = params->export_handle, + .import_handle = params->import_handle, + .shared_mem = params->shared_mem, + .debug_tag = params->debug_tag, + }; + + // Mandatory/optimal buffer offset alignment + VkDeviceSize *align = &mparams.reqs.alignment; + VkDeviceSize extra_align = vk->props.limits.optimalBufferCopyOffsetAlignment; + + // Try and align all buffers to the minimum texel alignment, to make sure + // tex_upload/tex_download always gets aligned buffer copies if possible + extra_align = pl_lcm(extra_align, p->min_texel_alignment); + + enum pl_buf_mem_type mem_type = params->memory_type; + bool is_texel = false; + + if (params->uniform) { + mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; + *align = pl_lcm(*align, vk->props.limits.minUniformBufferOffsetAlignment); + mem_type = PL_BUF_MEM_DEVICE; + if (params->format) { + mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT; + is_texel = true; + } + } + + if (params->storable) { + mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + *align = pl_lcm(*align, vk->props.limits.minStorageBufferOffsetAlignment); + buf_vk->update_queue = COMPUTE; + mem_type = PL_BUF_MEM_DEVICE; + if (params->format) { + mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT; + is_texel = true; + } + } + + if (is_texel) { + *align = pl_lcm(*align, vk->props.limits.minTexelBufferOffsetAlignment); + *align = pl_lcm(*align, params->format->texel_size); + } + + if (params->drawable) { + mparams.buf_usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | + VK_BUFFER_USAGE_INDEX_BUFFER_BIT; + mem_type = PL_BUF_MEM_DEVICE; + } + + if (params->host_writable || params->initial_data) { + // Buffers should be written using mapped memory if possible + mparams.optimal = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + // Use the transfer queue for updates on very large 
buffers (1 MB) + if (params->size > 1024*1024) + buf_vk->update_queue = TRANSFER; + } + + if (params->host_mapped || params->host_readable) { + mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + + if (params->size > 1024) { + // Prefer cached memory for large buffers (1 kB) which may be read + // from, because uncached reads are extremely slow + mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + } + } + + switch (mem_type) { + case PL_BUF_MEM_AUTO: + // We generally prefer VRAM since it's faster than RAM, but any number + // of other requirements could potentially exclude it, so just mark it + // as optimal by default. + if (!(mparams.optimal & VK_MEMORY_PROPERTY_HOST_CACHED_BIT)) + mparams.optimal |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + case PL_BUF_MEM_DEVICE: + // Force device local memory. + mparams.required |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + case PL_BUF_MEM_HOST: + // This isn't a true guarantee, but actually trying to restrict the + // device-local bit locks out all memory heaps on iGPUs. Requiring + // the memory be host-mapped is the easiest compromise. + mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + break; + case PL_BUF_MEM_TYPE_COUNT: + pl_unreachable(); + } + + if (params->import_handle) { + size_t offset = params->shared_mem.offset; + if (PL_ALIGN(offset, *align) != offset) { + PL_ERR(gpu, "Imported memory offset %zu violates minimum alignment " + "requirement of enabled usage flags (%zu)!", + offset, (size_t) *align); + goto error; + } + } else { + *align = pl_lcm(*align, extra_align); + } + + if (!vk_malloc_slice(vk->ma, &buf_vk->mem, &mparams)) + goto error; + + if (params->host_mapped) + buf->data = buf_vk->mem.data; + + if (params->export_handle) { + buf->shared_mem = buf_vk->mem.shared_mem; + buf->shared_mem.drm_format_mod = DRM_FORMAT_MOD_LINEAR; + buf_vk->exported = true; + } + + if (is_texel) { + struct pl_fmt_vk *fmtp = PL_PRIV(params->format); + VkBufferViewCreateInfo vinfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .buffer = buf_vk->mem.buf, + .format = PL_DEF(fmtp->vk_fmt->bfmt, fmtp->vk_fmt->tfmt), + .offset = buf_vk->mem.offset, + .range = buf_vk->mem.size, + }; + + VK(vk->CreateBufferView(vk->dev, &vinfo, PL_VK_ALLOC, &buf_vk->view)); + PL_VK_NAME(BUFFER_VIEW, buf_vk->view, PL_DEF(params->debug_tag, "texel")); + } + + if (params->initial_data) + vk_buf_write(gpu, buf, 0, params->initial_data, params->size); + + return buf; + +error: + vk_buf_deref(gpu, buf); + return NULL; +} + +static void invalidate_buf(pl_gpu gpu, pl_buf buf) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + if (buf_vk->mem.data && !buf_vk->mem.coherent) { + VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = buf_vk->mem.vkmem, + .offset = buf_vk->mem.map_offset, + .size = buf_vk->mem.map_size, + })); + } + + // Ignore errors (after logging), nothing useful we can do anyway +error: ; + vk_buf_deref(gpu, buf); +} + +void vk_buf_flush(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf, + size_t offset, size_t size) +{ + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + // We need to perform a flush if the host is capable of reading back from + // the buffer, or if we intend to overwrite it using mapped memory + bool can_read = buf->params.host_readable; + bool can_write = buf_vk->mem.data && buf->params.host_writable; + if 
(buf->params.host_mapped || buf->params.import_handle == PL_HANDLE_HOST_PTR)
+        can_read = can_write = true;
+
+    if (!can_read && !can_write)
+        return;
+
+    vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+            .srcStageMask = buf_vk->sem.write.stage,
+            .srcAccessMask = buf_vk->sem.write.access,
+            .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
+            .dstAccessMask = (can_read ? VK_ACCESS_2_HOST_READ_BIT : 0)
+                           | (can_write ? VK_ACCESS_2_HOST_WRITE_BIT : 0),
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .buffer = buf_vk->mem.buf,
+            .offset = buf_vk->mem.offset + offset,
+            .size = size,
+        },
+    });
+
+    // We need to hold on to the buffer until this barrier completes
+    vk_cmd_callback(cmd, (vk_cb) invalidate_buf, gpu, buf);
+    pl_rc_ref(&buf_vk->rc);
+}
+
+bool vk_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+    // Opportunistically check if we can re-use this buffer without flush
+    vk_poll_commands(vk, 0);
+    if (pl_rc_count(&buf_vk->rc) == 1)
+        return false;
+
+    // Otherwise, we're forced to submit any queued command so that the
+    // user is guaranteed to see progress eventually, even if they call
+    // this in a tight loop
+    CMD_SUBMIT(NULL);
+    vk_poll_commands(vk, timeout);
+
+    return pl_rc_count(&buf_vk->rc) > 1;
+}
+
+void vk_buf_write(pl_gpu gpu, pl_buf buf, size_t offset,
+                  const void *data, size_t size)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+    // For host-mapped buffers, we can just directly memcpy the buffer contents.
+    // Otherwise, we can update the buffer from the GPU using a command buffer.
+    if (buf_vk->mem.data) {
+        // ensure no queued operations
+        while (vk_buf_poll(gpu, buf, UINT64_MAX))
+            ; // do nothing
+
+        uintptr_t addr = (uintptr_t) buf_vk->mem.data + offset;
+        memcpy((void *) addr, data, size);
+        buf_vk->needs_flush = true;
+    } else {
+        struct vk_cmd *cmd = CMD_BEGIN(buf_vk->update_queue);
+        if (!cmd) {
+            PL_ERR(gpu, "Failed updating buffer!");
+            return;
+        }
+
+        vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+                       VK_ACCESS_2_TRANSFER_WRITE_BIT, offset, size, false);
+
+        // Vulkan requires `size` to be a multiple of 4, so we need to make
+        // sure to handle the end separately if the original data is not
+        const size_t max_transfer = 64 * 1024;
+        size_t size_rem = size % 4;
+        size_t size_base = size - size_rem;
+        VkDeviceSize buf_offset = buf_vk->mem.offset + offset;
+
+        if (size_base > max_transfer) {
+            PL_TRACE(gpu, "Using multiple vkCmdUpdateBuffer calls to upload "
+                     "large buffer. 
Consider using buffer-buffer transfers " + "instead!"); + } + + for (size_t xfer = 0; xfer < size_base; xfer += max_transfer) { + vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, + buf_offset + xfer, + PL_MIN(size_base, max_transfer), + (void *) ((uint8_t *) data + xfer)); + } + + if (size_rem) { + uint8_t tail[4] = {0}; + memcpy(tail, data, size_rem); + vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, buf_offset + size_base, + sizeof(tail), tail); + } + + pl_assert(!buf->params.host_readable); // no flush needed due to this + CMD_FINISH(&cmd); + } +} + +bool vk_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, size_t size) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_assert(buf_vk->mem.data); + + if (vk_buf_poll(gpu, buf, 0) && buf_vk->sem.write.sync.sem) { + // ensure no more queued writes + VK(vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, + .semaphoreCount = 1, + .pSemaphores = &buf_vk->sem.write.sync.sem, + .pValues = &buf_vk->sem.write.sync.value, + }, UINT64_MAX)); + + // process callbacks + vk_poll_commands(vk, 0); + } + + uintptr_t addr = (uintptr_t) buf_vk->mem.data + (size_t) offset; + memcpy(dest, (void *) addr, size); + return true; + +error: + return false; +} + +void vk_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *dst_vk = PL_PRIV(dst); + struct pl_buf_vk *src_vk = PL_PRIV(src); + + struct vk_cmd *cmd = CMD_BEGIN(dst_vk->update_queue); + if (!cmd) { + PL_ERR(gpu, "Failed copying buffer!"); + return; + } + + vk_buf_barrier(gpu, cmd, dst, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, dst_offset, size, false); + vk_buf_barrier(gpu, cmd, src, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, src_offset, size, false); + + VkBufferCopy region = { + .srcOffset = src_vk->mem.offset + src_offset, + .dstOffset = dst_vk->mem.offset + dst_offset, + .size = size, + }; + + vk->CmdCopyBuffer(cmd->buf, src_vk->mem.buf, dst_vk->mem.buf, + 1, ®ion); + + vk_buf_flush(gpu, cmd, dst, dst_offset, size); + CMD_FINISH(&cmd); +} + +bool vk_buf_export(pl_gpu gpu, pl_buf buf) +{ + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + if (buf_vk->exported) + return true; + + struct vk_cmd *cmd = CMD_BEGIN(ANY); + if (!cmd) { + PL_ERR(gpu, "Failed exporting buffer!"); + return false; + } + + // For the queue family ownership transfer, we can ignore all pipeline + // stages since the synchronization via fences/semaphores is required + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_NONE, 0, 0, + buf->params.size, true); + + + return CMD_SUBMIT(&cmd); +} diff --git a/src/vulkan/gpu_pass.c b/src/vulkan/gpu_pass.c new file mode 100644 index 0000000..5ffe77d --- /dev/null +++ b/src/vulkan/gpu_pass.c @@ -0,0 +1,964 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "cache.h" +#include "glsl/spirv.h" + +// For pl_pass.priv +struct pl_pass_vk { + // Pipeline / render pass + VkPipeline base; + VkPipeline pipe; + VkPipelineLayout pipeLayout; + VkRenderPass renderPass; + // Descriptor set (bindings) + bool use_pushd; + VkDescriptorSetLayout dsLayout; + VkDescriptorPool dsPool; + // To keep track of which descriptor sets are and aren't available, we + // allocate a fixed number and use a bitmask of all available sets. + VkDescriptorSet dss[16]; + uint16_t dmask; + + // For recompilation + VkVertexInputAttributeDescription *attrs; + VkPipelineCache cache; + VkShaderModule vert; + VkShaderModule shader; + + // For updating + VkWriteDescriptorSet *dswrite; + VkDescriptorImageInfo *dsiinfo; + VkDescriptorBufferInfo *dsbinfo; + VkSpecializationInfo specInfo; + size_t spec_size; +}; + +int vk_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + return 0; +} + +static void pass_destroy_cb(pl_gpu gpu, pl_pass pass) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + + vk->DestroyPipeline(vk->dev, pass_vk->pipe, PL_VK_ALLOC); + vk->DestroyPipeline(vk->dev, pass_vk->base, PL_VK_ALLOC); + vk->DestroyRenderPass(vk->dev, pass_vk->renderPass, PL_VK_ALLOC); + vk->DestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, PL_VK_ALLOC); + vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC); + vk->DestroyDescriptorPool(vk->dev, pass_vk->dsPool, PL_VK_ALLOC); + vk->DestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, PL_VK_ALLOC); + vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC); + vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC); + + pl_free((void *) pass); +} + +void vk_pass_destroy(pl_gpu gpu, pl_pass pass) +{ + vk_gpu_idle_callback(gpu, (vk_cb) pass_destroy_cb, gpu, pass); +} + +static const VkDescriptorType dsType[] = { + [PL_DESC_SAMPLED_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + [PL_DESC_STORAGE_IMG] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + [PL_DESC_BUF_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + [PL_DESC_BUF_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + [PL_DESC_BUF_TEXEL_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + [PL_DESC_BUF_TEXEL_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, +}; + +static VkResult vk_compile_glsl(pl_gpu gpu, void *alloc, + enum glsl_shader_stage stage, + const char *shader, + pl_cache_obj *out_spirv) +{ + struct pl_vk *p = PL_PRIV(gpu); + pl_cache cache = pl_gpu_cache(gpu); + uint64_t key = CACHE_KEY_SPIRV; + if (cache) { // skip computing key if `cache + pl_hash_merge(&key, p->spirv->signature); + pl_hash_merge(&key, pl_str0_hash(shader)); + out_spirv->key = key; + if (pl_cache_get(cache, out_spirv)) { + PL_DEBUG(gpu, "Re-using cached SPIR-V object 0x%"PRIx64, key); + return VK_SUCCESS; + } + } + + pl_clock_t start = pl_clock_now(); + pl_str spirv = pl_spirv_compile_glsl(p->spirv, alloc, gpu->glsl, stage, shader); + pl_log_cpu_time(gpu->log, start, pl_clock_now(), "translating SPIR-V"); + out_spirv->data = spirv.buf; + out_spirv->size = spirv.len; + out_spirv->free = pl_free; + return spirv.len ? 
VK_SUCCESS : VK_ERROR_INITIALIZATION_FAILED; +} + +static const VkShaderStageFlags stageFlags[] = { + [PL_PASS_RASTER] = VK_SHADER_STAGE_FRAGMENT_BIT | + VK_SHADER_STAGE_VERTEX_BIT, + [PL_PASS_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT, +}; + +static void destroy_pipeline(struct vk_ctx *vk, void *pipeline) +{ + vk->DestroyPipeline(vk->dev, vk_unwrap_handle(pipeline), PL_VK_ALLOC); +} + +static VkResult vk_recreate_pipelines(struct vk_ctx *vk, pl_pass pass, + bool derivable, VkPipeline base, + VkPipeline *out_pipe) +{ + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + const struct pl_pass_params *params = &pass->params; + + // The old pipeline might still be in use, so we have to destroy it + // asynchronously with a device idle callback + if (*out_pipe) { + // We don't need to use `vk_gpu_idle_callback` because the only command + // that can access a VkPipeline, `vk_pass_run`, always flushes `p->cmd`. + vk_dev_callback(vk, (vk_cb) destroy_pipeline, vk, vk_wrap_handle(*out_pipe)); + *out_pipe = VK_NULL_HANDLE; + } + + VkPipelineCreateFlags flags = 0; + if (derivable) + flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT; + if (base) + flags |= VK_PIPELINE_CREATE_DERIVATIVE_BIT; + + const VkSpecializationInfo *specInfo = &pass_vk->specInfo; + if (!specInfo->dataSize) + specInfo = NULL; + + switch (params->type) { + case PL_PASS_RASTER: { + static const VkBlendFactor blendFactors[] = { + [PL_BLEND_ZERO] = VK_BLEND_FACTOR_ZERO, + [PL_BLEND_ONE] = VK_BLEND_FACTOR_ONE, + [PL_BLEND_SRC_ALPHA] = VK_BLEND_FACTOR_SRC_ALPHA, + [PL_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + }; + + VkPipelineColorBlendAttachmentState blendState = { + .colorBlendOp = VK_BLEND_OP_ADD, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT, + }; + + const struct pl_blend_params *blend = params->blend_params; + if (blend) { + blendState.blendEnable = true; + blendState.srcColorBlendFactor = blendFactors[blend->src_rgb]; + blendState.dstColorBlendFactor = blendFactors[blend->dst_rgb]; + blendState.srcAlphaBlendFactor = blendFactors[blend->src_alpha]; + blendState.dstAlphaBlendFactor = blendFactors[blend->dst_alpha]; + } + + static const VkPrimitiveTopology topologies[PL_PRIM_TYPE_COUNT] = { + [PL_PRIM_TRIANGLE_LIST] = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, + [PL_PRIM_TRIANGLE_STRIP] = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + }; + + VkGraphicsPipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .flags = flags, + .stageCount = 2, + .pStages = (VkPipelineShaderStageCreateInfo[]) { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = pass_vk->vert, + .pName = "main", + }, { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = pass_vk->shader, + .pName = "main", + .pSpecializationInfo = specInfo, + } + }, + .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &(VkVertexInputBindingDescription) { + .binding = 0, + .stride = params->vertex_stride, + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, + }, + .vertexAttributeDescriptionCount = params->num_vertex_attribs, + .pVertexAttributeDescriptions = pass_vk->attrs, + }, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = 
VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = topologies[params->vertex_type], + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 1, + .scissorCount = 1, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .lineWidth = 1.0f, + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &blendState, + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 2, + .pDynamicStates = (VkDynamicState[]){ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }, + }, + .layout = pass_vk->pipeLayout, + .renderPass = pass_vk->renderPass, + .basePipelineHandle = base, + .basePipelineIndex = -1, + }; + + return vk->CreateGraphicsPipelines(vk->dev, pass_vk->cache, 1, &cinfo, + PL_VK_ALLOC, out_pipe); + } + + case PL_PASS_COMPUTE: { + VkComputePipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .flags = flags, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = pass_vk->shader, + .pName = "main", + .pSpecializationInfo = specInfo, + }, + .layout = pass_vk->pipeLayout, + .basePipelineHandle = base, + .basePipelineIndex = -1, + }; + + return vk->CreateComputePipelines(vk->dev, pass_vk->cache, 1, &cinfo, + PL_VK_ALLOC, out_pipe); + } + + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +pl_pass vk_pass_create(pl_gpu gpu, const struct pl_pass_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + bool success = false; + + struct pl_pass_t *pass = pl_zalloc_obj(NULL, pass, struct pl_pass_vk); + pass->params = pl_pass_params_copy(pass, params); + + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + pass_vk->dmask = -1; // all descriptors available + + // temporary allocations + void *tmp = pl_tmp(NULL); + + int num_desc = params->num_descriptors; + if (!num_desc) + goto no_descriptors; + if (num_desc > vk->props.limits.maxPerStageResources) { + PL_ERR(gpu, "Pass with %d descriptors exceeds the maximum number of " + "per-stage resources %" PRIu32"!", + num_desc, vk->props.limits.maxPerStageResources); + goto error; + } + + pass_vk->dswrite = pl_calloc(pass, num_desc, sizeof(VkWriteDescriptorSet)); + pass_vk->dsiinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorImageInfo)); + pass_vk->dsbinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorBufferInfo)); + +#define NUM_DS (PL_ARRAY_SIZE(pass_vk->dss)) + + int dsSize[PL_DESC_TYPE_COUNT] = {0}; + VkDescriptorSetLayoutBinding *bindings = pl_calloc_ptr(tmp, num_desc, bindings); + + uint32_t max_tex = vk->props.limits.maxPerStageDescriptorSampledImages, + max_img = vk->props.limits.maxPerStageDescriptorStorageImages, + max_ubo = vk->props.limits.maxPerStageDescriptorUniformBuffers, + max_ssbo = vk->props.limits.maxPerStageDescriptorStorageBuffers; + + uint32_t *dsLimits[PL_DESC_TYPE_COUNT] = { + 
[PL_DESC_SAMPLED_TEX] = &max_tex, + [PL_DESC_STORAGE_IMG] = &max_img, + [PL_DESC_BUF_UNIFORM] = &max_ubo, + [PL_DESC_BUF_STORAGE] = &max_ssbo, + [PL_DESC_BUF_TEXEL_UNIFORM] = &max_tex, + [PL_DESC_BUF_TEXEL_STORAGE] = &max_img, + }; + + for (int i = 0; i < num_desc; i++) { + struct pl_desc *desc = ¶ms->descriptors[i]; + if (!(*dsLimits[desc->type])--) { + PL_ERR(gpu, "Pass exceeds the maximum number of per-stage " + "descriptors of type %u!", (unsigned) desc->type); + goto error; + } + + dsSize[desc->type]++; + bindings[i] = (VkDescriptorSetLayoutBinding) { + .binding = desc->binding, + .descriptorType = dsType[desc->type], + .descriptorCount = 1, + .stageFlags = stageFlags[params->type], + }; + } + + VkDescriptorSetLayoutCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pBindings = bindings, + .bindingCount = num_desc, + }; + + if (p->max_push_descriptors && num_desc <= p->max_push_descriptors) { + dinfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR; + pass_vk->use_pushd = true; + } else if (p->max_push_descriptors) { + PL_INFO(gpu, "Pass with %d descriptors exceeds the maximum push " + "descriptor count (%d). Falling back to descriptor sets!", + num_desc, p->max_push_descriptors); + } + + VK(vk->CreateDescriptorSetLayout(vk->dev, &dinfo, PL_VK_ALLOC, + &pass_vk->dsLayout)); + + if (!pass_vk->use_pushd) { + PL_ARRAY(VkDescriptorPoolSize) dsPoolSizes = {0}; + + for (enum pl_desc_type t = 0; t < PL_DESC_TYPE_COUNT; t++) { + if (dsSize[t] > 0) { + PL_ARRAY_APPEND(tmp, dsPoolSizes, (VkDescriptorPoolSize) { + .type = dsType[t], + .descriptorCount = dsSize[t] * NUM_DS, + }); + } + } + + if (dsPoolSizes.num) { + VkDescriptorPoolCreateInfo pinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .maxSets = NUM_DS, + .pPoolSizes = dsPoolSizes.elem, + .poolSizeCount = dsPoolSizes.num, + }; + + VK(vk->CreateDescriptorPool(vk->dev, &pinfo, PL_VK_ALLOC, &pass_vk->dsPool)); + + VkDescriptorSetLayout layouts[NUM_DS]; + for (int i = 0; i < NUM_DS; i++) + layouts[i] = pass_vk->dsLayout; + + VkDescriptorSetAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = pass_vk->dsPool, + .descriptorSetCount = NUM_DS, + .pSetLayouts = layouts, + }; + + VK(vk->AllocateDescriptorSets(vk->dev, &ainfo, pass_vk->dss)); + } + } + +no_descriptors: ; + + bool has_spec = params->num_constants; + if (has_spec) { + PL_ARRAY(VkSpecializationMapEntry) entries = {0}; + PL_ARRAY_RESIZE(pass, entries, params->num_constants); + size_t spec_size = 0; + + for (int i = 0; i < params->num_constants; i++) { + const struct pl_constant *con = ¶ms->constants[i]; + size_t con_size = pl_var_type_size(con->type); + entries.elem[i] = (VkSpecializationMapEntry) { + .constantID = con->id, + .offset = con->offset, + .size = con_size, + }; + + size_t req_size = con->offset + con_size; + spec_size = PL_MAX(spec_size, req_size); + } + + pass_vk->spec_size = spec_size; + pass_vk->specInfo = (VkSpecializationInfo) { + .mapEntryCount = params->num_constants, + .pMapEntries = entries.elem, + }; + + if (params->constant_data) { + pass_vk->specInfo.pData = pl_memdup(pass, params->constant_data, spec_size); + pass_vk->specInfo.dataSize = spec_size; + } + } + + VkPipelineLayoutCreateInfo linfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = num_desc ? 1 : 0, + .pSetLayouts = &pass_vk->dsLayout, + .pushConstantRangeCount = params->push_constants_size ? 
1 : 0, + .pPushConstantRanges = &(VkPushConstantRange){ + .stageFlags = stageFlags[params->type], + .offset = 0, + .size = params->push_constants_size, + }, + }; + + VK(vk->CreatePipelineLayout(vk->dev, &linfo, PL_VK_ALLOC, + &pass_vk->pipeLayout)); + + pl_cache_obj vert = {0}, frag = {0}, comp = {0}; + switch (params->type) { + case PL_PASS_RASTER: ; + VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_VERTEX, params->vertex_shader, &vert)); + VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_FRAGMENT, params->glsl_shader, &frag)); + break; + case PL_PASS_COMPUTE: + VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_COMPUTE, params->glsl_shader, &comp)); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + // Use hash of generated SPIR-V as key for pipeline cache + const pl_cache cache = pl_gpu_cache(gpu); + pl_cache_obj pipecache = {0}; + if (cache) { + pipecache.key = CACHE_KEY_VK_PIPE; + pl_hash_merge(&pipecache.key, pl_var_hash(vk->props.pipelineCacheUUID)); + pl_hash_merge(&pipecache.key, pl_mem_hash(vert.data, vert.size)); + pl_hash_merge(&pipecache.key, pl_mem_hash(frag.data, frag.size)); + pl_hash_merge(&pipecache.key, pl_mem_hash(comp.data, comp.size)); + pl_cache_get(cache, &pipecache); + } + + if (cache || has_spec) { + // Don't create pipeline cache unless we either plan on caching the + // result of this shader to a pl_cache, or if we will possibly re-use + // it due to the presence of specialization constants + VkPipelineCacheCreateInfo pcinfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, + .pInitialData = pipecache.data, + .initialDataSize = pipecache.size, + }; + + VK(vk->CreatePipelineCache(vk->dev, &pcinfo, PL_VK_ALLOC, &pass_vk->cache)); + } + + VkShaderModuleCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + }; + + pl_clock_t start = pl_clock_now(); + switch (params->type) { + case PL_PASS_RASTER: { + sinfo.pCode = (uint32_t *) vert.data; + sinfo.codeSize = vert.size; + VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->vert)); + PL_VK_NAME(SHADER_MODULE, pass_vk->vert, "vertex"); + + sinfo.pCode = (uint32_t *) frag.data; + sinfo.codeSize = frag.size; + VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader)); + PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "fragment"); + + pass_vk->attrs = pl_calloc_ptr(pass, params->num_vertex_attribs, pass_vk->attrs); + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct pl_vertex_attrib *va = ¶ms->vertex_attribs[i]; + const struct vk_format **pfmt_vk = PL_PRIV(va->fmt); + + pass_vk->attrs[i] = (VkVertexInputAttributeDescription) { + .binding = 0, + .location = va->location, + .offset = va->offset, + .format = PL_DEF((*pfmt_vk)->bfmt, (*pfmt_vk)->tfmt), + }; + } + + VkRenderPassCreateInfo rinfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = (VkFormat) params->target_format->signature, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = pass->params.load_target + ? 
VK_ATTACHMENT_LOAD_OP_LOAD + : VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + }, + }; + + VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &pass_vk->renderPass)); + break; + } + case PL_PASS_COMPUTE: { + sinfo.pCode = (uint32_t *) comp.data; + sinfo.codeSize = comp.size; + VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader)); + PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "compute"); + break; + } + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + pl_clock_t after_compilation = pl_clock_now(); + pl_log_cpu_time(gpu->log, start, after_compilation, "compiling shader"); + + // Update cache entries on successful compilation + pl_cache_steal(cache, &vert); + pl_cache_steal(cache, &frag); + pl_cache_steal(cache, &comp); + + // Create the graphics/compute pipeline + VkPipeline *pipe = has_spec ? &pass_vk->base : &pass_vk->pipe; + VK(vk_recreate_pipelines(vk, pass, has_spec, VK_NULL_HANDLE, pipe)); + pl_log_cpu_time(gpu->log, after_compilation, pl_clock_now(), "creating pipeline"); + + // Update pipeline cache + if (cache) { + size_t size = 0; + VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, NULL)); + pl_cache_obj_resize(tmp, &pipecache, size); + VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, pipecache.data)); + pl_cache_steal(cache, &pipecache); + } + + if (!has_spec) { + // We can free these if we no longer need them for specialization + pl_free_ptr(&pass_vk->attrs); + vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC); + vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC); + vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC); + pass_vk->vert = VK_NULL_HANDLE; + pass_vk->shader = VK_NULL_HANDLE; + pass_vk->cache = VK_NULL_HANDLE; + } + + PL_DEBUG(vk, "Pass statistics: size %zu, SPIR-V: vert %zu frag %zu comp %zu", + pipecache.size, vert.size, frag.size, comp.size); + + success = true; + +error: + if (!success) { + pass_destroy_cb(gpu, pass); + pass = NULL; + } + +#undef NUM_DS + + pl_free(tmp); + return pass; +} + +static const VkPipelineStageFlags2 shaderStages[] = { + [PL_PASS_RASTER] = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, + [PL_PASS_COMPUTE] = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, +}; + +static void vk_update_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass, + struct pl_desc_binding db, + VkDescriptorSet ds, int idx) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + struct pl_desc *desc = &pass->params.descriptors[idx]; + + VkWriteDescriptorSet *wds = &pass_vk->dswrite[idx]; + *wds = (VkWriteDescriptorSet) { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = ds, + .dstBinding = desc->binding, + .descriptorCount = 1, + .descriptorType = dsType[desc->type], + }; + + static const VkAccessFlags2 storageAccess[PL_DESC_ACCESS_COUNT] = { + [PL_DESC_ACCESS_READONLY] = VK_ACCESS_2_SHADER_STORAGE_READ_BIT, + [PL_DESC_ACCESS_WRITEONLY] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + [PL_DESC_ACCESS_READWRITE] = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + }; + + switch (desc->type) { 
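+    // Every descriptor type below follows the same two steps: record a
+    // barrier making the bound resource visible to this pass's shader
+    // stage, then fill in the matching member of the VkWriteDescriptorSet
+    // (pImageInfo for sampled/storage images, pBufferInfo for uniform and
+    // storage buffers, pTexelBufferView for texel buffers).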
+ case PL_DESC_SAMPLED_TEX: { + pl_tex tex = db.object; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + vk_tex_barrier(gpu, cmd, tex, shaderStages[pass->params.type], + VK_ACCESS_2_SHADER_SAMPLED_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .sampler = p->samplers[db.sample_mode][db.address_mode], + .imageView = tex_vk->view, + .imageLayout = tex_vk->layout, + }; + + wds->pImageInfo = iinfo; + return; + } + case PL_DESC_STORAGE_IMG: { + pl_tex tex = db.object; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + vk_tex_barrier(gpu, cmd, tex, shaderStages[pass->params.type], + storageAccess[desc->access], VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .imageView = tex_vk->view, + .imageLayout = tex_vk->layout, + }; + + wds->pImageInfo = iinfo; + return; + } + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: { + pl_buf buf = db.object; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + VkAccessFlags2 access = VK_ACCESS_2_UNIFORM_READ_BIT; + if (desc->type == PL_DESC_BUF_STORAGE) + access = storageAccess[desc->access]; + + vk_buf_barrier(gpu, cmd, buf, shaderStages[pass->params.type], + access, 0, buf->params.size, false); + + VkDescriptorBufferInfo *binfo = &pass_vk->dsbinfo[idx]; + *binfo = (VkDescriptorBufferInfo) { + .buffer = buf_vk->mem.buf, + .offset = buf_vk->mem.offset, + .range = buf->params.size, + }; + + wds->pBufferInfo = binfo; + return; + } + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = db.object; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + VkAccessFlags2 access = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT; + if (desc->type == PL_DESC_BUF_TEXEL_STORAGE) + access = storageAccess[desc->access]; + + vk_buf_barrier(gpu, cmd, buf, shaderStages[pass->params.type], + access, 0, buf->params.size, false); + + wds->pTexelBufferView = &buf_vk->view; + return; + } + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void vk_release_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass, + struct pl_desc_binding db, int idx) +{ + const struct pl_desc *desc = &pass->params.descriptors[idx]; + + switch (desc->type) { + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + if (desc->access != PL_DESC_ACCESS_READONLY) { + pl_buf buf = db.object; + vk_buf_flush(gpu, cmd, buf, 0, buf->params.size); + } + return; + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: + return; + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void set_ds(struct pl_pass_vk *pass_vk, void *dsbit) +{ + pass_vk->dmask |= (uintptr_t) dsbit; +} + +static bool need_respec(pl_pass pass, const struct pl_pass_run_params *params) +{ + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + if (!pass_vk->spec_size || !params->constant_data) + return false; + + VkSpecializationInfo *specInfo = &pass_vk->specInfo; + size_t size = pass_vk->spec_size; + if (!specInfo->pData) { + // Shader was never specialized before + specInfo->pData = pl_memdup((void *) pass, params->constant_data, size); + specInfo->dataSize = size; + return true; + } + + // Shader is being re-specialized with new values + if (memcmp(specInfo->pData, params->constant_data, size) != 0) { + memcpy((void *) specInfo->pData, params->constant_data, 
size); + return true; + } + + return false; +} + +void vk_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_pass pass = params->pass; + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + + if (params->vertex_data || params->index_data) + return pl_pass_run_vbo(gpu, params); + + // Check if we need to re-specialize this pipeline + if (need_respec(pass, params)) { + pl_clock_t start = pl_clock_now(); + VK(vk_recreate_pipelines(vk, pass, false, pass_vk->base, &pass_vk->pipe)); + pl_log_cpu_time(gpu->log, start, pl_clock_now(), "re-specializing shader"); + } + + if (!pass_vk->use_pushd) { + // Wait for a free descriptor set + while (!pass_vk->dmask) { + PL_TRACE(gpu, "No free descriptor sets! ...blocking (slow path)"); + vk_poll_commands(vk, 10000000); // 10 ms + } + } + + static const enum queue_type types[] = { + [PL_PASS_RASTER] = GRAPHICS, + [PL_PASS_COMPUTE] = COMPUTE, + }; + + struct vk_cmd *cmd = CMD_BEGIN_TIMED(types[pass->params.type], params->timer); + if (!cmd) + goto error; + + // Find a descriptor set to use + VkDescriptorSet ds = VK_NULL_HANDLE; + if (!pass_vk->use_pushd) { + for (int i = 0; i < PL_ARRAY_SIZE(pass_vk->dss); i++) { + uint16_t dsbit = 1u << i; + if (pass_vk->dmask & dsbit) { + ds = pass_vk->dss[i]; + pass_vk->dmask &= ~dsbit; // unset + vk_cmd_callback(cmd, (vk_cb) set_ds, pass_vk, + (void *)(uintptr_t) dsbit); + break; + } + } + } + + // Update the dswrite structure with all of the new values + for (int i = 0; i < pass->params.num_descriptors; i++) + vk_update_descriptor(gpu, cmd, pass, params->desc_bindings[i], ds, i); + + if (!pass_vk->use_pushd) { + vk->UpdateDescriptorSets(vk->dev, pass->params.num_descriptors, + pass_vk->dswrite, 0, NULL); + } + + // Bind the pipeline, descriptor set, etc. + static const VkPipelineBindPoint bindPoint[] = { + [PL_PASS_RASTER] = VK_PIPELINE_BIND_POINT_GRAPHICS, + [PL_PASS_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE, + }; + + vk->CmdBindPipeline(cmd->buf, bindPoint[pass->params.type], + PL_DEF(pass_vk->pipe, pass_vk->base)); + + if (ds) { + vk->CmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type], + pass_vk->pipeLayout, 0, 1, &ds, 0, NULL); + } + + if (pass_vk->use_pushd) { + vk->CmdPushDescriptorSetKHR(cmd->buf, bindPoint[pass->params.type], + pass_vk->pipeLayout, 0, + pass->params.num_descriptors, + pass_vk->dswrite); + } + + if (pass->params.push_constants_size) { + vk->CmdPushConstants(cmd->buf, pass_vk->pipeLayout, + stageFlags[pass->params.type], 0, + pass->params.push_constants_size, + params->push_constants); + } + + switch (pass->params.type) { + case PL_PASS_RASTER: { + pl_tex tex = params->target; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + pl_buf vert = params->vertex_buf; + struct pl_buf_vk *vert_vk = PL_PRIV(vert); + pl_buf index = params->index_buf; + struct pl_buf_vk *index_vk = index ? 
PL_PRIV(index) : NULL; + pl_assert(vert); + + // In the edge case that vert = index buffer, we need to synchronize + // for both flags simultaneously + VkPipelineStageFlags2 vbo_stage = VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT; + VkAccessFlags2 vbo_flags = VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT; + if (index == vert) { + vbo_stage |= VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT; + vbo_flags |= VK_ACCESS_2_INDEX_READ_BIT; + } + + vk_buf_barrier(gpu, cmd, vert, vbo_stage, vbo_flags, 0, vert->params.size, false); + + VkDeviceSize offset = vert_vk->mem.offset + params->buf_offset; + vk->CmdBindVertexBuffers(cmd->buf, 0, 1, &vert_vk->mem.buf, &offset); + + if (index) { + if (index != vert) { + vk_buf_barrier(gpu, cmd, index, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT, + VK_ACCESS_2_INDEX_READ_BIT, 0, index->params.size, + false); + } + + static const VkIndexType index_fmts[PL_INDEX_FORMAT_COUNT] = { + [PL_INDEX_UINT16] = VK_INDEX_TYPE_UINT16, + [PL_INDEX_UINT32] = VK_INDEX_TYPE_UINT32, + }; + + vk->CmdBindIndexBuffer(cmd->buf, index_vk->mem.buf, + index_vk->mem.offset + params->index_offset, + index_fmts[params->index_fmt]); + } + + + VkAccessFlags2 fbo_access = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT; + if (pass->params.load_target) + fbo_access |= VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT; + + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + fbo_access, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + VkViewport viewport = { + .x = params->viewport.x0, + .y = params->viewport.y0, + .width = pl_rect_w(params->viewport), + .height = pl_rect_h(params->viewport), + }; + + VkRect2D scissor = { + .offset = {params->scissors.x0, params->scissors.y0}, + .extent = {pl_rect_w(params->scissors), pl_rect_h(params->scissors)}, + }; + + vk->CmdSetViewport(cmd->buf, 0, 1, &viewport); + vk->CmdSetScissor(cmd->buf, 0, 1, &scissor); + + VkRenderPassBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = pass_vk->renderPass, + .framebuffer = tex_vk->framebuffer, + .renderArea.extent = {tex->params.w, tex->params.h}, + }; + + vk->CmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE); + + if (index) { + vk->CmdDrawIndexed(cmd->buf, params->vertex_count, 1, 0, 0, 0); + } else { + vk->CmdDraw(cmd->buf, params->vertex_count, 1, 0, 0); + } + + vk->CmdEndRenderPass(cmd->buf); + break; + } + case PL_PASS_COMPUTE: + vk->CmdDispatch(cmd->buf, params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + }; + + for (int i = 0; i < pass->params.num_descriptors; i++) + vk_release_descriptor(gpu, cmd, pass, params->desc_bindings[i], i); + + // submit this command buffer for better intra-frame granularity + CMD_SUBMIT(&cmd); + +error: + return; +} diff --git a/src/vulkan/gpu_tex.c b/src/vulkan/gpu_tex.c new file mode 100644 index 0000000..7ab83b7 --- /dev/null +++ b/src/vulkan/gpu_tex.c @@ -0,0 +1,1453 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" + +void vk_tex_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_tex tex, + VkPipelineStageFlags2 stage, VkAccessFlags2 access, + VkImageLayout layout, uint32_t qf) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + pl_rc_ref(&tex_vk->rc); + pl_assert(!tex_vk->held); + pl_assert(!tex_vk->num_planes); + + // CONCURRENT images require transitioning to/from IGNORED, EXCLUSIVE + // images require transitioning to/from the concrete QF index + if (vk->pools.num == 1) { + if (tex_vk->qf == VK_QUEUE_FAMILY_IGNORED) + tex_vk->qf = cmd->pool->qf; + if (qf == VK_QUEUE_FAMILY_IGNORED) + qf = cmd->pool->qf; + } + + struct vk_sync_scope last; + bool is_trans = layout != tex_vk->layout, is_xfer = qf != tex_vk->qf; + last = vk_sem_barrier(cmd, &tex_vk->sem, stage, access, is_trans || is_xfer); + + VkImageMemoryBarrier2 barr = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + .srcStageMask = last.stage, + .srcAccessMask = last.access, + .dstStageMask = stage, + .dstAccessMask = access, + .oldLayout = tex_vk->layout, + .newLayout = layout, + .srcQueueFamilyIndex = tex_vk->qf, + .dstQueueFamilyIndex = qf, + .image = tex_vk->img, + .subresourceRange = { + .aspectMask = tex_vk->aspect, + .levelCount = 1, + .layerCount = 1, + }, + }; + + if (tex_vk->may_invalidate) { + tex_vk->may_invalidate = false; + barr.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + } + + if (last.access || is_trans || is_xfer) { + vk_cmd_barrier(cmd, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .imageMemoryBarrierCount = 1, + .pImageMemoryBarriers = &barr, + }); + } + + tex_vk->qf = qf; + tex_vk->layout = layout; + vk_cmd_callback(cmd, (vk_cb) vk_tex_deref, gpu, tex); + + for (int i = 0; i < tex_vk->ext_deps.num; i++) + vk_cmd_dep(cmd, stage, tex_vk->ext_deps.elem[i]); + tex_vk->ext_deps.num = 0; + + if (tex_vk->ext_sync) { + vk_cmd_callback(cmd, (vk_cb) vk_sync_deref, gpu, tex_vk->ext_sync); + tex_vk->ext_sync = NULL; + } +} + +static void vk_tex_destroy(pl_gpu gpu, struct pl_tex_t *tex) +{ + if (!tex) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + vk_sync_deref(gpu, tex_vk->ext_sync); + vk->DestroyFramebuffer(vk->dev, tex_vk->framebuffer, PL_VK_ALLOC); + vk->DestroyImageView(vk->dev, tex_vk->view, PL_VK_ALLOC); + for (int i = 0; i < tex_vk->num_planes; i++) + vk_tex_deref(gpu, tex->planes[i]); + if (!tex_vk->external_img) { + vk->DestroyImage(vk->dev, tex_vk->img, PL_VK_ALLOC); + vk_malloc_free(vk->ma, &tex_vk->mem); + } + + pl_free(tex); +} + +void vk_tex_deref(pl_gpu gpu, pl_tex tex) +{ + if (!tex) + return; + + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + if (pl_rc_deref(&tex_vk->rc)) + vk_tex_destroy(gpu, (struct pl_tex_t *) tex); +} + + +// Initializes non-VkImage values like the image view, framebuffers, etc. 
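+// This covers the VkImageView (for sampleable/renderable/storable textures),
+// the VkFramebuffer (for renderable textures, created against a temporary
+// dummy render pass), the preferred transfer queue, and the initial layout
+// and queue family tracking state. Multi-planar parent images return early
+// here, with only the debug name and reference count set.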
+static bool vk_init_image(pl_gpu gpu, pl_tex tex, pl_debug_tag debug_tag) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + const struct pl_tex_params *params = &tex->params; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + pl_assert(tex_vk->img); + PL_VK_NAME(IMAGE, tex_vk->img, debug_tag); + pl_rc_init(&tex_vk->rc); + if (tex_vk->num_planes) + return true; + tex_vk->layout = VK_IMAGE_LAYOUT_UNDEFINED; + tex_vk->transfer_queue = GRAPHICS; + tex_vk->qf = VK_QUEUE_FAMILY_IGNORED; // will be set on first use, if needed + + // Always use the transfer pool if available, for efficiency + if ((params->host_writable || params->host_readable) && vk->pool_transfer) + tex_vk->transfer_queue = TRANSFER; + + // For emulated formats: force usage of the compute queue, because we + // can't properly track cross-queue dependencies for buffers (yet?) + if (params->format->emulated) + tex_vk->transfer_queue = COMPUTE; + + bool ret = false; + VkRenderPass dummyPass = VK_NULL_HANDLE; + + if (params->sampleable || params->renderable || params->storable) { + static const VkImageViewType viewType[] = { + [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D, + [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D, + [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D, + }; + + const VkImageViewCreateInfo vinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = tex_vk->img, + .viewType = viewType[tex_vk->type], + .format = tex_vk->img_fmt, + .subresourceRange = { + .aspectMask = tex_vk->aspect, + .levelCount = 1, + .layerCount = 1, + }, + }; + + VK(vk->CreateImageView(vk->dev, &vinfo, PL_VK_ALLOC, &tex_vk->view)); + PL_VK_NAME(IMAGE_VIEW, tex_vk->view, debug_tag); + } + + if (params->renderable) { + // Framebuffers need to be created against a specific render pass + // layout, so we need to temporarily create a skeleton/dummy render + // pass for vulkan to figure out the compatibility + VkRenderPassCreateInfo rinfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = tex_vk->img_fmt, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + }, + }; + + VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &dummyPass)); + + VkFramebufferCreateInfo finfo = { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .renderPass = dummyPass, + .attachmentCount = 1, + .pAttachments = &tex_vk->view, + .width = tex->params.w, + .height = tex->params.h, + .layers = 1, + }; + + if (finfo.width > vk->props.limits.maxFramebufferWidth || + finfo.height > vk->props.limits.maxFramebufferHeight) + { + PL_ERR(gpu, "Framebuffer of size %dx%d exceeds the maximum allowed " + "dimensions: %dx%d", finfo.width, finfo.height, + vk->props.limits.maxFramebufferWidth, + vk->props.limits.maxFramebufferHeight); + goto error; + } + + VK(vk->CreateFramebuffer(vk->dev, &finfo, PL_VK_ALLOC, + &tex_vk->framebuffer)); + PL_VK_NAME(FRAMEBUFFER, tex_vk->framebuffer, debug_tag); + } + + ret = true; + +error: + vk->DestroyRenderPass(vk->dev, dummyPass, PL_VK_ALLOC); + return ret; +} + +pl_tex vk_tex_create(pl_gpu gpu, 
const struct pl_tex_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + enum pl_handle_type handle_type = params->export_handle | + params->import_handle; + VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type = vk_mem_handle_type(handle_type); + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk); + pl_fmt fmt = params->format; + tex->params = *params; + tex->params.initial_data = NULL; + tex->sampler_type = PL_SAMPLER_NORMAL; + + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_fmt_vk *fmtp = PL_PRIV(fmt); + tex_vk->img_fmt = fmtp->vk_fmt->tfmt; + tex_vk->num_planes = fmt->num_planes; + for (int i = 0; i < tex_vk->num_planes; i++) + tex_vk->aspect |= VK_IMAGE_ASPECT_PLANE_0_BIT << i; + tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT); + + switch (pl_tex_params_dimension(*params)) { + case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break; + case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break; + case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break; + } + + if (fmt->emulated) { + tex_vk->texel_fmt = pl_find_fmt(gpu, fmt->type, 1, 0, + fmt->host_bits[0], + PL_FMT_CAP_TEXEL_UNIFORM); + if (!tex_vk->texel_fmt) { + PL_ERR(gpu, "Failed picking texel format for emulated texture!"); + goto error; + } + + // Our format emulation requires storage image support. In order to + // make a bunch of checks happy, just mark it off as storable (and also + // enable VK_IMAGE_USAGE_STORAGE_BIT, which we do below) + tex->params.storable = true; + } + + if (fmtp->blit_emulated) { + // Enable what's required for sampling + tex->params.sampleable = fmt->caps & PL_FMT_CAP_SAMPLEABLE; + tex->params.storable = true; + } + + // Blit emulation on planar textures requires storage + if ((params->blit_src || params->blit_dst) && tex_vk->num_planes) + tex->params.storable = true; + + VkImageUsageFlags usage = 0; + VkImageCreateFlags flags = 0; + if (tex->params.sampleable) + usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (tex->params.renderable) + usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + if (tex->params.storable) + usage |= VK_IMAGE_USAGE_STORAGE_BIT; + if (tex->params.host_readable || tex->params.blit_src) + usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + if (tex->params.host_writable || tex->params.blit_dst || params->initial_data) + usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; + + if (!usage) { + // Vulkan requires images have at least *some* image usage set, but our + // API is perfectly happy with a (useless) image. So just put + // VK_IMAGE_USAGE_TRANSFER_DST_BIT since this harmless. + usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT; + } + + if (tex_vk->num_planes) { + flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT | + VK_IMAGE_CREATE_EXTENDED_USAGE_BIT; + } + + // FIXME: Since we can't keep track of queue family ownership properly, + // and we don't know in advance what types of queue families this image + // will belong to, we're forced to share all of our images between all + // command pools. + uint32_t qfs[3] = {0}; + pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs)); + for (int i = 0; i < vk->pools.num; i++) + qfs[i] = vk->pools.elem[i]->qf; + + VkImageDrmFormatModifierExplicitCreateInfoEXT drm_explicit = { + .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT, + .drmFormatModifier = params->shared_mem.drm_format_mod, + .drmFormatModifierPlaneCount = 1, + .pPlaneLayouts = &(VkSubresourceLayout) { + .rowPitch = PL_DEF(params->shared_mem.stride_w, params->w), + .depthPitch = params->d ? 
PL_DEF(params->shared_mem.stride_h, params->h) : 0, + .offset = params->shared_mem.offset, + }, + }; + +#ifdef VK_EXT_metal_objects + VkImportMetalTextureInfoEXT import_metal_tex = { + .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_TEXTURE_INFO_EXT, + .plane = VK_IMAGE_ASPECT_PLANE_0_BIT << params->shared_mem.plane, + }; + + VkImportMetalIOSurfaceInfoEXT import_iosurface = { + .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_IO_SURFACE_INFO_EXT, + }; +#endif + + VkImageDrmFormatModifierListCreateInfoEXT drm_list = { + .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT, + .drmFormatModifierCount = fmt->num_modifiers, + .pDrmFormatModifiers = fmt->modifiers, + }; + + VkExternalMemoryImageCreateInfoKHR ext_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR, + .handleTypes = vk_handle_type, + }; + + VkImageCreateInfo iinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .pNext = vk_handle_type ? &ext_info : NULL, + .imageType = tex_vk->type, + .format = tex_vk->img_fmt, + .extent = (VkExtent3D) { + .width = params->w, + .height = PL_MAX(1, params->h), + .depth = PL_MAX(1, params->d) + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = usage, + .flags = flags, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT + : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = vk->pools.num, + .pQueueFamilyIndices = qfs, + }; + + struct vk_malloc_params mparams = { + .optimal = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + .export_handle = params->export_handle, + .import_handle = params->import_handle, + .shared_mem = params->shared_mem, + .debug_tag = params->debug_tag, + }; + + if (params->import_handle == PL_HANDLE_DMA_BUF) { + vk_link_struct(&iinfo, &drm_explicit); + iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; + mparams.shared_mem.offset = 0x0; // handled via plane offsets + } + +#ifdef VK_EXT_metal_objects + if (params->import_handle == PL_HANDLE_MTL_TEX) { + vk_link_struct(&iinfo, &import_metal_tex); + import_metal_tex.mtlTexture = params->shared_mem.handle.handle; + } + + if (params->import_handle == PL_HANDLE_IOSURFACE) { + vk_link_struct(&iinfo, &import_iosurface); + import_iosurface.ioSurface = params->shared_mem.handle.handle; + } +#endif + + if (params->export_handle == PL_HANDLE_DMA_BUF) { + pl_assert(drm_list.drmFormatModifierCount > 0); + vk_link_struct(&iinfo, &drm_list); + iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; + } + + // Double-check physical image format limits and fail if invalid + VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT, + .sharingMode = iinfo.sharingMode, + .queueFamilyIndexCount = iinfo.queueFamilyIndexCount, + .pQueueFamilyIndices = iinfo.pQueueFamilyIndices, + }; + + VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR, + .handleType = ext_info.handleTypes, + }; + + if (handle_type == PL_HANDLE_DMA_BUF) { + if (params->import_handle) { + // On import, we know exactly which format modifier to test + drm_pinfo.drmFormatModifier = drm_explicit.drmFormatModifier; + } else { + // On export, the choice of format modifier is ambiguous, because + // we offer the implementation a whole list to choose from. 
In + // principle, we must check *all* supported drm format modifiers, + // but in practice it should hopefully suffice to just check one + drm_pinfo.drmFormatModifier = drm_list.pDrmFormatModifiers[0]; + } + vk_link_struct(&ext_pinfo, &drm_pinfo); + } + + VkPhysicalDeviceImageFormatInfo2KHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR, + .pNext = vk_handle_type ? &ext_pinfo : NULL, + .format = iinfo.format, + .type = iinfo.imageType, + .tiling = iinfo.tiling, + .usage = iinfo.usage, + .flags = iinfo.flags, + }; + + VkExternalImageFormatPropertiesKHR ext_props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR, + }; + + VkImageFormatProperties2KHR props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR, + .pNext = vk_handle_type ? &ext_props : NULL, + }; + + VkResult res; + res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props); + if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) { + PL_DEBUG(gpu, "Texture creation failed: not supported"); + goto error; + } else { + PL_VK_ASSERT(res, "Querying image format properties"); + } + + VkExtent3D max = props.imageFormatProperties.maxExtent; + if (params->w > max.width || params->h > max.height || params->d > max.depth) + { + PL_ERR(gpu, "Requested image size %dx%dx%d exceeds the maximum allowed " + "dimensions %dx%dx%d for vulkan image format %x", + params->w, params->h, params->d, max.width, max.height, max.depth, + (unsigned) iinfo.format); + goto error; + } + + // Ensure the handle type is supported + if (vk_handle_type) { + bool ok = vk_external_mem_check(vk, &ext_props.externalMemoryProperties, + handle_type, params->import_handle); + if (!ok) { + PL_ERR(gpu, "Requested handle type is not compatible with the " + "specified combination of image parameters. Possibly the " + "handle type is unsupported altogether?"); + goto error; + } + } + + VK(vk->CreateImage(vk->dev, &iinfo, PL_VK_ALLOC, &tex_vk->img)); + tex_vk->usage_flags = iinfo.usage; + + VkMemoryDedicatedRequirements ded_reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR, + }; + + VkMemoryRequirements2 reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR, + .pNext = &ded_reqs, + }; + + VkImageMemoryRequirementsInfo2 req_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR, + .image = tex_vk->img, + }; + + vk->GetImageMemoryRequirements2(vk->dev, &req_info, &reqs); + mparams.reqs = reqs.memoryRequirements; + if (ded_reqs.prefersDedicatedAllocation) { + mparams.ded_image = tex_vk->img; + if (vk_mem_handle_type(params->import_handle)) + mparams.shared_mem.size = reqs.memoryRequirements.size; + } + + const char *debug_tag = params->debug_tag ? params->debug_tag : + params->import_handle ? 
"imported" : "created"; + + if (!params->import_handle || vk_mem_handle_type(params->import_handle)) { + struct vk_memslice *mem = &tex_vk->mem; + if (!vk_malloc_slice(vk->ma, mem, &mparams)) + goto error; + + VK(vk->BindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset)); + } + + static const char * const plane_names[4] = { + "plane 0", "plane 1", "plane 2", "plane 3", + }; + + if (tex_vk->num_planes) { + for (int i = 0; i < tex_vk->num_planes; i++) { + struct pl_tex_t *plane; + + pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D); + plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params( + .image = tex_vk->img, + .aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << i, + .width = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x), + .height = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y), + .format = fmtp->vk_fmt->pfmt[i].fmt, + .usage = usage, + .user_data = params->user_data, + .debug_tag = PL_DEF(params->debug_tag, plane_names[i]), + )); + if (!plane) + goto error; + plane->parent = tex; + tex->planes[i] = plane; + tex_vk->planes[i] = PL_PRIV(plane); + tex_vk->planes[i]->held = false; + tex_vk->planes[i]->layout = tex_vk->layout; + } + + // Explicitly mask out all usage flags from planar parent images + pl_assert(!fmt->caps); + tex->params.sampleable = false; + tex->params.renderable = false; + tex->params.storable = false; + tex->params.blit_src = false; + tex->params.blit_dst = false; + tex->params.host_writable = false; + tex->params.host_readable = false; + } + + if (!vk_init_image(gpu, tex, debug_tag)) + goto error; + + if (params->export_handle) + tex->shared_mem = tex_vk->mem.shared_mem; + + if (params->export_handle == PL_HANDLE_DMA_BUF) { + if (vk->GetImageDrmFormatModifierPropertiesEXT) { + + // Query the DRM format modifier and plane layout from the driver + VkImageDrmFormatModifierPropertiesEXT mod_props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT, + }; + + VK(vk->GetImageDrmFormatModifierPropertiesEXT(vk->dev, tex_vk->img, &mod_props)); + tex->shared_mem.drm_format_mod = mod_props.drmFormatModifier; + + VkSubresourceLayout layout = {0}; + VkImageSubresource plane = { + .aspectMask = VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT, + }; + + vk->GetImageSubresourceLayout(vk->dev, tex_vk->img, &plane, &layout); + if (layout.offset != 0) { + PL_ERR(gpu, "Exported DRM plane 0 has nonzero offset %zu, " + "this should never happen! Erroring for safety...", + (size_t) layout.offset); + goto error; + } + tex->shared_mem.stride_w = layout.rowPitch; + tex->shared_mem.stride_h = layout.depthPitch; + + } else { + + // Fallback for no modifiers, just do something stupid. 
+ tex->shared_mem.drm_format_mod = DRM_FORMAT_MOD_INVALID; + tex->shared_mem.stride_w = params->w; + tex->shared_mem.stride_h = params->h; + + } + } + + if (params->initial_data) { + struct pl_tex_transfer_params ul_params = { + .tex = tex, + .ptr = (void *) params->initial_data, + .rc = { 0, 0, 0, params->w, params->h, params->d }, + }; + + // Since we re-use GPU helpers which require writable images, just fake it + bool writable = tex->params.host_writable; + tex->params.host_writable = true; + if (!pl_tex_upload(gpu, &ul_params)) + goto error; + tex->params.host_writable = writable; + } + + return tex; + +error: + vk_tex_destroy(gpu, tex); + return NULL; +} + +void vk_tex_invalidate(pl_gpu gpu, pl_tex tex) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + tex_vk->may_invalidate = true; + for (int i = 0; i < tex_vk->num_planes; i++) + tex_vk->planes[i]->may_invalidate = true; +} + +static bool tex_clear_fallback(pl_gpu gpu, pl_tex tex, + const union pl_clear_color color) +{ + pl_tex pixel = pl_tex_create(gpu, pl_tex_params( + .w = 1, + .h = 1, + .format = tex->params.format, + .storable = true, + .blit_src = true, + .blit_dst = true, + )); + if (!pixel) + return false; + + pl_tex_clear_ex(gpu, pixel, color); + + pl_assert(tex->params.storable); + pl_tex_blit(gpu, pl_tex_blit_params( + .src = pixel, + .dst = tex, + .sample_mode = PL_TEX_SAMPLE_NEAREST, + )); + + pl_tex_destroy(gpu, &pixel); + return true; +} + +void vk_tex_clear_ex(pl_gpu gpu, pl_tex tex, const union pl_clear_color color) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) { + if (!tex_clear_fallback(gpu, tex, color)) { + PL_ERR(gpu, "Failed clearing imported planar image: color aspect " + "clears disallowed by spec and no shader fallback " + "available"); + } + return; + } + + struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS); + if (!cmd) + return; + + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_CLEAR_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + pl_static_assert(sizeof(VkClearColorValue) == sizeof(union pl_clear_color)); + const VkClearColorValue *clearColor = (const VkClearColorValue *) &color; + + pl_assert(tex_vk->aspect == VK_IMAGE_ASPECT_COLOR_BIT); + static const VkImageSubresourceRange range = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, + }; + + vk->CmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->layout, + clearColor, 1, &range); + + CMD_FINISH(&cmd); +} + +void vk_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *src_vk = PL_PRIV(params->src); + struct pl_tex_vk *dst_vk = PL_PRIV(params->dst); + struct pl_fmt_vk *src_fmtp = PL_PRIV(params->src->params.format); + struct pl_fmt_vk *dst_fmtp = PL_PRIV(params->dst->params.format); + bool blit_emulated = src_fmtp->blit_emulated || dst_fmtp->blit_emulated; + bool planar_fallback = src_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT || + dst_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT; + + pl_rect3d src_rc = params->src_rc, dst_rc = params->dst_rc; + bool requires_scaling = !pl_rect3d_eq(src_rc, dst_rc); + if ((requires_scaling && blit_emulated) || planar_fallback) { + if (!pl_tex_blit_compute(gpu, params)) + PL_ERR(gpu, "Failed emulating texture blit, incompatible textures?"); + return; + } + + struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS); + if (!cmd) + return; + + // When the 
blit operation doesn't require scaling, we can use the more + // efficient vkCmdCopyImage instead of vkCmdBlitImage + if (!requires_scaling) { + vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + pl_rect3d_normalize(&src_rc); + + VkImageCopy region = { + .srcSubresource = { + .aspectMask = src_vk->aspect, + .layerCount = 1, + }, + .dstSubresource = { + .aspectMask = dst_vk->aspect, + .layerCount = 1, + }, + .srcOffset = {src_rc.x0, src_rc.y0, src_rc.z0}, + .dstOffset = {src_rc.x0, src_rc.y0, src_rc.z0}, + .extent = { + pl_rect_w(src_rc), + pl_rect_h(src_rc), + pl_rect_d(src_rc), + }, + }; + + vk->CmdCopyImage(cmd->buf, src_vk->img, src_vk->layout, + dst_vk->img, dst_vk->layout, 1, ®ion); + } else { + vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_BLIT_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_BLIT_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + VkImageBlit region = { + .srcSubresource = { + .aspectMask = src_vk->aspect, + .layerCount = 1, + }, + .dstSubresource = { + .aspectMask = dst_vk->aspect, + .layerCount = 1, + }, + .srcOffsets = {{src_rc.x0, src_rc.y0, src_rc.z0}, + {src_rc.x1, src_rc.y1, src_rc.z1}}, + .dstOffsets = {{dst_rc.x0, dst_rc.y0, dst_rc.z0}, + {dst_rc.x1, dst_rc.y1, dst_rc.z1}}, + }; + + static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = { + [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST, + [PL_TEX_SAMPLE_LINEAR] = VK_FILTER_LINEAR, + }; + + vk->CmdBlitImage(cmd->buf, src_vk->img, src_vk->layout, + dst_vk->img, dst_vk->layout, 1, ®ion, + filters[params->sample_mode]); + } + + CMD_FINISH(&cmd); +} + +// Determine the best queue type to perform a buffer<->image copy on +static enum queue_type vk_img_copy_queue(pl_gpu gpu, pl_tex tex, + const struct VkBufferImageCopy *region) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + const struct pl_tex_vk *tex_vk = PL_PRIV(tex); + enum queue_type queue = tex_vk->transfer_queue; + if (queue != TRANSFER) + return queue; + + VkExtent3D alignment = vk->pool_transfer->props.minImageTransferGranularity; + + enum queue_type fallback = GRAPHICS; + if (gpu->limits.compute_queues > gpu->limits.fragment_queues) + fallback = COMPUTE; // prefer async compute queue + + int tex_w = PL_DEF(tex->params.w, 1), + tex_h = PL_DEF(tex->params.h, 1), + tex_d = PL_DEF(tex->params.d, 1); + + bool full_w = region->imageOffset.x + region->imageExtent.width == tex_w, + full_h = region->imageOffset.y + region->imageExtent.height == tex_h, + full_d = region->imageOffset.z + region->imageExtent.depth == tex_d; + + if (alignment.width) { + + bool unaligned = false; + unaligned |= region->imageOffset.x % alignment.width; + unaligned |= region->imageOffset.y % alignment.height; + unaligned |= region->imageOffset.z % alignment.depth; + unaligned |= (region->imageExtent.width % alignment.width) && !full_w; + unaligned |= (region->imageExtent.height % alignment.height) && !full_h; + unaligned |= (region->imageExtent.depth % alignment.depth) && !full_d; + + return unaligned ? 
fallback : queue; + + } else { + + // an alignment of {0} means the copy must span the entire image + bool unaligned = false; + unaligned |= region->imageOffset.x || !full_w; + unaligned |= region->imageOffset.y || !full_h; + unaligned |= region->imageOffset.z || !full_d; + + return unaligned ? fallback : queue; + + } +} + +static void tex_xfer_cb(void *ctx, void *arg) +{ + void (*fun)(void *priv) = ctx; + fun(arg); +} + +bool vk_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_tex_transfer_params *slices = NULL; + int num_slices = 0; + + if (!params->buf) + return pl_tex_upload_pbo(gpu, params); + + pl_buf buf = params->buf; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_rect3d rc = params->rc; + const size_t size = pl_tex_transfer_size(params); + const size_t buf_offset = buf_vk->mem.offset + params->buf_offset; + bool unaligned = buf_offset % fmt->texel_size; + if (unaligned) + PL_TRACE(gpu, "vk_tex_upload: unaligned transfer (slow path)"); + + if (fmt->emulated || unaligned) { + + // Create all slice buffers first, to early-fail if OOM, and to avoid + // blocking unnecessarily on waiting for these buffers to get read from + num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices); + for (int i = 0; i < num_slices; i++) { + slices[i].buf = pl_buf_create(gpu, pl_buf_params( + .memory_type = PL_BUF_MEM_DEVICE, + .format = tex_vk->texel_fmt, + .size = pl_tex_transfer_size(&slices[i]), + .storable = fmt->emulated, + )); + + if (!slices[i].buf) { + PL_ERR(gpu, "Failed creating buffer for tex upload fallback!"); + num_slices = i; // only clean up buffers up to here + goto error; + } + } + + // All temporary buffers successfully created, begin copying source data + struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue, + params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size, + false); + + for (int i = 0; i < num_slices; i++) { + pl_buf slice = slices[i].buf; + struct pl_buf_vk *slice_vk = PL_PRIV(slice); + vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, 0, slice->params.size, + false); + + vk->CmdCopyBuffer(cmd->buf, buf_vk->mem.buf, slice_vk->mem.buf, 1, &(VkBufferCopy) { + .srcOffset = buf_vk->mem.offset + slices[i].buf_offset, + .dstOffset = slice_vk->mem.offset, + .size = slice->params.size, + }); + } + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + bool ok = CMD_FINISH(&cmd); + + // Finally, dispatch the (texel) upload asynchronously. We can fire + // the callback already at the completion of previous command because + // these temporary buffers already hold persistent copies of the data + for (int i = 0; i < num_slices; i++) { + if (ok) { + slices[i].buf_offset = 0; + ok = fmt->emulated ? 
pl_tex_upload_texel(gpu, &slices[i]) + : pl_tex_upload(gpu, &slices[i]); + } + pl_buf_destroy(gpu, &slices[i].buf); + } + + pl_free(slices); + return ok; + + } else { + + pl_assert(fmt->texel_align == fmt->texel_size); + const VkBufferImageCopy region = { + .bufferOffset = buf_offset, + .bufferRowLength = params->row_pitch / fmt->texel_size, + .bufferImageHeight = params->depth_pitch / params->row_pitch, + .imageOffset = { rc.x0, rc.y0, rc.z0 }, + .imageExtent = { rc.x1, rc.y1, rc.z1 }, + .imageSubresource = { + .aspectMask = tex_vk->aspect, + .layerCount = 1, + }, + }; + + enum queue_type queue = vk_img_copy_queue(gpu, tex, ®ion); + struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size, + false); + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + vk->CmdCopyBufferToImage(cmd->buf, buf_vk->mem.buf, tex_vk->img, + tex_vk->layout, 1, ®ion); + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + return CMD_FINISH(&cmd); + } + + pl_unreachable(); + +error: + for (int i = 0; i < num_slices; i++) + pl_buf_destroy(gpu, &slices[i].buf); + pl_free(slices); + return false; +} + +bool vk_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_tex_transfer_params *slices = NULL; + int num_slices = 0; + + if (!params->buf) + return pl_tex_download_pbo(gpu, params); + + pl_buf buf = params->buf; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_rect3d rc = params->rc; + const size_t size = pl_tex_transfer_size(params); + const size_t buf_offset = buf_vk->mem.offset + params->buf_offset; + bool unaligned = buf_offset % fmt->texel_size; + if (unaligned) + PL_TRACE(gpu, "vk_tex_download: unaligned transfer (slow path)"); + + if (fmt->emulated || unaligned) { + + num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices); + for (int i = 0; i < num_slices; i++) { + slices[i].buf = pl_buf_create(gpu, pl_buf_params( + .memory_type = PL_BUF_MEM_DEVICE, + .format = tex_vk->texel_fmt, + .size = pl_tex_transfer_size(&slices[i]), + .storable = fmt->emulated, + )); + + if (!slices[i].buf) { + PL_ERR(gpu, "Failed creating buffer for tex download fallback!"); + num_slices = i; + goto error; + } + } + + for (int i = 0; i < num_slices; i++) { + // Restore buffer offset after downloading into temporary buffer, + // because we still need to copy the data from the temporary buffer + // into this offset in the original buffer + const size_t tmp_offset = slices[i].buf_offset; + slices[i].buf_offset = 0; + bool ok = fmt->emulated ? 
pl_tex_download_texel(gpu, &slices[i]) + : pl_tex_download(gpu, &slices[i]); + slices[i].buf_offset = tmp_offset; + if (!ok) + goto error; + } + + // Finally, download into the user buffer + struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue, params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size, + false); + + for (int i = 0; i < num_slices; i++) { + pl_buf slice = slices[i].buf; + struct pl_buf_vk *slice_vk = PL_PRIV(slice); + vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, 0, slice->params.size, + false); + + vk->CmdCopyBuffer(cmd->buf, slice_vk->mem.buf, buf_vk->mem.buf, 1, &(VkBufferCopy) { + .srcOffset = slice_vk->mem.offset, + .dstOffset = buf_vk->mem.offset + slices[i].buf_offset, + .size = slice->params.size, + }); + + pl_buf_destroy(gpu, &slices[i].buf); + } + + vk_buf_flush(gpu, cmd, buf, params->buf_offset, size); + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + pl_free(slices); + return CMD_FINISH(&cmd); + + } else { + + pl_assert(params->row_pitch % fmt->texel_size == 0); + pl_assert(params->depth_pitch % params->row_pitch == 0); + const VkBufferImageCopy region = { + .bufferOffset = buf_offset, + .bufferRowLength = params->row_pitch / fmt->texel_size, + .bufferImageHeight = params->depth_pitch / params->row_pitch, + .imageOffset = { rc.x0, rc.y0, rc.z0 }, + .imageExtent = { rc.x1, rc.y1, rc.z1 }, + .imageSubresource = { + .aspectMask = tex_vk->aspect, + .layerCount = 1, + }, + }; + + enum queue_type queue = vk_img_copy_queue(gpu, tex, ®ion); + + struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size, + false); + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + vk->CmdCopyImageToBuffer(cmd->buf, tex_vk->img, tex_vk->layout, + buf_vk->mem.buf, 1, ®ion); + vk_buf_flush(gpu, cmd, buf, params->buf_offset, size); + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + return CMD_FINISH(&cmd); + } + + pl_unreachable(); + +error: + for (int i = 0; i < num_slices; i++) + pl_buf_destroy(gpu, &slices[i].buf); + pl_free(slices); + return false; +} + +bool vk_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + // Opportunistically check if we can re-use this texture without flush + vk_poll_commands(vk, 0); + if (pl_rc_count(&tex_vk->rc) == 1) + goto skip_blocking; + + // Otherwise, we're force to submit any queued command so that the user is + // guaranteed to see progress eventually, even if they call this in a loop + CMD_SUBMIT(NULL); + vk_poll_commands(vk, timeout); + if (pl_rc_count(&tex_vk->rc) > 1) + return true; + + // fall through +skip_blocking: + for (int i = 0; i < tex_vk->num_planes; i++) { + if (vk_tex_poll(gpu, tex->planes[i], timeout)) + return true; + } + + return false; +} + +bool vk_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + + if (tex_vk->num_planes) { + PL_ERR(gpu, "`pl_tex_export` cannot be called on planar textures." 
+ "Please see `pl_vulkan_hold_ex` for a replacement."); + return false; + } + + struct vk_cmd *cmd = CMD_BEGIN(ANY); + if (!cmd) + goto error; + + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_NONE, + 0, VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_EXTERNAL); + + // Make the next barrier appear as though coming from a different queue + tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL; + + vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, (pl_vulkan_sem){ sync_vk->wait }); + if (!CMD_SUBMIT(&cmd)) + goto error; + + // Remember the other dependency and hold on to the sync object + PL_ARRAY_APPEND(tex, tex_vk->ext_deps, (pl_vulkan_sem){ sync_vk->signal }); + pl_rc_ref(&sync_vk->rc); + tex_vk->ext_sync = sync; + tex_vk->qf = VK_QUEUE_FAMILY_EXTERNAL; + return true; + +error: + PL_ERR(gpu, "Failed exporting shared texture!"); + return false; +} + +pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params) +{ + pl_fmt fmt = NULL; + for (int i = 0; i < gpu->num_formats; i++) { + const struct vk_format **vkfmt = PL_PRIV(gpu->formats[i]); + if ((*vkfmt)->tfmt == params->format) { + fmt = gpu->formats[i]; + break; + } + } + + if (!fmt) { + PL_ERR(gpu, "Could not find pl_fmt suitable for wrapped image " + "with format %s", vk_fmt_name(params->format)); + return NULL; + } + + VkImageUsageFlags usage = params->usage; + if (fmt->num_planes) + usage = 0; // mask capabilities from the base texture + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk); + tex->params = (struct pl_tex_params) { + .format = fmt, + .w = params->width, + .h = params->height, + .d = params->depth, + .sampleable = !!(usage & VK_IMAGE_USAGE_SAMPLED_BIT), + .renderable = !!(usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT), + .storable = !!(usage & VK_IMAGE_USAGE_STORAGE_BIT), + .blit_src = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + .blit_dst = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT), + .host_writable = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT), + .host_readable = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + .user_data = params->user_data, + .debug_tag = params->debug_tag, + }; + + // Mask out capabilities not permitted by the `pl_fmt` +#define MASK(field, cap) \ + do { \ + if (tex->params.field && !(fmt->caps & cap)) { \ + PL_WARN(gpu, "Masking `" #field "` from wrapped texture because " \ + "the corresponding format '%s' does not support " #cap, \ + fmt->name); \ + tex->params.field = false; \ + } \ + } while (0) + + MASK(sampleable, PL_FMT_CAP_SAMPLEABLE); + MASK(renderable, PL_FMT_CAP_RENDERABLE); + MASK(storable, PL_FMT_CAP_STORABLE); + MASK(blit_src, PL_FMT_CAP_BLITTABLE); + MASK(blit_dst, PL_FMT_CAP_BLITTABLE); + MASK(host_readable, PL_FMT_CAP_HOST_READABLE); +#undef MASK + + // For simplicity, explicitly mask out blit emulation for wrapped textures + struct pl_fmt_vk *fmtp = PL_PRIV(fmt); + if (fmtp->blit_emulated) { + tex->params.blit_src = false; + tex->params.blit_dst = false; + } + + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + switch (pl_tex_params_dimension(tex->params)) { + case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break; + case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break; + case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break; + } + tex_vk->external_img = true; + tex_vk->held = !fmt->num_planes; + tex_vk->img = params->image; + tex_vk->img_fmt = params->format; + tex_vk->num_planes = fmt->num_planes; + tex_vk->usage_flags = usage; + tex_vk->aspect = params->aspect; + + if (!tex_vk->aspect) { + for (int i = 0; i < tex_vk->num_planes; i++) + tex_vk->aspect |= 
VK_IMAGE_ASPECT_PLANE_0_BIT << i; + tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT); + } + + // Blitting to planar images requires fallback via compute shaders + if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) { + tex->params.blit_src &= tex->params.storable; + tex->params.blit_dst &= tex->params.storable; + } + + static const char * const wrapped_plane_names[4] = { + "wrapped plane 0", "wrapped plane 1", "wrapped plane 2", "wrapped plane 3", + }; + + for (int i = 0; i < tex_vk->num_planes; i++) { + struct pl_tex_t *plane; + VkImageAspectFlags aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << i; + if (!(aspect & tex_vk->aspect)) { + PL_INFO(gpu, "Not wrapping plane %d due to aspect bit 0x%x not " + "being contained in supplied params->aspect 0x%x!", + i, (unsigned) aspect, (unsigned) tex_vk->aspect); + continue; + } + + pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D); + plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params( + .image = tex_vk->img, + .aspect = aspect, + .width = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x), + .height = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y), + .format = fmtp->vk_fmt->pfmt[i].fmt, + .usage = params->usage, + .user_data = params->user_data, + .debug_tag = PL_DEF(params->debug_tag, wrapped_plane_names[i]), + )); + if (!plane) + goto error; + plane->parent = tex; + tex->planes[i] = plane; + tex_vk->planes[i] = PL_PRIV(plane); + } + + if (!vk_init_image(gpu, tex, PL_DEF(params->debug_tag, "wrapped"))) + goto error; + + return tex; + +error: + vk_tex_destroy(gpu, tex); + return NULL; +} + +VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex, VkFormat *out_format, + VkImageUsageFlags *out_flags) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + if (out_format) + *out_format = tex_vk->img_fmt; + if (out_flags) + *out_flags = tex_vk->usage_flags; + + return tex_vk->img; +} + +bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(params->tex); + pl_assert(params->semaphore.sem); + + bool held = tex_vk->held; + for (int i = 0; i < tex_vk->num_planes; i++) + held |= tex_vk->planes[i]->held; + + if (held) { + PL_ERR(gpu, "Attempting to hold an already held image!"); + return false; + } + + struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS); + if (!cmd) { + PL_ERR(gpu, "Failed holding external image!"); + return false; + } + + VkImageLayout layout = params->layout; + if (params->out_layout) { + // For planar images, arbitrarily pick the current image layout of the + // first plane. This should be fine in practice, since all planes will + // share the same usage capabilities. 
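+        // (Editorial aside, not part of the original source.) A rough sketch of how an
+        // interop caller typically drives this function; `ready_sem` is a placeholder
+        // semaphore supplied by the caller:
+        //
+        //     VkImageLayout layout;
+        //     if (pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+        //             .tex        = tex,
+        //             .out_layout = &layout,
+        //             .semaphore  = (pl_vulkan_sem) { .sem = ready_sem },
+        //             .qf         = VK_QUEUE_FAMILY_IGNORED,
+        //         )))
+        //     {
+        //         // wait on ready_sem, use pl_vulkan_unwrap(gpu, tex, NULL, NULL), then:
+        //         pl_vulkan_release_ex(gpu, pl_vulkan_release_params(
+        //             .tex    = tex,
+        //             .layout = layout,
+        //             .qf     = VK_QUEUE_FAMILY_IGNORED,
+        //         ));
+        //     }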
+ if (tex_vk->num_planes) { + layout = tex_vk->planes[0]->layout; + } else { + layout = tex_vk->layout; + } + } + + bool may_invalidate = true; + if (!tex_vk->num_planes) { + may_invalidate &= tex_vk->may_invalidate; + vk_tex_barrier(gpu, cmd, params->tex, VK_PIPELINE_STAGE_2_NONE, + 0, layout, params->qf); + } + + for (int i = 0; i < tex_vk->num_planes; i++) { + may_invalidate &= tex_vk->planes[i]->may_invalidate; + vk_tex_barrier(gpu, cmd, params->tex->planes[i], + VK_PIPELINE_STAGE_2_NONE, 0, layout, params->qf); + } + + vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, params->semaphore); + bool ok = CMD_SUBMIT(&cmd); + + if (!tex_vk->num_planes) { + tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL; + tex_vk->held = ok; + } + + for (int i = 0; i < tex_vk->num_planes; i++) { + struct pl_tex_vk *plane_vk = tex_vk->planes[i]; + plane_vk->sem.write.queue = plane_vk->sem.read.queue = NULL; + plane_vk->held = ok; + } + + if (ok && params->out_layout) + *params->out_layout = may_invalidate ? VK_IMAGE_LAYOUT_UNDEFINED : layout; + + return ok; +} + +void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(params->tex); + if (tex_vk->num_planes) { + struct pl_vulkan_release_params plane_pars = *params; + for (int i = 0; i < tex_vk->num_planes; i++) { + plane_pars.tex = params->tex->planes[i]; + pl_vulkan_release_ex(gpu, &plane_pars); + } + return; + } + + if (!tex_vk->held) { + PL_ERR(gpu, "Attempting to release an unheld image?"); + return; + } + + if (params->semaphore.sem) + PL_ARRAY_APPEND(params->tex, tex_vk->ext_deps, params->semaphore); + + tex_vk->qf = params->qf; + tex_vk->layout = params->layout; + tex_vk->held = false; +} + +bool pl_vulkan_hold(pl_gpu gpu, pl_tex tex, VkImageLayout layout, + pl_vulkan_sem sem_out) +{ + return pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = tex, + .layout = layout, + .semaphore = sem_out, + .qf = VK_QUEUE_FAMILY_IGNORED, + )); +} + +bool pl_vulkan_hold_raw(pl_gpu gpu, pl_tex tex, + VkImageLayout *out_layout, + pl_vulkan_sem sem_out) +{ + return pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = tex, + .out_layout = out_layout, + .semaphore = sem_out, + .qf = VK_QUEUE_FAMILY_IGNORED, + )); +} + +void pl_vulkan_release(pl_gpu gpu, pl_tex tex, VkImageLayout layout, + pl_vulkan_sem sem_in) +{ + pl_vulkan_release_ex(gpu, pl_vulkan_release_params( + .tex = tex, + .layout = layout, + .semaphore = sem_in, + .qf = VK_QUEUE_FAMILY_IGNORED, + )); +} diff --git a/src/vulkan/malloc.c b/src/vulkan/malloc.c new file mode 100644 index 0000000..c35183b --- /dev/null +++ b/src/vulkan/malloc.c @@ -0,0 +1,1058 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "malloc.h" +#include "command.h" +#include "utils.h" +#include "pl_thread.h" + +#ifdef PL_HAVE_UNIX +#include <errno.h> +#include <unistd.h> +#endif + +// Controls the page size alignment, to help coalesce allocations into the same +// slab. Pages are rounded up to multiples of this value. (Default: 4 KB) +#define PAGE_SIZE_ALIGN (1LLU << 12) + +// Controls the minimum/maximum number of pages for new slabs. As slabs are +// exhausted of memory, the number of pages per new slab grows exponentially, +// starting with the minimum until the maximum is reached. +// +// Note: The maximum must never exceed the size of `vk_slab.spacemap`. +#define MINIMUM_PAGE_COUNT 4 +#define MAXIMUM_PAGE_COUNT (sizeof(uint64_t) * 8) + +// Controls the maximum page size. Any allocations above this threshold +// (absolute size or fraction of VRAM, whichever is higher) will be served by +// dedicated allocations. (Default: 64 MB or 1/16 of VRAM) +#define MAXIMUM_PAGE_SIZE_ABSOLUTE (1LLU << 26) +#define MAXIMUM_PAGE_SIZE_RELATIVE 16 + +// Controls the minimum slab size, to avoid excessive re-allocation of very +// small slabs. (Default: 256 KB) +#define MINIMUM_SLAB_SIZE (1LLU << 18) + +// How long to wait before garbage collecting empty slabs. Slabs older than +// this many invocations of `vk_malloc_garbage_collect` will be released. +#define MAXIMUM_SLAB_AGE 32 + +// A single slab represents a contiguous region of allocated memory. Actual +// allocations are served as pages of this. Slabs are organized into pools, +// each of which contains a list of slabs of differing page sizes. +struct vk_slab { + pl_mutex lock; + pl_debug_tag debug_tag; // debug tag of the triggering allocation + VkDeviceMemory mem; // underlying device allocation + VkDeviceSize size; // total allocated size of `mem` + VkMemoryType mtype; // underlying memory type + bool dedicated; // slab is allocated specifically for one object + bool imported; // slab represents an imported memory allocation + + // free space accounting (only for non-dedicated slabs) + uint64_t spacemap; // bitset of available pages + size_t pagesize; // size in bytes per page + size_t used; // number of bytes actually in use + uint64_t age; // timestamp of last use + + // optional, depends on the memory type: + VkBuffer buffer; // buffer spanning the entire slab + void *data; // mapped memory corresponding to `mem` + bool coherent; // mapped memory is coherent + union pl_handle handle; // handle associated with this device memory + enum pl_handle_type handle_type; +}; + +// Represents a single memory pool. We keep track of a vk_pool for each +// combination of malloc parameters. This shouldn't actually be that many in +// practice, because some combinations simply never occur, and others will +// generally be the same for the same objects. +// +// Note: `vk_pool` addresses are not immutable, so we mustn't expose any +// dangling references to a `vk_pool` from e.g. `vk_memslice.priv = vk_slab`. +struct vk_pool { + struct vk_malloc_params params; // allocation params (with some fields nulled) + PL_ARRAY(struct vk_slab *) slabs; // array of slabs, unsorted + int index; // running index in `vk_malloc.pools` +}; + +// The overall state of the allocator, which keeps track of a vk_pool for each +// memory type. 
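+// (Editorial aside, not part of the original source.) The `spacemap` bitset above is
+// what keeps page accounting cheap: claiming and releasing a page are single bit
+// operations, mirroring what `pool_get_page` and `vk_malloc_free` do further down:
+//
+//     int page_idx = __builtin_ffsll(slab->spacemap) - 1;    // index of first free page
+//     if (page_idx >= 0) {                                    // otherwise the slab is full
+//         slab->spacemap ^= 1LLU << page_idx;                 // claim the page
+//         VkDeviceSize offset = (VkDeviceSize) page_idx * slab->pagesize;
+//         // ... hand out `offset` ... and later, on free:
+//         slab->spacemap |= 1LLU << page_idx;                 // release the page again
+//     }
+//
+// This is also why MAXIMUM_PAGE_COUNT is capped at the 64 bits of `spacemap`.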
+struct vk_malloc { + struct vk_ctx *vk; + pl_mutex lock; + VkPhysicalDeviceMemoryProperties props; + size_t maximum_page_size; + PL_ARRAY(struct vk_pool) pools; + uint64_t age; +}; + +static inline float efficiency(size_t used, size_t total) +{ + if (!total) + return 100.0; + + return 100.0f * used / total; +} + +static const char *print_size(char buf[8], size_t size) +{ + const char *suffixes = "\0KMG"; + while (suffixes[1] && size > 9999) { + size >>= 10; + suffixes++; + } + + int ret = *suffixes ? snprintf(buf, 8, "%4zu%c", size, *suffixes) + : snprintf(buf, 8, "%5zu", size); + + return ret >= 0 ? buf : "(error)"; +} + +#define PRINT_SIZE(x) (print_size((char[8]){0}, (size_t) (x))) + +void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level lev) +{ + struct vk_ctx *vk = ma->vk; + size_t total_size = 0; + size_t total_used = 0; + size_t total_res = 0; + + PL_MSG(vk, lev, "Memory heaps supported by device:"); + for (int i = 0; i < ma->props.memoryHeapCount; i++) { + VkMemoryHeap heap = ma->props.memoryHeaps[i]; + PL_MSG(vk, lev, " %d: flags 0x%x size %s", + i, (unsigned) heap.flags, PRINT_SIZE(heap.size)); + } + + PL_DEBUG(vk, "Memory types supported by device:"); + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + VkMemoryType type = ma->props.memoryTypes[i]; + PL_DEBUG(vk, " %d: flags 0x%x heap %d", + i, (unsigned) type.propertyFlags, (int) type.heapIndex); + } + + pl_mutex_lock(&ma->lock); + for (int i = 0; i < ma->pools.num; i++) { + struct vk_pool *pool = &ma->pools.elem[i]; + const struct vk_malloc_params *par = &pool->params; + + PL_MSG(vk, lev, "Memory pool %d:", i); + PL_MSG(vk, lev, " Compatible types: 0x%"PRIx32, par->reqs.memoryTypeBits); + if (par->required) + PL_MSG(vk, lev, " Required flags: 0x%"PRIx32, par->required); + if (par->optimal) + PL_MSG(vk, lev, " Optimal flags: 0x%"PRIx32, par->optimal); + if (par->buf_usage) + PL_MSG(vk, lev, " Buffer flags: 0x%"PRIx32, par->buf_usage); + if (par->export_handle) + PL_MSG(vk, lev, " Export handle: 0x%x", par->export_handle); + + size_t pool_size = 0; + size_t pool_used = 0; + size_t pool_res = 0; + + for (int j = 0; j < pool->slabs.num; j++) { + struct vk_slab *slab = pool->slabs.elem[j]; + pl_mutex_lock(&slab->lock); + + size_t avail = __builtin_popcountll(slab->spacemap) * slab->pagesize; + size_t slab_res = slab->size - avail; + + PL_MSG(vk, lev, " Slab %2d: %8"PRIx64" x %s: " + "%s used %s res %s alloc from heap %d, efficiency %.2f%% [%s]", + j, slab->spacemap, PRINT_SIZE(slab->pagesize), + PRINT_SIZE(slab->used), PRINT_SIZE(slab_res), + PRINT_SIZE(slab->size), (int) slab->mtype.heapIndex, + efficiency(slab->used, slab_res), + PL_DEF(slab->debug_tag, "unknown")); + + pool_size += slab->size; + pool_used += slab->used; + pool_res += slab_res; + pl_mutex_unlock(&slab->lock); + } + + PL_MSG(vk, lev, " Pool summary: %s used %s res %s alloc, " + "efficiency %.2f%%, utilization %.2f%%", + PRINT_SIZE(pool_used), PRINT_SIZE(pool_res), + PRINT_SIZE(pool_size), efficiency(pool_used, pool_res), + efficiency(pool_res, pool_size)); + + total_size += pool_size; + total_used += pool_used; + total_res += pool_res; + } + pl_mutex_unlock(&ma->lock); + + PL_MSG(vk, lev, "Memory summary: %s used %s res %s alloc, " + "efficiency %.2f%%, utilization %.2f%%, max page: %s", + PRINT_SIZE(total_used), PRINT_SIZE(total_res), + PRINT_SIZE(total_size), efficiency(total_used, total_res), + efficiency(total_res, total_size), + PRINT_SIZE(ma->maximum_page_size)); +} + +static void slab_free(struct vk_ctx *vk, struct vk_slab *slab) +{ + if 
(!slab) + return; + +#ifndef NDEBUG + if (!slab->dedicated && slab->used > 0) { + PL_WARN(vk, "Leaked %zu bytes of vulkan memory!", slab->used); + PL_WARN(vk, "slab total size: %zu bytes, heap: %d, flags: 0x%"PRIX64, + (size_t) slab->size, (int) slab->mtype.heapIndex, + (uint64_t) slab->mtype.propertyFlags); + if (slab->debug_tag) + PL_WARN(vk, "last used for: %s", slab->debug_tag); + pl_log_stack_trace(vk->log, PL_LOG_WARN); + pl_debug_abort(); + } +#endif + + if (slab->imported) { + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: + PL_TRACE(vk, "Unimporting slab of size %s from fd: %d", + PRINT_SIZE(slab->size), slab->handle.fd); + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: +#ifdef PL_HAVE_WIN32 + PL_TRACE(vk, "Unimporting slab of size %s from handle: %p", + PRINT_SIZE(slab->size), (void *) slab->handle.handle); +#endif + break; + case PL_HANDLE_HOST_PTR: + PL_TRACE(vk, "Unimporting slab of size %s from ptr: %p", + PRINT_SIZE(slab->size), (void *) slab->handle.ptr); + break; + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + pl_unreachable(); + } + } else { + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: +#ifdef PL_HAVE_UNIX + if (slab->handle.fd > -1) + close(slab->handle.fd); +#endif + break; + case PL_HANDLE_WIN32: +#ifdef PL_HAVE_WIN32 + if (slab->handle.handle != NULL) + CloseHandle(slab->handle.handle); +#endif + break; + case PL_HANDLE_WIN32_KMT: + // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed. + break; + case PL_HANDLE_HOST_PTR: + // Implicitly unmapped + break; + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + pl_unreachable(); + } + + PL_DEBUG(vk, "Freeing slab of size %s", PRINT_SIZE(slab->size)); + } + + vk->DestroyBuffer(vk->dev, slab->buffer, PL_VK_ALLOC); + // also implicitly unmaps the memory if needed + vk->FreeMemory(vk->dev, slab->mem, PL_VK_ALLOC); + + pl_mutex_destroy(&slab->lock); + pl_free(slab); +} + +// type_mask: optional +// thread-safety: safe +static bool find_best_memtype(const struct vk_malloc *ma, uint32_t type_mask, + const struct vk_malloc_params *params, + uint32_t *out_index) +{ + struct vk_ctx *vk = ma->vk; + int best = -1; + + // The vulkan spec requires memory types to be sorted in the "optimal" + // order, so the first matching type we find will be the best/fastest one. + // That being said, we still want to prioritize memory types that have + // better optional flags. 
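+    // (Editorial aside, not part of the original source.) Callers express hard
+    // constraints via `required` and preferences via `optimal`; e.g. texture creation
+    // elsewhere in this commit only *prefers* device-local memory:
+    //
+    //     struct vk_malloc_params p = {
+    //         .reqs    = reqs,                                 // as queried from the driver
+    //         .optimal = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,  // preferred, not required
+    //     };
+    //
+    // Given one HOST_VISIBLE|HOST_COHERENT type and one DEVICE_LOCAL type, the
+    // popcount-based score below then picks the DEVICE_LOCAL one (score 1 vs. 0).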
+ + type_mask &= params->reqs.memoryTypeBits; + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + const VkMemoryType *mtype = &ma->props.memoryTypes[i]; + + // The memory type flags must include our properties + if ((mtype->propertyFlags & params->required) != params->required) + continue; + + // The memory heap must be large enough for the allocation + VkDeviceSize heapSize = ma->props.memoryHeaps[mtype->heapIndex].size; + if (params->reqs.size > heapSize) + continue; + + // The memory type must be supported by the type mask (bitfield) + if (!(type_mask & (1LU << i))) + continue; + + // Calculate the score as the number of optimal property flags matched + int score = __builtin_popcountl(mtype->propertyFlags & params->optimal); + if (score > best) { + *out_index = i; + best = score; + } + } + + if (best < 0) { + PL_ERR(vk, "Found no memory type matching property flags 0x%x and type " + "bits 0x%x!", + (unsigned) params->required, (unsigned) type_mask); + return false; + } + + return true; +} + +static bool buf_external_check(struct vk_ctx *vk, VkBufferUsageFlags usage, + enum pl_handle_type handle_type, bool import) +{ + if (!handle_type) + return true; + + VkPhysicalDeviceExternalBufferInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR, + .usage = usage, + .handleType = vk_mem_handle_type(handle_type), + }; + + VkExternalBufferProperties props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR, + }; + + if (!info.handleType) + return false; + + vk->GetPhysicalDeviceExternalBufferProperties(vk->physd, &info, &props); + return vk_external_mem_check(vk, &props.externalMemoryProperties, + handle_type, import); +} + +// thread-safety: safe +static struct vk_slab *slab_alloc(struct vk_malloc *ma, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + struct vk_slab *slab = pl_alloc_ptr(NULL, slab); + *slab = (struct vk_slab) { + .age = ma->age, + .size = params->reqs.size, + .handle_type = params->export_handle, + .debug_tag = params->debug_tag, + }; + pl_mutex_init(&slab->lock); + + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: + slab->handle.fd = -1; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + slab->handle.handle = NULL; + break; + case PL_HANDLE_HOST_PTR: + slab->handle.ptr = NULL; + break; + } + + VkExportMemoryAllocateInfoKHR ext_info = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR, + .handleTypes = vk_mem_handle_type(slab->handle_type), + }; + + uint32_t type_mask = UINT32_MAX; + if (params->buf_usage) { + // Queue family sharing modes don't matter for buffers, so we just + // set them as concurrent and stop worrying about it. + uint32_t qfs[3] = {0}; + pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs)); + for (int i = 0; i < vk->pools.num; i++) + qfs[i] = vk->pools.elem[i]->qf; + + VkExternalMemoryBufferCreateInfoKHR ext_buf_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR, + .handleTypes = ext_info.handleTypes, + }; + + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = slab->handle_type ? &ext_buf_info : NULL, + .size = slab->size, + .usage = params->buf_usage, + .sharingMode = vk->pools.num > 1 ? 
VK_SHARING_MODE_CONCURRENT + : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = vk->pools.num, + .pQueueFamilyIndices = qfs, + }; + + if (!buf_external_check(vk, binfo.usage, slab->handle_type, false)) { + PL_ERR(vk, "Failed allocating shared memory buffer: possibly " + "the handle type is unsupported?"); + goto error; + } + + VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &slab->buffer)); + PL_VK_NAME(BUFFER, slab->buffer, "slab"); + + VkMemoryRequirements reqs = {0}; + vk->GetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs); + slab->size = reqs.size; // this can be larger than `slab->size` + type_mask = reqs.memoryTypeBits; + + // Note: we can ignore `reqs.align` because we always bind the buffer + // memory to offset 0 + } + + VkMemoryAllocateInfo minfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = slab->size, + }; + + if (params->export_handle) + vk_link_struct(&minfo, &ext_info); + + VkMemoryDedicatedAllocateInfoKHR dinfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR, + .image = params->ded_image, + }; + + if (params->ded_image) + vk_link_struct(&minfo, &dinfo); + + if (!find_best_memtype(ma, type_mask, params, &minfo.memoryTypeIndex)) + goto error; + + const VkMemoryType *mtype = &ma->props.memoryTypes[minfo.memoryTypeIndex]; + PL_DEBUG(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d: %s", + (size_t) slab->size, (unsigned) mtype->propertyFlags, + (int) minfo.memoryTypeIndex, (int) mtype->heapIndex, + PL_DEF(params->debug_tag, "unknown")); + + pl_clock_t start = pl_clock_now(); + + VkResult res = vk->AllocateMemory(vk->dev, &minfo, PL_VK_ALLOC, &slab->mem); + switch (res) { + case VK_ERROR_OUT_OF_DEVICE_MEMORY: + case VK_ERROR_OUT_OF_HOST_MEMORY: + PL_ERR(vk, "Allocation of size %s failed: %s!", + PRINT_SIZE(slab->size), vk_res_str(res)); + vk_malloc_print_stats(ma, PL_LOG_ERR); + pl_log_stack_trace(vk->log, PL_LOG_ERR); + pl_debug_abort(); + goto error; + + default: + PL_VK_ASSERT(res, "vkAllocateMemory"); + } + + slab->mtype = *mtype; + if (mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + slab->coherent = mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + } + + if (slab->buffer) + VK(vk->BindBufferMemory(vk->dev, slab->buffer, slab->mem, 0)); + +#ifdef PL_HAVE_UNIX + if (slab->handle_type == PL_HANDLE_FD || + slab->handle_type == PL_HANDLE_DMA_BUF) + { + VkMemoryGetFdInfoKHR fd_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, + .memory = slab->mem, + .handleType = ext_info.handleTypes, + }; + + VK(vk->GetMemoryFdKHR(vk->dev, &fd_info, &slab->handle.fd)); + } +#endif + +#ifdef PL_HAVE_WIN32 + if (slab->handle_type == PL_HANDLE_WIN32 || + slab->handle_type == PL_HANDLE_WIN32_KMT) + { + VkMemoryGetWin32HandleInfoKHR handle_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, + .memory = slab->mem, + .handleType = ext_info.handleTypes, + }; + + VK(vk->GetMemoryWin32HandleKHR(vk->dev, &handle_info, + &slab->handle.handle)); + } +#endif + + pl_log_cpu_time(vk->log, start, pl_clock_now(), "allocating slab"); + + // free space accounting is done by the caller + return slab; + +error: + if (params->debug_tag) + PL_ERR(vk, " for malloc: %s", params->debug_tag); + slab_free(vk, slab); + return NULL; +} + +static void pool_uninit(struct vk_ctx *vk, struct vk_pool *pool) +{ + for (int i = 0; i < pool->slabs.num; i++) + slab_free(vk, pool->slabs.elem[i]); + + pl_free(pool->slabs.elem); + 
*pool = (struct vk_pool) {0}; +} + +struct vk_malloc *vk_malloc_create(struct vk_ctx *vk) +{ + struct vk_malloc *ma = pl_zalloc_ptr(NULL, ma); + pl_mutex_init(&ma->lock); + vk->GetPhysicalDeviceMemoryProperties(vk->physd, &ma->props); + ma->vk = vk; + + // Determine maximum page size + ma->maximum_page_size = MAXIMUM_PAGE_SIZE_ABSOLUTE; + for (int i = 0; i < ma->props.memoryHeapCount; i++) { + VkMemoryHeap heap = ma->props.memoryHeaps[i]; + if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) { + size_t size_max = heap.size / MAXIMUM_PAGE_SIZE_RELATIVE; + ma->maximum_page_size = PL_MAX(ma->maximum_page_size, size_max); + } + } + + vk_malloc_print_stats(ma, PL_LOG_INFO); + return ma; +} + +void vk_malloc_destroy(struct vk_malloc **ma_ptr) +{ + struct vk_malloc *ma = *ma_ptr; + if (!ma) + return; + + vk_malloc_print_stats(ma, PL_LOG_DEBUG); + for (int i = 0; i < ma->pools.num; i++) + pool_uninit(ma->vk, &ma->pools.elem[i]); + + pl_mutex_destroy(&ma->lock); + pl_free_ptr(ma_ptr); +} + +void vk_malloc_garbage_collect(struct vk_malloc *ma) +{ + struct vk_ctx *vk = ma->vk; + + pl_mutex_lock(&ma->lock); + ma->age++; + + for (int i = 0; i < ma->pools.num; i++) { + struct vk_pool *pool = &ma->pools.elem[i]; + for (int n = 0; n < pool->slabs.num; n++) { + struct vk_slab *slab = pool->slabs.elem[n]; + pl_mutex_lock(&slab->lock); + if (slab->used || (ma->age - slab->age) <= MAXIMUM_SLAB_AGE) { + pl_mutex_unlock(&slab->lock); + continue; + } + + PL_DEBUG(vk, "Garbage collected slab of size %s from pool %d", + PRINT_SIZE(slab->size), pool->index); + + pl_mutex_unlock(&slab->lock); + slab_free(ma->vk, slab); + PL_ARRAY_REMOVE_AT(pool->slabs, n--); + } + } + + pl_mutex_unlock(&ma->lock); +} + +pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import) +{ + struct vk_ctx *vk = ma->vk; + pl_handle_caps caps = 0; + + for (int i = 0; vk_mem_handle_list[i]; i++) { + // Try seeing if we could allocate a "basic" buffer using these + // capabilities, with no fancy buffer usage. More specific checks will + // happen down the line at VkBuffer creation time, but this should give + // us a rough idea of what the driver supports. 
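+        // (Editorial aside, not part of the original source.) The result is a bitmask
+        // of pl_handle_type flags, so a caller can probe support roughly like this:
+        //
+        //     pl_handle_caps import_caps = vk_malloc_handle_caps(ma, true /* import */);
+        //     if (import_caps & PL_HANDLE_DMA_BUF) {
+        //         // dma-buf import is likely usable on this device
+        //     }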
+ enum pl_handle_type type = vk_mem_handle_list[i]; + if (buf_external_check(vk, VK_BUFFER_USAGE_TRANSFER_DST_BIT, type, import)) + caps |= type; + } + + return caps; +} + +void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice) +{ + struct vk_ctx *vk = ma->vk; + struct vk_slab *slab = slice->priv; + if (!slab || slab->dedicated) { + slab_free(vk, slab); + goto done; + } + + pl_mutex_lock(&slab->lock); + + int page_idx = slice->offset / slab->pagesize; + slab->spacemap |= 0x1LLU << page_idx; + slab->used -= slice->size; + slab->age = ma->age; + pl_assert(slab->used >= 0); + + pl_mutex_unlock(&slab->lock); + +done: + *slice = (struct vk_memslice) {0}; +} + +static inline bool pool_params_eq(const struct vk_malloc_params *a, + const struct vk_malloc_params *b) +{ + return a->reqs.size == b->reqs.size && + a->reqs.alignment == b->reqs.alignment && + a->reqs.memoryTypeBits == b->reqs.memoryTypeBits && + a->required == b->required && + a->optimal == b->optimal && + a->buf_usage == b->buf_usage && + a->export_handle == b->export_handle; +} + +static struct vk_pool *find_pool(struct vk_malloc *ma, + const struct vk_malloc_params *params) +{ + pl_assert(!params->import_handle); + pl_assert(!params->ded_image); + + struct vk_malloc_params fixed = *params; + fixed.reqs.alignment = 0; + fixed.reqs.size = 0; + fixed.shared_mem = (struct pl_shared_mem) {0}; + + for (int i = 0; i < ma->pools.num; i++) { + if (pool_params_eq(&ma->pools.elem[i].params, &fixed)) + return &ma->pools.elem[i]; + } + + // Not found => add it + PL_ARRAY_GROW(ma, ma->pools); + size_t idx = ma->pools.num++; + ma->pools.elem[idx] = (struct vk_pool) { + .params = fixed, + .index = idx, + }; + return &ma->pools.elem[idx]; +} + +// Returns a suitable memory page from the pool. A new slab will be allocated +// under the hood, if necessary. +// +// Note: This locks the slab it returns +static struct vk_slab *pool_get_page(struct vk_malloc *ma, struct vk_pool *pool, + size_t size, size_t align, + VkDeviceSize *offset) +{ + struct vk_slab *slab = NULL; + int slab_pages = MINIMUM_PAGE_COUNT; + size = PL_ALIGN2(size, PAGE_SIZE_ALIGN); + const size_t pagesize = PL_ALIGN(size, align); + + for (int i = 0; i < pool->slabs.num; i++) { + slab = pool->slabs.elem[i]; + if (slab->pagesize < size) + continue; + if (slab->pagesize > pagesize * MINIMUM_PAGE_COUNT) // rough heuristic + continue; + if (slab->pagesize % align) + continue; + + pl_mutex_lock(&slab->lock); + int page_idx = __builtin_ffsll(slab->spacemap); + if (!page_idx--) { + pl_mutex_unlock(&slab->lock); + // Increase the number of slabs to allocate for new slabs the + // more existing full slabs exist for this size range + slab_pages = PL_MIN(slab_pages << 1, MAXIMUM_PAGE_COUNT); + continue; + } + + slab->spacemap ^= 0x1LLU << page_idx; + *offset = page_idx * slab->pagesize; + return slab; + } + + // Otherwise, allocate a new vk_slab and append it to the list. 
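+    // (Editorial aside, not part of the original source.) Worked example of the sizing
+    // arithmetic below for a first slab (slab_pages == MINIMUM_PAGE_COUNT == 4) serving
+    // 20 KiB requests with 4 KiB alignment:
+    //
+    //     pagesize   = 20 KiB                       (already page- and alignment-rounded)
+    //     slab_size  = 4 * 20 KiB = 80 KiB          -> clamped up to MINIMUM_SLAB_SIZE (256 KiB)
+    //     slab_pages = 256 KiB / 20 KiB = 12        (integer division)
+    //     slab_size  = 12 * 20 KiB = 240 KiB        (excess beyond a whole page trimmed)
+    //
+    // i.e. the first slab for this pool ends up holding 12 pages of 20 KiB each.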
+ VkDeviceSize slab_size = slab_pages * pagesize; + pl_static_assert(MINIMUM_SLAB_SIZE <= PAGE_SIZE_ALIGN * MAXIMUM_PAGE_COUNT); + const VkDeviceSize max_slab_size = ma->maximum_page_size * MINIMUM_PAGE_COUNT; + pl_assert(pagesize <= ma->maximum_page_size); + slab_size = PL_CLAMP(slab_size, MINIMUM_SLAB_SIZE, max_slab_size); + slab_pages = slab_size / pagesize; + slab_size = slab_pages * pagesize; // max_slab_size may be npot2, trim excess + + struct vk_malloc_params params = pool->params; + params.reqs.size = slab_size; + + // Don't hold the lock while allocating the slab, because it can be a + // potentially very costly operation. + pl_mutex_unlock(&ma->lock); + slab = slab_alloc(ma, ¶ms); + pl_mutex_lock(&ma->lock); + if (!slab) + return NULL; + pl_mutex_lock(&slab->lock); + + slab->spacemap = (slab_pages == sizeof(uint64_t) * 8) ? ~0LLU : ~(~0LLU << slab_pages); + slab->pagesize = pagesize; + PL_ARRAY_APPEND(NULL, pool->slabs, slab); + + // Return the first page in this newly allocated slab + slab->spacemap ^= 0x1; + *offset = 0; + return slab; +} + +static bool vk_malloc_import(struct vk_malloc *ma, struct vk_memslice *out, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type; + vk_handle_type = vk_mem_handle_type(params->import_handle); + + struct vk_slab *slab = NULL; + const struct pl_shared_mem *shmem = ¶ms->shared_mem; + + VkMemoryDedicatedAllocateInfoKHR dinfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR, + .image = params->ded_image, + }; + + VkImportMemoryFdInfoKHR fdinfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .handleType = vk_handle_type, + .fd = -1, + }; + + VkImportMemoryHostPointerInfoEXT ptrinfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT, + .handleType = vk_handle_type, + }; + + VkMemoryAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = shmem->size, + }; + + if (params->ded_image) + vk_link_struct(&ainfo, &dinfo); + + VkBuffer buffer = VK_NULL_HANDLE; + VkMemoryRequirements reqs = params->reqs; + + if (params->buf_usage) { + uint32_t qfs[3] = {0}; + pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs)); + for (int i = 0; i < vk->pools.num; i++) + qfs[i] = vk->pools.elem[i]->qf; + + VkExternalMemoryBufferCreateInfoKHR ext_buf_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR, + .handleTypes = vk_handle_type, + }; + + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = &ext_buf_info, + .size = shmem->size, + .usage = params->buf_usage, + .sharingMode = vk->pools.num > 1 ? 
VK_SHARING_MODE_CONCURRENT + : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = vk->pools.num, + .pQueueFamilyIndices = qfs, + }; + + VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &buffer)); + PL_VK_NAME(BUFFER, buffer, "imported"); + + vk->GetBufferMemoryRequirements(vk->dev, buffer, &reqs); + } + + if (reqs.size > shmem->size) { + PL_ERR(vk, "Imported object requires %zu bytes, larger than the " + "provided size %zu!", + (size_t) reqs.size, shmem->size); + goto error; + } + + if (shmem->offset % reqs.alignment || shmem->offset % params->reqs.alignment) { + PL_ERR(vk, "Imported object offset %zu conflicts with alignment %zu!", + shmem->offset, pl_lcm(reqs.alignment, params->reqs.alignment)); + goto error; + } + + switch (params->import_handle) { +#ifdef PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: { + if (!vk->GetMemoryFdPropertiesKHR) { + PL_ERR(vk, "Importing PL_HANDLE_DMA_BUF requires %s.", + VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME); + goto error; + } + + VkMemoryFdPropertiesKHR fdprops = { + .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR, + }; + + VK(vk->GetMemoryFdPropertiesKHR(vk->dev, + vk_handle_type, + shmem->handle.fd, + &fdprops)); + + // We dup() the fd to make it safe to import the same original fd + // multiple times. + fdinfo.fd = dup(shmem->handle.fd); + if (fdinfo.fd == -1) { + PL_ERR(vk, "Failed to dup() fd (%d) when importing memory: %s", + fdinfo.fd, strerror(errno)); + goto error; + } + + reqs.memoryTypeBits &= fdprops.memoryTypeBits; + vk_link_struct(&ainfo, &fdinfo); + break; + } +#else // !PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: + PL_ERR(vk, "PL_HANDLE_DMA_BUF requires building with UNIX support!"); + goto error; +#endif + + case PL_HANDLE_HOST_PTR: { + VkMemoryHostPointerPropertiesEXT ptrprops = { + .sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT, + }; + + VK(vk->GetMemoryHostPointerPropertiesEXT(vk->dev, vk_handle_type, + shmem->handle.ptr, + &ptrprops)); + + ptrinfo.pHostPointer = (void *) shmem->handle.ptr; + reqs.memoryTypeBits &= ptrprops.memoryTypeBits; + vk_link_struct(&ainfo, &ptrinfo); + break; + } + + case PL_HANDLE_FD: + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + PL_ERR(vk, "vk_malloc_import: unsupported handle type %d", + params->import_handle); + goto error; + } + + if (!find_best_memtype(ma, reqs.memoryTypeBits, params, &ainfo.memoryTypeIndex)) { + PL_ERR(vk, "No compatible memory types offered for imported memory!"); + goto error; + } + + VkDeviceMemory vkmem = VK_NULL_HANDLE; + VK(vk->AllocateMemory(vk->dev, &ainfo, PL_VK_ALLOC, &vkmem)); + + slab = pl_alloc_ptr(NULL, slab); + *slab = (struct vk_slab) { + .mem = vkmem, + .dedicated = true, + .imported = true, + .buffer = buffer, + .size = shmem->size, + .handle_type = params->import_handle, + }; + pl_mutex_init(&slab->lock); + + *out = (struct vk_memslice) { + .vkmem = vkmem, + .buf = buffer, + .size = shmem->size - shmem->offset, + .offset = shmem->offset, + .shared_mem = *shmem, + .priv = slab, + }; + + switch (params->import_handle) { + case PL_HANDLE_DMA_BUF: + case PL_HANDLE_FD: + PL_TRACE(vk, "Imported %s bytes from fd: %d%s", + PRINT_SIZE(slab->size), shmem->handle.fd, + params->ded_image ? " (dedicated)" : ""); + // fd ownership is transferred at this point. + slab->handle.fd = fdinfo.fd; + fdinfo.fd = -1; + break; + case PL_HANDLE_HOST_PTR: + PL_TRACE(vk, "Imported %s bytes from ptr: %p%s", + PRINT_SIZE(slab->size), shmem->handle.ptr, + params->ded_image ? 
" (dedicated" : ""); + slab->handle.ptr = ptrinfo.pHostPointer; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + break; + } + + VkMemoryPropertyFlags flags = ma->props.memoryTypes[ainfo.memoryTypeIndex].propertyFlags; + if (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + slab->coherent = flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + out->data = (uint8_t *) slab->data + out->offset; + out->coherent = slab->coherent; + if (!slab->coherent) { + // Use entire buffer range, since this is a dedicated memory + // allocation. This avoids issues with noncoherent atomicity + out->map_offset = 0; + out->map_size = VK_WHOLE_SIZE; + + // Mapping does not implicitly invalidate mapped memory + VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = slab->mem, + .offset = out->map_offset, + .size = out->map_size, + })); + } + } + + if (buffer) + VK(vk->BindBufferMemory(vk->dev, buffer, vkmem, 0)); + + return true; + +error: + if (params->debug_tag) + PL_ERR(vk, " for malloc: %s", params->debug_tag); + vk->DestroyBuffer(vk->dev, buffer, PL_VK_ALLOC); +#ifdef PL_HAVE_UNIX + if (fdinfo.fd > -1) + close(fdinfo.fd); +#endif + pl_free(slab); + *out = (struct vk_memslice) {0}; + return false; +} + +size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags) +{ + size_t avail = 0; + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + const VkMemoryType *mtype = &ma->props.memoryTypes[i]; + if ((mtype->propertyFlags & flags) != flags) + continue; + avail = PL_MAX(avail, ma->props.memoryHeaps[mtype->heapIndex].size); + } + + return avail; +} + +bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + pl_assert(!params->import_handle || !params->export_handle); + if (params->import_handle) + return vk_malloc_import(ma, out, params); + + pl_assert(params->reqs.size); + size_t size = params->reqs.size; + size_t align = params->reqs.alignment; + align = pl_lcm(align, vk->props.limits.bufferImageGranularity); + align = pl_lcm(align, vk->props.limits.nonCoherentAtomSize); + + struct vk_slab *slab; + VkDeviceSize offset; + + if (params->ded_image || size > ma->maximum_page_size) { + slab = slab_alloc(ma, params); + if (!slab) + return false; + slab->dedicated = true; + offset = 0; + } else { + pl_mutex_lock(&ma->lock); + struct vk_pool *pool = find_pool(ma, params); + slab = pool_get_page(ma, pool, size, align, &offset); + pl_mutex_unlock(&ma->lock); + if (!slab) { + PL_ERR(ma->vk, "No slab to serve request for %s bytes (with " + "alignment 0x%zx) in pool %d!", + PRINT_SIZE(size), align, pool->index); + return false; + } + + // For accounting, just treat the alignment as part of the used size. + // Doing it this way makes sure that the sizes reported to vk_memslice + // consumers are always aligned properly. + size = PL_ALIGN(size, align); + slab->used += size; + slab->age = ma->age; + if (params->debug_tag) + slab->debug_tag = params->debug_tag; + pl_mutex_unlock(&slab->lock); + } + + pl_assert(offset % align == 0); + *out = (struct vk_memslice) { + .vkmem = slab->mem, + .offset = offset, + .size = size, + .buf = slab->buffer, + .data = slab->data ? (uint8_t *) slab->data + offset : 0x0, + .coherent = slab->coherent, + .map_offset = slab->data ? offset : 0, + .map_size = slab->data ? 
size : 0, + .priv = slab, + .shared_mem = { + .handle = slab->handle, + .offset = offset, + .size = slab->size, + }, + }; + return true; +} diff --git a/src/vulkan/malloc.h b/src/vulkan/malloc.h new file mode 100644 index 0000000..115352e --- /dev/null +++ b/src/vulkan/malloc.h @@ -0,0 +1,72 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +// All memory allocated from a vk_malloc MUST be explicitly released by +// the caller before vk_malloc_destroy is called. +struct vk_malloc *vk_malloc_create(struct vk_ctx *vk); +void vk_malloc_destroy(struct vk_malloc **ma); + +// Get the supported handle types for this malloc instance +pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import); + +// Represents a single "slice" of generic (non-buffer) memory, plus some +// metadata for accounting. This struct is essentially read-only. +struct vk_memslice { + VkDeviceMemory vkmem; + VkDeviceSize offset; + VkDeviceSize size; + void *priv; + // depending on the type/flags: + struct pl_shared_mem shared_mem; + VkBuffer buf; // associated buffer (when `buf_usage` is nonzero) + void *data; // pointer to slice (for persistently mapped slices) + bool coherent; // whether `data` is coherent + VkDeviceSize map_offset; // can be larger than offset/size + VkDeviceSize map_size; +}; + +struct vk_malloc_params { + VkMemoryRequirements reqs; + VkMemoryPropertyFlags required; + VkMemoryPropertyFlags optimal; + VkBufferUsageFlags buf_usage; + VkImage ded_image; // for dedicated image allocations + enum pl_handle_type export_handle; + enum pl_handle_type import_handle; + struct pl_shared_mem shared_mem; // for `import_handle` + pl_debug_tag debug_tag; +}; + +// Returns the amount of available memory matching a given set of property +// flags. Always returns the highest single allocation, not the combined total. +size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags); + +bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out, + const struct vk_malloc_params *params); + +void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice); + +// Clean up unused slabs. Call this roughly once per frame to reduce +// memory pressure / memory leaks. +void vk_malloc_garbage_collect(struct vk_malloc *ma); + +// For debugging purposes. Doesn't include dedicated slab allocations! 
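+// Statistics are reported through the parent vk_ctx's log at the given level.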
+void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level); diff --git a/src/vulkan/meson.build b/src/vulkan/meson.build new file mode 100644 index 0000000..64c5572 --- /dev/null +++ b/src/vulkan/meson.build @@ -0,0 +1,59 @@ +vulkan_build = get_option('vulkan') +vulkan_link = get_option('vk-proc-addr') +vulkan_loader = dependency('vulkan', required: false) +vulkan_headers = vulkan_loader.partial_dependency(includes: true, compile_args: true) +registry_xml = get_option('vulkan-registry') + +# Prefer our Vulkan headers for portability +vulkan_headers_dir = thirdparty/'Vulkan-Headers' +vulkan_headers_inc = include_directories() +if fs.is_dir(vulkan_headers_dir/'include') + vulkan_headers = declare_dependency() + vulkan_headers_inc = include_directories('../../3rdparty/Vulkan-Headers/include') + # Force the use of this vk.xml because it has to be in sync with the headers + registry_xml = vulkan_headers_dir/'registry/vk.xml' +endif + +vulkan_build = vulkan_build.require( + cc.has_header_symbol('vulkan/vulkan_core.h', 'VK_VERSION_1_3', + include_directories: vulkan_headers_inc, + dependencies: vulkan_headers), + error_message: 'vulkan.h was not found on the system, nor inside ' + + '`3rdparty/Vulkan-Headers`. Please run `git submodule update --init` ' + + 'followed by `meson --wipe`.') +components.set('vulkan', vulkan_build.allowed()) + +vulkan_link = vulkan_link.require(vulkan_loader.found() and vulkan_build.allowed()) +components.set('vk-proc-addr', vulkan_link.allowed()) + +build_deps += vulkan_headers + +if vulkan_build.allowed() + sources += [ + 'vulkan/command.c', + 'vulkan/context.c', + 'vulkan/formats.c', + 'vulkan/gpu.c', + 'vulkan/gpu_buf.c', + 'vulkan/gpu_tex.c', + 'vulkan/gpu_pass.c', + 'vulkan/malloc.c', + 'vulkan/swapchain.c', + 'vulkan/utils.c', + ] + + datadir = get_option('prefix') / get_option('datadir') + sources += custom_target('utils_gen.c', + input: 'utils_gen.py', + output: 'utils_gen.c', + command: [python, '@INPUT@', datadir, registry_xml, '@OUTPUT@'], + env: python_env, + ) + + if vulkan_link.allowed() + build_deps += vulkan_loader + tests += 'vulkan.c' + endif +else + sources += 'vulkan/stubs.c' +endif diff --git a/src/vulkan/stubs.c b/src/vulkan/stubs.c new file mode 100644 index 0000000..0c0738e --- /dev/null +++ b/src/vulkan/stubs.c @@ -0,0 +1,108 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "../common.h" +#include "log.h" + +#include <libplacebo/vulkan.h> + +const struct pl_vk_inst_params pl_vk_inst_default_params = {0}; +const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS }; + +pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params) +{ + pl_fatal(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +void pl_vk_inst_destroy(pl_vk_inst *pinst) +{ + pl_vk_inst inst = *pinst; + pl_assert(!inst); +} + +pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params) +{ + pl_fatal(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +void pl_vulkan_destroy(pl_vulkan *pvk) +{ + pl_vulkan vk = *pvk; + pl_assert(!vk); +} + +pl_vulkan pl_vulkan_get(pl_gpu gpu) +{ + return NULL; +} + +VkPhysicalDevice pl_vulkan_choose_device(pl_log log, + const struct pl_vulkan_device_params *params) +{ + pl_err(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +pl_swapchain pl_vulkan_create_swapchain(pl_vulkan vk, + const struct pl_vulkan_swapchain_params *params) +{ + pl_unreachable(); +} + +bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw) +{ + pl_unreachable(); +} + +pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params) +{ + pl_fatal(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params) +{ + pl_unreachable(); +} + +VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex, + VkFormat *out_format, VkImageUsageFlags *out_flags) +{ + pl_unreachable(); +} + +bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params) +{ + pl_unreachable(); +} + +void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params) +{ + pl_unreachable(); +} + +VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params) +{ + pl_unreachable(); +} + +void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore) +{ + pl_unreachable(); +} diff --git a/src/vulkan/swapchain.c b/src/vulkan/swapchain.c new file mode 100644 index 0000000..0741fbf --- /dev/null +++ b/src/vulkan/swapchain.c @@ -0,0 +1,911 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" +#include "command.h" +#include "formats.h" +#include "utils.h" +#include "gpu.h" +#include "swapchain.h" +#include "pl_thread.h" + +struct sem_pair { + VkSemaphore in; + VkSemaphore out; +}; + +struct priv { + struct pl_sw_fns impl; + + pl_mutex lock; + struct vk_ctx *vk; + VkSurfaceKHR surf; + PL_ARRAY(VkSurfaceFormatKHR) formats; + + // current swapchain and metadata: + struct pl_vulkan_swapchain_params params; + VkSwapchainCreateInfoKHR protoInfo; // partially filled-in prototype + VkSwapchainKHR swapchain; + int cur_width, cur_height; + int swapchain_depth; + pl_rc_t frames_in_flight; // number of frames currently queued + bool suboptimal; // true once VK_SUBOPTIMAL_KHR is returned + bool needs_recreate; // swapchain needs to be recreated + struct pl_color_repr color_repr; + struct pl_color_space color_space; + struct pl_hdr_metadata hdr_metadata; + + // state of the images: + PL_ARRAY(pl_tex) images; // pl_tex wrappers for the VkImages + PL_ARRAY(struct sem_pair) sems; // pool of semaphores used to synchronize images + int idx_sems; // index of next free semaphore pair + int last_imgidx; // the image index last acquired (for submit) +}; + +static const struct pl_sw_fns vulkan_swapchain; + +static bool map_color_space(VkColorSpaceKHR space, struct pl_color_space *out) +{ + switch (space) { + // Note: This is technically against the spec, but more often than not + // it's the correct result since `SRGB_NONLINEAR` is just a catch-all + // for any sort of typical SDR curve, which is better approximated by + // `pl_color_space_monitor`. + case VK_COLOR_SPACE_SRGB_NONLINEAR_KHR: + *out = pl_color_space_monitor; + return true; + + case VK_COLOR_SPACE_BT709_NONLINEAR_EXT: + *out = pl_color_space_monitor; + return true; + case VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DISPLAY_P3, + .transfer = PL_COLOR_TRC_BT_1886, + }; + return true; + case VK_COLOR_SPACE_DCI_P3_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DCI_P3, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DCI_P3, + .transfer = PL_COLOR_TRC_BT_1886, + }; + return true; + case VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT: + case VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT: + // TODO + return false; + case VK_COLOR_SPACE_BT709_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DCI_P3, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_BT2020_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_HDR10_ST2084_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_PQ, + }; + return true; + case VK_COLOR_SPACE_DOLBYVISION_EXT: + // Unlikely to ever be implemented + return false; + case VK_COLOR_SPACE_HDR10_HLG_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, + }; + return true; + case VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_ADOBE, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_ADOBE, + .transfer = PL_COLOR_TRC_GAMMA22, + }; + return true; + case 
VK_COLOR_SPACE_PASS_THROUGH_EXT: + *out = pl_color_space_unknown; + return true; + +#ifdef VK_AMD_display_native_hdr + case VK_COLOR_SPACE_DISPLAY_NATIVE_AMD: + // TODO + return false; +#endif + + default: return false; + } +} + +static bool pick_surf_format(pl_swapchain sw, const struct pl_color_space *hint) +{ + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + pl_gpu gpu = sw->gpu; + + int best_score = 0, best_id; + bool wide_gamut = pl_color_primaries_is_wide_gamut(hint->primaries); + bool prefer_hdr = pl_color_transfer_is_hdr(hint->transfer); + + for (int i = 0; i < p->formats.num; i++) { + // Color space / format whitelist + struct pl_color_space space; + if (!map_color_space(p->formats.elem[i].colorSpace, &space)) + continue; + + bool disable10 = !pl_color_transfer_is_hdr(space.transfer) && + p->params.disable_10bit_sdr; + + switch (p->formats.elem[i].format) { + // Only accept floating point formats for linear curves + case VK_FORMAT_R16G16B16_SFLOAT: + case VK_FORMAT_R16G16B16A16_SFLOAT: + case VK_FORMAT_R32G32B32_SFLOAT: + case VK_FORMAT_R32G32B32A32_SFLOAT: + case VK_FORMAT_R64G64B64_SFLOAT: + case VK_FORMAT_R64G64B64A64_SFLOAT: + if (space.transfer == PL_COLOR_TRC_LINEAR) + break; // accept + continue; + + // Only accept 8 bit for non-HDR curves + case VK_FORMAT_R8G8B8_UNORM: + case VK_FORMAT_B8G8R8_UNORM: + case VK_FORMAT_R8G8B8A8_UNORM: + case VK_FORMAT_B8G8R8A8_UNORM: + case VK_FORMAT_A8B8G8R8_UNORM_PACK32: + if (!pl_color_transfer_is_hdr(space.transfer)) + break; // accept + continue; + + // Only accept 10 bit formats for non-linear curves + case VK_FORMAT_A2R10G10B10_UNORM_PACK32: + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + if (space.transfer != PL_COLOR_TRC_LINEAR && !disable10) + break; // accept + continue; + + // Accept 16-bit formats for everything + case VK_FORMAT_R16G16B16_UNORM: + case VK_FORMAT_R16G16B16A16_UNORM: + if (!disable10) + break; // accept + continue; + + default: continue; + } + + // Make sure we can wrap this format to a meaningful, valid pl_fmt + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt plfmt = gpu->formats[n]; + const struct vk_format **pvkfmt = PL_PRIV(plfmt); + if ((*pvkfmt)->tfmt != p->formats.elem[i].format) + continue; + + enum pl_fmt_caps render_caps = 0; + render_caps |= PL_FMT_CAP_RENDERABLE; + render_caps |= PL_FMT_CAP_BLITTABLE; + if ((plfmt->caps & render_caps) != render_caps) + continue; + + // format valid, use it if it has a higher score + int score = 0; + for (int c = 0; c < 3; c++) + score += plfmt->component_depth[c]; + if (pl_color_primaries_is_wide_gamut(space.primaries) == wide_gamut) + score += 1000; + if (space.primaries == hint->primaries) + score += 2000; + if (pl_color_transfer_is_hdr(space.transfer) == prefer_hdr) + score += 10000; + if (space.transfer == hint->transfer) + score += 20000; + + switch (plfmt->type) { + case PL_FMT_UNKNOWN: break; + case PL_FMT_UINT: break; + case PL_FMT_SINT: break; + case PL_FMT_UNORM: score += 500; break; + case PL_FMT_SNORM: score += 400; break; + case PL_FMT_FLOAT: score += 300; break; + case PL_FMT_TYPE_COUNT: pl_unreachable(); + }; + + if (score > best_score) { + best_score = score; + best_id = i; + break; + } + } + } + + if (!best_score) { + PL_ERR(vk, "Failed picking any valid, renderable surface format!"); + return false; + } + + VkSurfaceFormatKHR new_sfmt = p->formats.elem[best_id]; + if (p->protoInfo.imageFormat != new_sfmt.format || + p->protoInfo.imageColorSpace != new_sfmt.colorSpace) + { + PL_INFO(vk, "Picked surface configuration %d: %s + %s", 
best_id, + vk_fmt_name(new_sfmt.format), + vk_csp_name(new_sfmt.colorSpace)); + + p->protoInfo.imageFormat = new_sfmt.format; + p->protoInfo.imageColorSpace = new_sfmt.colorSpace; + p->needs_recreate = true; + } + + return true; +} + +static void set_hdr_metadata(struct priv *p, const struct pl_hdr_metadata *metadata) +{ + struct vk_ctx *vk = p->vk; + if (!vk->SetHdrMetadataEXT) + return; + + // Whitelist only values that we support signalling metadata for + struct pl_hdr_metadata fix = { + .prim = metadata->prim, + .min_luma = metadata->min_luma, + .max_luma = metadata->max_luma, + .max_cll = metadata->max_cll, + .max_fall = metadata->max_fall, + }; + + // Ignore no-op changes + if (pl_hdr_metadata_equal(&fix, &p->hdr_metadata)) + return; + + // Remember the metadata so we can re-apply it after swapchain recreation + p->hdr_metadata = fix; + + // Ignore HDR metadata requests for SDR swapchains + if (!pl_color_transfer_is_hdr(p->color_space.transfer)) + return; + + if (!p->swapchain) + return; + + vk->SetHdrMetadataEXT(vk->dev, 1, &p->swapchain, &(VkHdrMetadataEXT) { + .sType = VK_STRUCTURE_TYPE_HDR_METADATA_EXT, + .displayPrimaryRed = { fix.prim.red.x, fix.prim.red.y }, + .displayPrimaryGreen = { fix.prim.green.x, fix.prim.green.y }, + .displayPrimaryBlue = { fix.prim.blue.x, fix.prim.blue.y }, + .whitePoint = { fix.prim.white.x, fix.prim.white.y }, + .maxLuminance = fix.max_luma, + .minLuminance = fix.min_luma, + .maxContentLightLevel = fix.max_cll, + .maxFrameAverageLightLevel = fix.max_fall, + }); + + // Keep track of applied HDR colorimetry metadata + p->color_space.hdr = p->hdr_metadata; +} + +pl_swapchain pl_vulkan_create_swapchain(pl_vulkan plvk, + const struct pl_vulkan_swapchain_params *params) +{ + struct vk_ctx *vk = PL_PRIV(plvk); + pl_gpu gpu = plvk->gpu; + + if (!vk->CreateSwapchainKHR) { + PL_ERR(gpu, VK_KHR_SWAPCHAIN_EXTENSION_NAME " not enabled!"); + return NULL; + } + + struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv); + sw->log = vk->log; + sw->gpu = gpu; + + struct priv *p = PL_PRIV(sw); + pl_mutex_init(&p->lock); + p->impl = vulkan_swapchain; + p->params = *params; + p->vk = vk; + p->surf = params->surface; + p->swapchain_depth = PL_DEF(params->swapchain_depth, 3); + pl_assert(p->swapchain_depth > 0); + atomic_init(&p->frames_in_flight, 0); + p->last_imgidx = -1; + p->protoInfo = (VkSwapchainCreateInfoKHR) { + .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR, + .surface = p->surf, + .imageArrayLayers = 1, // non-stereoscopic + .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, + .minImageCount = p->swapchain_depth + 1, // +1 for the FB + .presentMode = params->present_mode, + .clipped = true, + }; + + // These fields will be updated by `vk_sw_recreate` + p->color_space = pl_color_space_unknown; + p->color_repr = (struct pl_color_repr) { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, + .alpha = PL_ALPHA_UNKNOWN, + }; + + // Make sure the swapchain present mode is supported + VkPresentModeKHR *modes = NULL; + uint32_t num_modes = 0; + VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, NULL)); + modes = pl_calloc_ptr(NULL, num_modes, modes); + VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, modes)); + + bool supported = false; + for (int i = 0; i < num_modes; i++) + supported |= (modes[i] == p->protoInfo.presentMode); + pl_free_ptr(&modes); + + if (!supported) { + PL_WARN(vk, "Requested swap mode unsupported by this device, falling " + "back to VK_PRESENT_MODE_FIFO_KHR"); + 
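// NOTE: FIFO is the only presentation mode the Vulkan spec requires every surface to support, so this fallback is always available. + 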
p->protoInfo.presentMode = VK_PRESENT_MODE_FIFO_KHR; + } + + // Enumerate the supported surface color spaces + uint32_t num_formats = 0; + VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, NULL)); + PL_ARRAY_RESIZE(sw, p->formats, num_formats); + VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, p->formats.elem)); + p->formats.num = num_formats; + + PL_INFO(gpu, "Available surface configurations:"); + for (int i = 0; i < p->formats.num; i++) { + PL_INFO(gpu, " %d: %-40s %s", i, + vk_fmt_name(p->formats.elem[i].format), + vk_csp_name(p->formats.elem[i].colorSpace)); + } + + // Ensure there exists at least some valid renderable surface format + struct pl_color_space hint = {0}; + if (!pick_surf_format(sw, &hint)) + goto error; + + return sw; + +error: + pl_free(modes); + pl_free(sw); + return NULL; +} + +static void vk_sw_destroy(pl_swapchain sw) +{ + pl_gpu gpu = sw->gpu; + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + + pl_gpu_flush(gpu); + vk_wait_idle(vk); + + // Vulkan offers no way to know when a queue presentation command is done, + // leading to spec-mandated undefined behavior when destroying resources + // tied to the swapchain. Use an extra `vkQueueWaitIdle` on all of the + // queues we may have oustanding presentation calls on, to hopefully inform + // the driver that we want to wait until the device is truly idle. + for (int i = 0; i < vk->pool_graphics->num_queues; i++) + vk->QueueWaitIdle(vk->pool_graphics->queues[i]); + + for (int i = 0; i < p->images.num; i++) + pl_tex_destroy(gpu, &p->images.elem[i]); + for (int i = 0; i < p->sems.num; i++) { + vk->DestroySemaphore(vk->dev, p->sems.elem[i].in, PL_VK_ALLOC); + vk->DestroySemaphore(vk->dev, p->sems.elem[i].out, PL_VK_ALLOC); + } + + vk->DestroySwapchainKHR(vk->dev, p->swapchain, PL_VK_ALLOC); + pl_mutex_destroy(&p->lock); + pl_free((void *) sw); +} + +static int vk_sw_latency(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + return p->swapchain_depth; +} + +static bool update_swapchain_info(struct priv *p, VkSwapchainCreateInfoKHR *info, + int w, int h) +{ + struct vk_ctx *vk = p->vk; + + // Query the supported capabilities and update this struct as needed + VkSurfaceCapabilitiesKHR caps = {0}; + VK(vk->GetPhysicalDeviceSurfaceCapabilitiesKHR(vk->physd, p->surf, &caps)); + + // Check for hidden/invisible window + if (!caps.currentExtent.width || !caps.currentExtent.height) { + PL_DEBUG(vk, "maxImageExtent reported as 0x0, hidden window? 
skipping"); + return false; + } + + // Sorted by preference + static const struct { VkCompositeAlphaFlagsKHR vk_mode; + enum pl_alpha_mode pl_mode; + } alphaModes[] = { + {VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR, PL_ALPHA_INDEPENDENT}, + {VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR, PL_ALPHA_PREMULTIPLIED}, + {VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, PL_ALPHA_UNKNOWN}, + {VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR, PL_ALPHA_UNKNOWN}, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(alphaModes); i++) { + if (caps.supportedCompositeAlpha & alphaModes[i].vk_mode) { + info->compositeAlpha = alphaModes[i].vk_mode; + p->color_repr.alpha = alphaModes[i].pl_mode; + PL_DEBUG(vk, "Requested alpha compositing mode: %s", + vk_alpha_mode(info->compositeAlpha)); + break; + } + } + + if (!info->compositeAlpha) { + PL_ERR(vk, "Failed picking alpha compositing mode (caps: 0x%x)", + caps.supportedCompositeAlpha); + goto error; + } + + // Note: We could probably also allow picking a surface transform that + // flips the framebuffer and set `pl_swapchain_frame.flipped`, but this + // doesn't appear to be necessary for any vulkan implementations. + static const VkSurfaceTransformFlagsKHR rotModes[] = { + VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR, + VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(rotModes); i++) { + if (caps.supportedTransforms & rotModes[i]) { + info->preTransform = rotModes[i]; + PL_DEBUG(vk, "Requested surface transform: %s", + vk_surface_transform(info->preTransform)); + break; + } + } + + if (!info->preTransform) { + PL_ERR(vk, "Failed picking surface transform mode (caps: 0x%x)", + caps.supportedTransforms); + goto error; + } + + // Image count as required + PL_DEBUG(vk, "Requested image count: %d (min %d max %d)", + (int) info->minImageCount, (int) caps.minImageCount, + (int) caps.maxImageCount); + + info->minImageCount = PL_MAX(info->minImageCount, caps.minImageCount); + if (caps.maxImageCount) + info->minImageCount = PL_MIN(info->minImageCount, caps.maxImageCount); + + PL_DEBUG(vk, "Requested image size: %dx%d (min %dx%d < cur %dx%d < max %dx%d)", + w, h, caps.minImageExtent.width, caps.minImageExtent.height, + caps.currentExtent.width, caps.currentExtent.height, + caps.maxImageExtent.width, caps.maxImageExtent.height); + + // Default the requested size based on the reported extent + if (caps.currentExtent.width != 0xFFFFFFFF) + w = PL_DEF(w, caps.currentExtent.width); + if (caps.currentExtent.height != 0xFFFFFFFF) + h = PL_DEF(h, caps.currentExtent.height); + + // Otherwise, re-use the existing size if available + w = PL_DEF(w, info->imageExtent.width); + h = PL_DEF(h, info->imageExtent.height); + + if (!w || !h) { + PL_ERR(vk, "Failed resizing swapchain: unknown size?"); + goto error; + } + + // Clamp the extent based on the supported limits + w = PL_CLAMP(w, caps.minImageExtent.width, caps.maxImageExtent.width); + h = PL_CLAMP(h, caps.minImageExtent.height, caps.maxImageExtent.height); + info->imageExtent = (VkExtent2D) { w, h }; + + // We just request whatever makes sense, and let the pl_vk decide what + // pl_tex_params that translates to. 
That said, we still need to intersect + // the swapchain usage flags with the format usage flags + VkImageUsageFlags req_flags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT; + VkImageUsageFlags opt_flags = VK_IMAGE_USAGE_STORAGE_BIT; + + info->imageUsage = caps.supportedUsageFlags & (req_flags | opt_flags); + VkFormatProperties fmtprop = {0}; + vk->GetPhysicalDeviceFormatProperties(vk->physd, info->imageFormat, &fmtprop); + +#define CHECK(usage, feature) \ + if (!((fmtprop.optimalTilingFeatures & VK_FORMAT_FEATURE_##feature##_BIT))) \ + info->imageUsage &= ~VK_IMAGE_USAGE_##usage##_BIT + + CHECK(COLOR_ATTACHMENT, COLOR_ATTACHMENT); + CHECK(TRANSFER_DST, TRANSFER_DST); + CHECK(STORAGE, STORAGE_IMAGE); + + if ((info->imageUsage & req_flags) != req_flags) { + PL_ERR(vk, "The swapchain doesn't support rendering and blitting!"); + goto error; + } + + return true; + +error: + return false; +} + +static void destroy_swapchain(struct vk_ctx *vk, void *swapchain) +{ + vk->DestroySwapchainKHR(vk->dev, vk_unwrap_handle(swapchain), PL_VK_ALLOC); +} + +static bool vk_sw_recreate(pl_swapchain sw, int w, int h) +{ + pl_gpu gpu = sw->gpu; + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + + VkImage *vkimages = NULL; + uint32_t num_images = 0; + + if (!update_swapchain_info(p, &p->protoInfo, w, h)) + return false; + + VkSwapchainCreateInfoKHR sinfo = p->protoInfo; +#ifdef VK_EXT_full_screen_exclusive + // Explicitly disallow full screen exclusive mode if possible + static const VkSurfaceFullScreenExclusiveInfoEXT fsinfo = { + .sType = VK_STRUCTURE_TYPE_SURFACE_FULL_SCREEN_EXCLUSIVE_INFO_EXT, + .fullScreenExclusive = VK_FULL_SCREEN_EXCLUSIVE_DISALLOWED_EXT, + }; + if (vk->AcquireFullScreenExclusiveModeEXT) + vk_link_struct(&sinfo, &fsinfo); +#endif + + p->suboptimal = false; + p->needs_recreate = false; + p->cur_width = sinfo.imageExtent.width; + p->cur_height = sinfo.imageExtent.height; + + PL_DEBUG(sw, "(Re)creating swapchain of size %dx%d", + sinfo.imageExtent.width, + sinfo.imageExtent.height); + +#ifdef PL_HAVE_UNIX + if (vk->props.vendorID == VK_VENDOR_ID_NVIDIA) { + vk->DeviceWaitIdle(vk->dev); + vk_wait_idle(vk); + } +#endif + + // Calling `vkCreateSwapchainKHR` puts sinfo.oldSwapchain into a retired + // state whether the call succeeds or not, so we always need to garbage + // collect it afterwards - asynchronously as it may still be in use + sinfo.oldSwapchain = p->swapchain; + p->swapchain = VK_NULL_HANDLE; + VkResult res = vk->CreateSwapchainKHR(vk->dev, &sinfo, PL_VK_ALLOC, &p->swapchain); + vk_dev_callback(vk, (vk_cb) destroy_swapchain, vk, vk_wrap_handle(sinfo.oldSwapchain)); + PL_VK_ASSERT(res, "vk->CreateSwapchainKHR(...)"); + + // Get the new swapchain images + VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, NULL)); + vkimages = pl_calloc_ptr(NULL, num_images, vkimages); + VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, vkimages)); + + for (int i = 0; i < num_images; i++) + PL_VK_NAME(IMAGE, vkimages[i], "swapchain"); + + // If needed, allocate some more semaphores + while (num_images > p->sems.num) { + VkSemaphore sem_in = VK_NULL_HANDLE, sem_out = VK_NULL_HANDLE; + static const VkSemaphoreCreateInfo seminfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_in)); + VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_out)); + PL_VK_NAME(SEMAPHORE, sem_in, "swapchain in"); + PL_VK_NAME(SEMAPHORE, sem_out, "swapchain out"); + + 
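// The semaphore pairs are pooled and reused round-robin (via idx_sems) as frames are acquired and presented. + 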
PL_ARRAY_APPEND(sw, p->sems, (struct sem_pair) { + .in = sem_in, + .out = sem_out, + }); + } + + // Recreate the pl_tex wrappers + for (int i = 0; i < p->images.num; i++) + pl_tex_destroy(gpu, &p->images.elem[i]); + p->images.num = 0; + + for (int i = 0; i < num_images; i++) { + const VkExtent2D *ext = &sinfo.imageExtent; + pl_tex tex = pl_vulkan_wrap(gpu, pl_vulkan_wrap_params( + .image = vkimages[i], + .width = ext->width, + .height = ext->height, + .format = sinfo.imageFormat, + .usage = sinfo.imageUsage, + )); + if (!tex) + goto error; + PL_ARRAY_APPEND(sw, p->images, tex); + } + + pl_assert(num_images > 0); + int bits = 0; + + // The channel with the most bits is probably the most authoritative about + // the actual color information (consider e.g. a2bgr10). Slight downside + // in that it results in rounding r/b for e.g. rgb565, but we don't pick + // surfaces with fewer than 8 bits anyway, so let's not care for now. + pl_fmt fmt = p->images.elem[0]->params.format; + for (int i = 0; i < fmt->num_components; i++) + bits = PL_MAX(bits, fmt->component_depth[i]); + + p->color_repr.bits.sample_depth = bits; + p->color_repr.bits.color_depth = bits; + + // Note: `p->color_space.hdr` is (re-)applied by `set_hdr_metadata` + map_color_space(sinfo.imageColorSpace, &p->color_space); + + // Forcibly re-apply HDR metadata, bypassing the no-op check + struct pl_hdr_metadata metadata = p->hdr_metadata; + p->hdr_metadata = pl_hdr_metadata_empty; + set_hdr_metadata(p, &metadata); + + pl_free(vkimages); + return true; + +error: + PL_ERR(vk, "Failed (re)creating swapchain!"); + pl_free(vkimages); + vk->DestroySwapchainKHR(vk->dev, p->swapchain, PL_VK_ALLOC); + p->swapchain = VK_NULL_HANDLE; + p->cur_width = p->cur_height = 0; + return false; +} + +static bool vk_sw_start_frame(pl_swapchain sw, + struct pl_swapchain_frame *out_frame) +{ + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + pl_mutex_lock(&p->lock); + + bool recreate = !p->swapchain || p->needs_recreate; + if (p->suboptimal && !p->params.allow_suboptimal) + recreate = true; + + if (recreate && !vk_sw_recreate(sw, 0, 0)) { + pl_mutex_unlock(&p->lock); + return false; + } + + VkSemaphore sem_in = p->sems.elem[p->idx_sems].in; + PL_TRACE(vk, "vkAcquireNextImageKHR signals 0x%"PRIx64, (uint64_t) sem_in); + + for (int attempts = 0; attempts < 2; attempts++) { + uint32_t imgidx = 0; + VkResult res = vk->AcquireNextImageKHR(vk->dev, p->swapchain, UINT64_MAX, + sem_in, VK_NULL_HANDLE, &imgidx); + + switch (res) { + case VK_SUBOPTIMAL_KHR: + p->suboptimal = true; + // fall through + case VK_SUCCESS: + p->last_imgidx = imgidx; + pl_vulkan_release_ex(sw->gpu, pl_vulkan_release_params( + .tex = p->images.elem[imgidx], + .layout = VK_IMAGE_LAYOUT_UNDEFINED, + .qf = VK_QUEUE_FAMILY_IGNORED, + .semaphore = { sem_in }, + )); + *out_frame = (struct pl_swapchain_frame) { + .fbo = p->images.elem[imgidx], + .flipped = false, + .color_repr = p->color_repr, + .color_space = p->color_space, + }; + // keep lock held + return true; + + case VK_ERROR_OUT_OF_DATE_KHR: { + // In these cases try recreating the swapchain + if (!vk_sw_recreate(sw, 0, 0)) { + pl_mutex_unlock(&p->lock); + return false; + } + continue; + } + + default: + PL_ERR(vk, "Failed acquiring swapchain image: %s", vk_res_str(res)); + pl_mutex_unlock(&p->lock); + return false; + } + } + + // If we've exhausted the number of attempts to recreate the swapchain, + // just give up silently and let the user retry some time later. 
+ pl_mutex_unlock(&p->lock); + return false; +} + +static void present_cb(struct priv *p, void *arg) +{ + (void) pl_rc_deref(&p->frames_in_flight); +} + +static bool vk_sw_submit_frame(pl_swapchain sw) +{ + pl_gpu gpu = sw->gpu; + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + pl_assert(p->last_imgidx >= 0); + pl_assert(p->swapchain); + uint32_t idx = p->last_imgidx; + VkSemaphore sem_out = p->sems.elem[p->idx_sems++].out; + p->idx_sems %= p->sems.num; + p->last_imgidx = -1; + + bool held = pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = p->images.elem[idx], + .layout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + .qf = VK_QUEUE_FAMILY_IGNORED, + .semaphore = { sem_out }, + )); + + if (!held) { + PL_ERR(gpu, "Failed holding swapchain image for presentation"); + pl_mutex_unlock(&p->lock); + return false; + } + + struct vk_cmd *cmd = pl_vk_steal_cmd(gpu); + if (!cmd) { + pl_mutex_unlock(&p->lock); + return false; + } + + pl_rc_ref(&p->frames_in_flight); + vk_cmd_callback(cmd, (vk_cb) present_cb, p, NULL); + if (!vk_cmd_submit(&cmd)) { + pl_mutex_unlock(&p->lock); + return false; + } + + struct vk_cmdpool *pool = vk->pool_graphics; + int qidx = pool->idx_queues; + VkQueue queue = pool->queues[qidx]; + + vk_rotate_queues(p->vk); + vk_malloc_garbage_collect(vk->ma); + + VkPresentInfoKHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &sem_out, + .swapchainCount = 1, + .pSwapchains = &p->swapchain, + .pImageIndices = &idx, + }; + + PL_TRACE(vk, "vkQueuePresentKHR waits on 0x%"PRIx64, (uint64_t) sem_out); + vk->lock_queue(vk->queue_ctx, pool->qf, qidx); + VkResult res = vk->QueuePresentKHR(queue, &pinfo); + vk->unlock_queue(vk->queue_ctx, pool->qf, qidx); + pl_mutex_unlock(&p->lock); + + switch (res) { + case VK_SUBOPTIMAL_KHR: + p->suboptimal = true; + // fall through + case VK_SUCCESS: + return true; + + case VK_ERROR_OUT_OF_DATE_KHR: + // We can silently ignore this error, since the next start_frame will + // recreate the swapchain automatically. 
+ return true; + + default: + PL_ERR(vk, "Failed presenting to queue %p: %s", (void *) queue, + vk_res_str(res)); + return false; + } +} + +static void vk_sw_swap_buffers(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + + pl_mutex_lock(&p->lock); + while (pl_rc_count(&p->frames_in_flight) >= p->swapchain_depth) { + pl_mutex_unlock(&p->lock); // don't hold mutex while blocking + vk_poll_commands(p->vk, UINT64_MAX); + pl_mutex_lock(&p->lock); + } + pl_mutex_unlock(&p->lock); +} + +static bool vk_sw_resize(pl_swapchain sw, int *width, int *height) +{ + struct priv *p = PL_PRIV(sw); + bool ok = true; + + pl_mutex_lock(&p->lock); + + bool width_changed = *width && *width != p->cur_width, + height_changed = *height && *height != p->cur_height; + + if (p->suboptimal || p->needs_recreate || width_changed || height_changed) + ok = vk_sw_recreate(sw, *width, *height); + + *width = p->cur_width; + *height = p->cur_height; + + pl_mutex_unlock(&p->lock); + return ok; +} + +static void vk_sw_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp) +{ + struct priv *p = PL_PRIV(sw); + pl_mutex_lock(&p->lock); + + // This should never fail if the swapchain already exists + bool ok = pick_surf_format(sw, csp); + set_hdr_metadata(p, &csp->hdr); + pl_assert(ok); + + pl_mutex_unlock(&p->lock); +} + +bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + return p->suboptimal; +} + +static const struct pl_sw_fns vulkan_swapchain = { + .destroy = vk_sw_destroy, + .latency = vk_sw_latency, + .resize = vk_sw_resize, + .colorspace_hint = vk_sw_colorspace_hint, + .start_frame = vk_sw_start_frame, + .submit_frame = vk_sw_submit_frame, + .swap_buffers = vk_sw_swap_buffers, +}; diff --git a/src/vulkan/utils.c b/src/vulkan/utils.c new file mode 100644 index 0000000..914f9e4 --- /dev/null +++ b/src/vulkan/utils.c @@ -0,0 +1,181 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "utils.h" + +VkExternalMemoryHandleTypeFlagBitsKHR +vk_mem_handle_type(enum pl_handle_type handle_type) +{ + if (!handle_type) + return 0; + + switch (handle_type) { + case PL_HANDLE_FD: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + case PL_HANDLE_WIN32: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR; + case PL_HANDLE_WIN32_KMT: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR; + case PL_HANDLE_DMA_BUF: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT; + case PL_HANDLE_HOST_PTR: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT; + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + return 0; + } + + pl_unreachable(); +} + +VkExternalSemaphoreHandleTypeFlagBitsKHR +vk_sync_handle_type(enum pl_handle_type handle_type) +{ + if (!handle_type) + return 0; + + switch (handle_type) { + case PL_HANDLE_FD: + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + case PL_HANDLE_WIN32: + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR; + case PL_HANDLE_WIN32_KMT: + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR; + case PL_HANDLE_DMA_BUF: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + return 0; + } + + pl_unreachable(); +} + +bool vk_external_mem_check(struct vk_ctx *vk, + const VkExternalMemoryPropertiesKHR *props, + enum pl_handle_type handle_type, + bool import) +{ + VkExternalMemoryFeatureFlagsKHR flags = props->externalMemoryFeatures; + VkExternalMemoryHandleTypeFlagBitsKHR vk_handle = vk_mem_handle_type(handle_type); + + if (import) { + if (!(flags & VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_KHR)) { + PL_DEBUG(vk, "Handle type %s (0x%x) is not importable", + vk_handle_name(vk_handle), (unsigned int) handle_type); + return false; + } + } else { + if (!(flags & VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_KHR)) { + PL_DEBUG(vk, "Handle type %s (0x%x) is not exportable", + vk_handle_name(vk_handle), (unsigned int) handle_type); + return false; + } + } + + return true; +} + +const enum pl_handle_type vk_mem_handle_list[] = { + PL_HANDLE_HOST_PTR, +#ifdef PL_HAVE_UNIX + PL_HANDLE_FD, + PL_HANDLE_DMA_BUF, +#endif +#ifdef PL_HAVE_WIN32 + PL_HANDLE_WIN32, + PL_HANDLE_WIN32_KMT, +#endif + 0 +}; + +const enum pl_handle_type vk_sync_handle_list[] = { +#ifdef PL_HAVE_UNIX + PL_HANDLE_FD, +#endif +#ifdef PL_HAVE_WIN32 + PL_HANDLE_WIN32, + PL_HANDLE_WIN32_KMT, +#endif + 0 +}; + +const void *vk_find_struct(const void *chain, VkStructureType stype) +{ + const VkBaseInStructure *in = chain; + while (in) { + if (in->sType == stype) + return in; + + in = in->pNext; + } + + return NULL; +} + +void vk_link_struct(void *chain, const void *in) +{ + if (!in) + return; + + VkBaseOutStructure *out = chain; + while (out->pNext) + out = out->pNext; + + out->pNext = (void *) in; +} + +void *vk_struct_memdup(void *alloc, const void *pin) +{ + if (!pin) + return NULL; + + const VkBaseInStructure *in = pin; + size_t size = vk_struct_size(in->sType); + pl_assert(size); + + VkBaseOutStructure *out = pl_memdup(alloc, in, size); + out->pNext = NULL; + return out; +} + +void *vk_chain_memdup(void *alloc, const void *pin) +{ + if (!pin) + return NULL; + + const VkBaseInStructure *in = pin; + VkBaseOutStructure *out = vk_struct_memdup(alloc, in); + pl_assert(out); + + out->pNext = vk_chain_memdup(alloc, in->pNext); + return out; +} + +void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype) +{ + for (VkBaseOutStructure *out = chain;; out = out->pNext) { + if 
(out->sType == stype) + return out; + if (!out->pNext) { + VkBaseOutStructure *s = pl_zalloc(alloc, vk_struct_size(stype)); + s->sType = stype; + out->pNext = s; + return s; + } + } +} diff --git a/src/vulkan/utils.h b/src/vulkan/utils.h new file mode 100644 index 0000000..cb1c5f5 --- /dev/null +++ b/src/vulkan/utils.h @@ -0,0 +1,136 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +// Return a human-readable name for various vulkan enums +const char *vk_res_str(VkResult res); +const char *vk_fmt_name(VkFormat fmt); +const char *vk_csp_name(VkColorSpaceKHR csp); +const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle); +const char *vk_obj_type(VkObjectType obj); +const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha); +const char *vk_surface_transform(VkSurfaceTransformFlagsKHR transform); + +// Return the size of an arbitrary vulkan struct. Returns 0 for unknown structs +size_t vk_struct_size(VkStructureType stype); + +// Returns the vulkan API version which a given extension was promoted to, or 0 +// if the extension is not promoted. +uint32_t vk_ext_promoted_ver(const char *extension); + +// Enum translation boilerplate +VkExternalMemoryHandleTypeFlagBitsKHR vk_mem_handle_type(enum pl_handle_type); +VkExternalSemaphoreHandleTypeFlagBitsKHR vk_sync_handle_type(enum pl_handle_type); + +// Bitmask of all access flags that imply a read/write operation, respectively +extern const VkAccessFlags2 vk_access_read; +extern const VkAccessFlags2 vk_access_write; + +// Check for compatibility of a VkExternalMemoryProperties +bool vk_external_mem_check(struct vk_ctx *vk, + const VkExternalMemoryPropertiesKHR *props, + enum pl_handle_type handle_type, + bool check_import); + +// Static lists of external handle types we should try probing for +extern const enum pl_handle_type vk_mem_handle_list[]; +extern const enum pl_handle_type vk_sync_handle_list[]; + +// Find a structure in a pNext chain, or NULL +const void *vk_find_struct(const void *chain, VkStructureType stype); + +// Link a structure into a pNext chain +void vk_link_struct(void *chain, const void *in); + +// Make a copy of a structure, not including the pNext chain +void *vk_struct_memdup(void *alloc, const void *in); + +// Make a deep copy of an entire pNext chain +void *vk_chain_memdup(void *alloc, const void *in); + +// Find a structure in a pNext chain, or allocate + link it if absent. +void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype); + +// Renormalize input features into a state consistent for a given API version. +// If `api_ver` is specified as 0, *both* meta-structs and extension structs +// will be emitted. Note: `out` should be initialized by the user. In +// particular, if it already contains a valid features chain, then this +// function will effectively act as a union. 
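+// For example, a feature requested through an extension-specific struct may be
+// re-emitted via the corresponding VkPhysicalDeviceVulkan1xFeatures meta-struct
+// (or the other way around), depending on the requested `api_ver`.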
+void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *in, + uint32_t api_ver, VkPhysicalDeviceFeatures2 *out); + +// Convenience macros to simplify a lot of common boilerplate +#define PL_VK_ASSERT(res, str) \ + do { \ + if (res != VK_SUCCESS) { \ + PL_ERR(vk, str ": %s (%s:%d)", \ + vk_res_str(res), __FILE__, __LINE__); \ + goto error; \ + } \ + } while (0) + +#define VK(cmd) \ + do { \ + PL_TRACE(vk, #cmd); \ + VkResult _res = (cmd); \ + PL_VK_ASSERT(_res, #cmd); \ + } while (0) + +#define PL_VK_NAME(type, obj, name) \ + do { \ + if (vk->SetDebugUtilsObjectNameEXT) { \ + vk->SetDebugUtilsObjectNameEXT(vk->dev, &(VkDebugUtilsObjectNameInfoEXT) { \ + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT, \ + .objectType = VK_OBJECT_TYPE_##type, \ + .objectHandle = (uint64_t) (obj), \ + .pObjectName = (name), \ + }); \ + } \ + } while (0) + +// Variant of PL_VK_NAME for dispatchable handles +#define PL_VK_NAME_HANDLE(type, obj, name) \ + PL_VK_NAME(type, (uintptr_t) (obj), name) + +// Helper functions to wrap and unwrap non-dispatchable handles into pointers. +// Note that wrap/unwrap must always be used linearly. +#if VK_USE_64_BIT_PTR_DEFINES == 1 +#define vk_wrap_handle(h) (h) +#define vk_unwrap_handle(h) (h) +#elif UINTPTR_MAX >= UINT64_MAX +#define vk_wrap_handle(h) ((void *) (uintptr_t) (h)) +#define vk_unwrap_handle(h) ((uint64_t) (uintptr_t) (h)) +#else +static inline void *vk_wrap_handle(uint64_t h) +{ + uint64_t *wrapper = malloc(sizeof(h)); + assert(wrapper); + *wrapper = h; + return wrapper; +} + +static inline uint64_t vk_unwrap_handle(void *h) +{ + uint64_t *wrapper = h; + uint64_t ret = *wrapper; + free(wrapper); + return ret; +} +#endif diff --git a/src/vulkan/utils_gen.c.j2 b/src/vulkan/utils_gen.c.j2 new file mode 100644 index 0000000..6db0454 --- /dev/null +++ b/src/vulkan/utils_gen.c.j2 @@ -0,0 +1,137 @@ +#define VK_ENABLE_BETA_EXTENSIONS +#include "vulkan/utils.h" + +const char *vk_res_str(VkResult res) +{ + switch (res) { +{% for res in vkresults %} + case {{ res }}: return "{{ res }}"; +{% endfor %} + + default: return "unknown error"; + } +} + +const char *vk_fmt_name(VkFormat fmt) +{ + switch (fmt) { +{% for fmt in vkformats %} + case {{ fmt }}: return "{{ fmt }}"; +{% endfor %} + + default: return "unknown format"; + } +} + +const char *vk_csp_name(VkColorSpaceKHR csp) +{ + switch (csp) { +{% for csp in vkspaces %} + case {{ csp }}: return "{{ csp }}"; +{% endfor %} + + default: return "unknown color space"; + } +} + +const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle) +{ + switch (handle) { +{% for handle in vkhandles %} + case {{ handle }}: return "{{ handle }}"; +{% endfor %} + + default: return "unknown handle type"; + } +} + +const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha) +{ + switch (alpha) { +{% for mode in vkalphas %} + case {{ mode }}: return "{{ mode }}"; +{% endfor %} + + default: return "unknown alpha mode"; + } +} + +const char *vk_surface_transform(VkSurfaceTransformFlagsKHR tf) +{ + switch (tf) { +{% for tf in vktransforms %} + case {{ tf }}: return "{{ tf }}"; +{% endfor %} + + default: return "unknown surface transform"; + } +} + + +const char *vk_obj_type(VkObjectType obj) +{ + switch (obj) { +{% for obj in vkobjects %} + case {{ obj.enum }}: return "{{ obj.name }}"; +{% endfor %} + + default: return "unknown object"; + } +} + +size_t vk_struct_size(VkStructureType stype) +{ + switch (stype) { +{% for struct in vkstructs %} + case {{ struct.stype }}: return sizeof({{ struct.name }}); +{% 
endfor %} + + default: return 0; + } +} + +uint32_t vk_ext_promoted_ver(const char *extension) +{ +{% for ext in vkexts %} +{% if ext.promoted_ver %} + if (!strcmp(extension, "{{ ext.name }}")) + return {{ ext.promoted_ver }}; +{% endif %} +{% endfor %} + return 0; +} + +void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *fin, + uint32_t api_ver, VkPhysicalDeviceFeatures2 *out) +{ + for (const VkBaseInStructure *in = (void *) fin; in; in = in->pNext) { + switch (in->sType) { + default: break; +{% for fs in vkfeatures %} + case {{ fs.stype }}: { + const {{ fs.name }} *i = (const void *) in; +{% for f in fs.features %} + if (i->{{ f.name }}) { +{% for r in f.replacements %} +{% if r.core_ver %} + if (!api_ver || api_ver >= {{ r.core_ver }}) +{% elif r.max_ver %} + if (!api_ver || api_ver < {{ r.max_ver }}) +{% endif %} +{% if fs.is_base %} + out->{{ f.name }} = true; +{% else %} + (({{ r.name }} *) vk_chain_alloc(alloc, out, {{ r.stype }}))->{{ f.name }} = true; +{% endif %} +{% endfor %} + } +{% endfor %} + break; + } +{% endfor %} + } + } +} + +const VkAccessFlags2 vk_access_read = {{ '0x%x' % vkaccess.read }}LLU; +const VkAccessFlags2 vk_access_write = {{ '0x%x' % vkaccess.write }}LLU; diff --git a/src/vulkan/utils_gen.py b/src/vulkan/utils_gen.py new file mode 100644 index 0000000..a8652fd --- /dev/null +++ b/src/vulkan/utils_gen.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +# +# This file is part of libplacebo. +# +# libplacebo is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# libplacebo is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + +import os.path +import re +import sys +import xml.etree.ElementTree as ET + +try: + import jinja2 +except ModuleNotFoundError: + print('Module \'jinja2\' not found, please install \'python3-Jinja2\' or ' + 'an equivalent package on your system! 
Alternatively, run ' + '`git submodule update --init` followed by `meson --wipe`.', + file=sys.stderr) + sys.exit(1) + +TEMPLATE = jinja2.Environment( + loader = jinja2.FileSystemLoader(searchpath=os.path.dirname(__file__)), + trim_blocks=True, +).get_template('utils_gen.c.j2') + +class Obj(object): + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + +class VkXML(ET.ElementTree): + def blacklist_block(self, req): + for t in req.iterfind('type'): + self.blacklist_types.add(t.attrib['name']) + for e in req.iterfind('enum'): + self.blacklist_enums.add(e.attrib['name']) + + def __init__(self, *args, **kwargs): + + super().__init__(*args, **kwargs) + self.blacklist_types = set() + self.blacklist_enums = set() + + for f in self.iterfind('feature'): + # Feature block for non-Vulkan API + if not 'vulkan' in f.attrib['api'].split(','): + for r in f.iterfind('require'): + self.blacklist_block(r) + + for e in self.iterfind('extensions/extension'): + # Entire extension is unsupported on vulkan or platform-specifid + if not 'vulkan' in e.attrib['supported'].split(',') or 'platform' in e.attrib: + for r in e.iterfind('require'): + self.blacklist_block(r) + continue + + # Only individual <require> blocks are API-specific + for r in e.iterfind('require[@api]'): + if not 'vulkan' in r.attrib['api'].split(','): + self.blacklist_block(r) + + def findall_enum(self, name): + for e in self.iterfind('enums[@name="{0}"]/enum'.format(name)): + if not 'alias' in e.attrib: + if not e.attrib['name'] in self.blacklist_enums: + yield e + for e in self.iterfind('.//enum[@extends="{0}"]'.format(name)): + if not 'alias' in e.attrib: + if not e.attrib['name'] in self.blacklist_enums: + yield e + + def findall_type(self, category): + for t in self.iterfind('types/type[@category="{0}"]'.format(category)): + name = t.attrib.get('name') or t.find('name').text + if name in self.blacklist_types: + continue + yield t + + +def get_vkenum(registry, enum): + for e in registry.findall_enum(enum): + yield e.attrib['name'] + +def get_vkobjects(registry): + for t in registry.findall_type('handle'): + if 'objtypeenum' in t.attrib: + yield Obj(enum = t.attrib['objtypeenum'], + name = t.find('name').text) + +def get_vkstructs(registry): + for t in registry.findall_type('struct'): + stype = None + for m in t.iterfind('member'): + if m.find('name').text == 'sType': + stype = m + break + + if stype is not None and 'values' in stype.attrib: + yield Obj(stype = stype.attrib['values'], + name = t.attrib['name']) + +def get_vkaccess(registry): + access = Obj(read = 0, write = 0) + for e in registry.findall_enum('VkAccessFlagBits2'): + if '_READ_' in e.attrib['name']: + access.read |= 1 << int(e.attrib['bitpos']) + if '_WRITE_' in e.attrib['name']: + access.write |= 1 << int(e.attrib['bitpos']) + return access + +def get_vkexts(registry): + for e in registry.iterfind('extensions/extension'): + promoted_ver = None + if res := re.match(r'VK_VERSION_(\d)_(\d)', e.attrib.get('promotedto', '')): + promoted_ver = 'VK_API_VERSION_{0}_{1}'.format(res[1], res[2]) + yield Obj(name = e.attrib['name'], + promoted_ver = promoted_ver) + +def get_vkfeatures(registry): + structs = []; + featuremap = {}; # features -> [struct] + for t in registry.findall_type('struct'): + sname = t.attrib['name'] + is_base = sname == 'VkPhysicalDeviceFeatures' + extends = t.attrib.get('structextends', []) + if is_base: + sname = 'VkPhysicalDeviceFeatures2' + stype = 'VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2' + elif not 'VkPhysicalDeviceFeatures2' in extends: + 
continue + + features = [] + for f in t.iterfind('member'): + if f.find('type').text == 'VkStructureType': + stype = f.attrib['values'] + elif f.find('type').text == 'VkBool32': + fname = f.find('name').text + if is_base: + fname = 'features.' + fname + features.append(Obj(name = fname)) + + core_ver = None + if res := re.match(r'VkPhysicalDeviceVulkan(\d)(\d)Features', sname): + core_ver = 'VK_API_VERSION_{0}_{1}'.format(res[1], res[2]) + + struct = Obj(name = sname, + stype = stype, + core_ver = core_ver, + is_base = is_base, + features = features) + + structs.append(struct) + for f in features: + featuremap.setdefault(f.name, []).append(struct) + + for s in structs: + for f in s.features: + f.replacements = featuremap[f.name] + core_ver = next(( r.core_ver for r in f.replacements if r.core_ver ), None) + for r in f.replacements: + if not r.core_ver: + r.max_ver = core_ver + + yield from structs + +def find_registry_xml(datadir): + registry_paths = [ + '{0}/vulkan/registry/vk.xml'.format(datadir), + '$MINGW_PREFIX/share/vulkan/registry/vk.xml', + '%VULKAN_SDK%/share/vulkan/registry/vk.xml', + '$VULKAN_SDK/share/vulkan/registry/vk.xml', + '/usr/share/vulkan/registry/vk.xml', + ] + + for p in registry_paths: + path = os.path.expandvars(p) + if os.path.isfile(path): + print('Found vk.xml: {0}'.format(path)) + return path + + print('Could not find the vulkan registry (vk.xml), please specify its ' + 'location manually using the -Dvulkan-registry=/path/to/vk.xml ' + 'option!', file=sys.stderr) + sys.exit(1) + +if __name__ == '__main__': + assert len(sys.argv) == 4 + datadir = sys.argv[1] + xmlfile = sys.argv[2] + outfile = sys.argv[3] + + if not xmlfile or xmlfile == '': + xmlfile = find_registry_xml(datadir) + + registry = VkXML(ET.parse(xmlfile)) + with open(outfile, 'w') as f: + f.write(TEMPLATE.render( + vkresults = get_vkenum(registry, 'VkResult'), + vkformats = get_vkenum(registry, 'VkFormat'), + vkspaces = get_vkenum(registry, 'VkColorSpaceKHR'), + vkhandles = get_vkenum(registry, 'VkExternalMemoryHandleTypeFlagBits'), + vkalphas = get_vkenum(registry, 'VkCompositeAlphaFlagBitsKHR'), + vktransforms = get_vkenum(registry, 'VkSurfaceTransformFlagBitsKHR'), + vkobjects = get_vkobjects(registry), + vkstructs = get_vkstructs(registry), + vkaccess = get_vkaccess(registry), + vkexts = get_vkexts(registry), + vkfeatures = get_vkfeatures(registry), + )) diff --git a/tools/glsl_preproc/macros.py b/tools/glsl_preproc/macros.py new file mode 100644 index 0000000..2ba7e21 --- /dev/null +++ b/tools/glsl_preproc/macros.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 + +import re + +from variables import Var +from templates import * +from statement import * + +PATTERN_PRAGMA = re.compile(flags=re.VERBOSE, pattern=r''' +\s*\#\s*pragma\s+ # '#pragma' +(?P<pragma>(?: # pragma name + GLSL[PHF]? 
+))\s* +(?P<rest>.*)$ # rest of line (pragma body) +''') + +# Represents a single #pragma macro +class Macro(object): + PRAGMAS = { + 'GLSL': 'SH_BUF_BODY', + 'GLSLP': 'SH_BUF_PRELUDE', + 'GLSLH': 'SH_BUF_HEADER', + 'GLSLF': 'SH_BUF_FOOTER', + } + + def __init__(self, linenr=0, type='GLSL'): + self.linenr = linenr + self.buf = Macro.PRAGMAS[type] + self.name = '_glsl_' + str(linenr) + self.body = [] # list of statements + self.last = None # previous GLSLBlock (if unterminated) + self.vars = VarSet() + + def needs_single_line(self): + if not self.body: + return False + prev = self.body[-1] + return isinstance(prev, BlockStart) and not prev.multiline + + def push_line(self, line): + self.vars.merge(line.vars) + + if isinstance(line, GLSLLine): + if self.last: + self.last.append(line) + elif self.needs_single_line(): + self.body.append(GLSLBlock(line)) + else: + # start new GLSL block + self.last = GLSLBlock(line) + self.body.append(self.last) + else: + self.body.append(line) + self.last = None + + def render_struct(self): + return STRUCT_TEMPLATE.render(macro=self) + + def render_call(self): + return CALL_TEMPLATE.render(macro=self) + + def render_fun(self): + return FUNCTION_TEMPLATE.render(macro=self, Var=Var) + + # yields output lines + @staticmethod + def process_file(lines, strip=False): + macro = None + macros = [] + + for linenr, line_orig in enumerate(lines, start=1): + line = line_orig.rstrip() + + # Strip leading spaces, due to C indent. Skip first pragma line. + if macro and leading_spaces is None: + leading_spaces = len(line) - len(line.lstrip()) + + # check for start of macro + if not macro: + leading_spaces = None + if result := re.match(PATTERN_PRAGMA, line): + macro = Macro(linenr, type=result['pragma']) + line = result['rest'] # strip pragma prefix + + if macro: + if leading_spaces: + line = re.sub(f'^\s{{1,{leading_spaces}}}', '', line) + if more_lines := line.endswith('\\'): + line = line[:-1] + if statement := Statement.parse(line, strip=strip, linenr=linenr): + macro.push_line(statement) + if more_lines: + continue # stay in macro + else: + yield macro.render_call() + yield '#line {}\n'.format(linenr + 1) + macros.append(macro) + macro = None + else: + yield line_orig + + if macros: + yield '\n// Auto-generated template functions:' + for macro in macros: + yield macro.render_fun() diff --git a/tools/glsl_preproc/main.py b/tools/glsl_preproc/main.py new file mode 100755 index 0000000..fcfea1f --- /dev/null +++ b/tools/glsl_preproc/main.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import sys +import argparse + +from macros import Macro + +parser = argparse.ArgumentParser() +parser.add_argument('input') +parser.add_argument('output') +parser.add_argument('-s', '--strip', default=False, action='store_true') +args = parser.parse_args() + +with open(args.input) as infile: + with open(args.output, 'w') as outfile: + for line in Macro.process_file(infile, strip=args.strip): + outfile.write(line) diff --git a/tools/glsl_preproc/meson.build b/tools/glsl_preproc/meson.build new file mode 100644 index 0000000..677ef7c --- /dev/null +++ b/tools/glsl_preproc/meson.build @@ -0,0 +1,13 @@ +strip_arg = get_option('debug') ? 
[] : [ '--strip' ] +glsl_preproc = [ python, join_paths(meson.current_source_dir(), 'main.py') ] + \ + strip_arg + [ '@INPUT@', '@OUTPUT@' ] +glsl_deps = files( + 'macros.py', + 'statement.py', + 'templates.py', + 'templates/call.c.j2', + 'templates/function.c.j2', + 'templates/glsl_block.c.j2', + 'templates/struct.c.j2', + 'variables.py', +) diff --git a/tools/glsl_preproc/statement.py b/tools/glsl_preproc/statement.py new file mode 100644 index 0000000..8641e94 --- /dev/null +++ b/tools/glsl_preproc/statement.py @@ -0,0 +1,301 @@ +import re + +from templates import GLSL_BLOCK_TEMPLATE +from variables import VarSet, slugify + +VAR_PATTERN = re.compile(flags=re.VERBOSE, pattern=r''' + # long form ${ ... } syntax + \${ (?:\s*(?P<type>(?: # optional type prefix + ident # identifiers (always dynamic) + | (?:(?:const|dynamic)\s+)? # optional const/dynamic modifiers + (?:float|u?int) # base type + | swizzle # swizzle mask + | (?:i|u)?vecType # vector type (for mask) + )):)? + (?P<expr>[^{}]+) + } +| \$(?P<name>\w+) # reference to captured variable +| @(?P<var>\w+) # reference to locally defined var +''') + +class FmtSpec(object): + def __init__(self, ctype='ident_t', fmtstr='_%hx', + wrap_expr=lambda name, expr: expr, + fmt_expr=lambda name: name): + self.ctype = ctype + self.fmtstr = fmtstr + self.wrap_expr = wrap_expr + self.fmt_expr = fmt_expr + + @staticmethod + def wrap_var(type, dynamic=False): + if dynamic: + return lambda name, expr: f'sh_var_{type}(sh, "{name}", {expr}, true)' + else: + return lambda name, expr: f'sh_const_{type}(sh, "{name}", {expr})' + + @staticmethod + def wrap_fn(fn): + return lambda name: f'{fn}({name})' + +VAR_TYPES = { + # identifiers: get mapped as-is + 'ident': FmtSpec(), + + # normal variables: get mapped as shader constants + 'int': FmtSpec(wrap_expr=FmtSpec.wrap_var('int')), + 'uint': FmtSpec(wrap_expr=FmtSpec.wrap_var('uint')), + 'float': FmtSpec(wrap_expr=FmtSpec.wrap_var('float')), + + # constant variables: get printed directly into the source code + 'const int': FmtSpec(ctype='int', fmtstr='%d'), + 'const uint': FmtSpec(ctype='unsigned', fmtstr='uint(%u)'), + 'const float': FmtSpec(ctype='float', fmtstr='float(%f)'), + + # dynamic variables: get loaded as shader variables + 'dynamic int': FmtSpec(wrap_expr=FmtSpec.wrap_var('int', dynamic=True)), + 'dynamic uint': FmtSpec(wrap_expr=FmtSpec.wrap_var('uint', dynamic=True)), + 'dynamic float': FmtSpec(wrap_expr=FmtSpec.wrap_var('float', dynamic=True)), + + # component mask types + 'swizzle': FmtSpec(ctype='uint8_t', fmtstr='%s', fmt_expr=FmtSpec.wrap_fn('sh_swizzle')), + 'ivecType': FmtSpec(ctype='uint8_t', fmtstr='%s', fmt_expr=FmtSpec.wrap_fn('sh_float_type')), + 'uvecType': FmtSpec(ctype='uint8_t', fmtstr='%s', fmt_expr=FmtSpec.wrap_fn('sh_float_type')), + 'vecType': FmtSpec(ctype='uint8_t', fmtstr='%s', fmt_expr=FmtSpec.wrap_fn('sh_float_type')), +} + +def stringify(value, strip): + end = '\\n"' + if strip: + end = '"' + value = re.sub(r'(?:\/\*[^\*]*\*\/|\/\/[^\n]+|^\s*)', '', value) + return '"' + value.replace('\\', '\\\\').replace('"', '\\"') + end + +def commentify(value, strip): + if strip: + return '' + return '/*' + value.replace('/*', '[[').replace('*/', ']]') + '*/' + +# Represents a statement + its enclosed variables +class Statement(object): + def __init__(self, linenr=0): + super().__init__() + self.linenr = linenr + self.vars = VarSet() + + def add_var(self, ctype, expr, name=None): + return self.vars.add_var(ctype, expr, name, self.linenr) + + def render(self): + raise NotImplementedError 
+ + @staticmethod + def parse(text_orig, **kwargs): + raise NotImplementedError + +# Represents a single line of GLSL +class GLSLLine(Statement): + class GLSLVar(object): # variable reference + def __init__(self, fmt, var): + self.fmt = fmt + self.var = var + + def __init__(self, text, strip=False, **kwargs): + super().__init__(**kwargs) + self.refs = [] + self.strip = strip + + # produce two versions of line, one for printf() and one for append() + text = text.rstrip() + self.rawstr = stringify(text, strip) + self.fmtstr = stringify(re.sub(VAR_PATTERN, self.handle_var, text.replace('%', '%%')), strip) + + def handle_var(self, match): + # local @var + if match['var']: + self.refs.append(match['var']) + return '%d' + + # captured $var + type = match['type'] + name = match['name'] + expr = match['expr'] or name + name = name or slugify(expr) + + fmt = VAR_TYPES[type or 'ident'] + self.refs.append(fmt.fmt_expr(self.add_var( + ctype = fmt.ctype, + expr = fmt.wrap_expr(name, expr), + name = name, + ))) + + if fmt.ctype == 'ident_t': + return commentify(name, self.strip) + fmt.fmtstr + else: + return fmt.fmtstr + +# Represents an entire GLSL block +class GLSLBlock(Statement): + def __init__(self, line): + super().__init__(linenr=line.linenr) + self.lines = [] + self.refs = [] + self.append(line) + + def append(self, line): + assert isinstance(line, GLSLLine) + self.lines.append(line) + self.refs += line.refs + self.vars.merge(line.vars) + + def render(self): + return GLSL_BLOCK_TEMPLATE.render(block=self) + +# Represents a statement which can either take a single line or a block +class BlockStart(Statement): + def __init__(self, multiline=False, **kwargs): + super().__init__(**kwargs) + self.multiline = multiline + + def add_brace(self, text): + if self.multiline: + text += ' {' + return text + +# Represents an @if +class IfCond(BlockStart): + def __init__(self, cond, inner=False, **kwargs): + super().__init__(**kwargs) + self.cond = cond if inner else self.add_var('bool', expr=cond) + + def render(self): + return self.add_brace(f'if ({self.cond})') + +# Represents an @else +class Else(BlockStart): + def __init__(self, closing, **kwargs): + super().__init__(**kwargs) + self.closing = closing + + def render(self): + text = '} else' if self.closing else 'else' + return self.add_brace(text) + +# Represents a normal (integer) @for loop, or an (unsigned 8-bit) bitmask loop +class ForLoop(BlockStart): + def __init__(self, var, op, bound, **kwargs): + super().__init__(**kwargs) + self.comps = op == ':' + self.bound = self.add_var('uint8_t' if self.comps else 'int', expr=bound) + self.var = var + self.op = op + + def render(self): + if self.comps: + loopstart = f'uint8_t _mask = {self.bound}, {self.var}' + loopcond = f'_mask && ({self.var} = __builtin_ctz(_mask), 1)' + loopstep = f'_mask &= ~(1u << {self.var})' + else: + loopstart = f'int {self.var} = 0' + loopcond = f'{self.var} {self.op} {self.bound}' + loopstep = f'{self.var}++' + + return self.add_brace(f'for ({loopstart}; {loopcond}; {loopstep})') + +# Represents a @switch block +class Switch(Statement): + def __init__(self, expr, **kwargs): + super().__init__(**kwargs) + self.expr = self.add_var('unsigned', expr=expr) + + def render(self): + return f'switch ({self.expr}) {{' + +# Represents a @case label +class Case(Statement): + def __init__(self, label, **kwargs): + super().__init__(**kwargs) + self.label = label + + def render(self): + return f'case {self.label}:' + +# Represents a @default line +class Default(Statement): + def render(self): + 
return 'default:' + +# Represents a @break line +class Break(Statement): + def render(self): + return 'break;' + +# Represents a single closing brace +class EndBrace(Statement): + def render(self): + return '}' + +# Shitty regex-based statement parser +PATTERN_IF = re.compile(flags=re.VERBOSE, pattern=r''' +@\s*if\s* # '@if' +(?P<inner>@)? # optional leading @ +\((?P<cond>.+)\)\s* # (condition) +(?P<multiline>{)?\s* # optional trailing { +$''') + +PATTERN_ELSE = re.compile(flags=re.VERBOSE, pattern=r''' +@\s*(?P<closing>})?\s* # optional leading } +else\s* # 'else' +(?P<multiline>{)?\s* # optional trailing { +$''') + +PATTERN_FOR = re.compile(flags=re.VERBOSE, pattern=r''' +@\s*for\s+\( # '@for' ( +(?P<var>\w+)\s* # loop variable name +(?P<op>(?:\<=?|:))(?=[\w\s])\s* # '<', '<=' or ':', followed by \s or \w +(?P<bound>[^\s].*)\s* # loop boundary expression +\)\s*(?P<multiline>{)?\s* # ) and optional trailing { +$''') + +PATTERN_SWITCH = re.compile(flags=re.VERBOSE, pattern=r''' +@\s*switch\s* # '@switch' +\((?P<expr>.+)\)\s*{ # switch expression +$''') + +PATTERN_CASE = re.compile(flags=re.VERBOSE, pattern=r''' +@\s*case\s* # '@case' +(?P<label>[^:]+):? # case label, optionally followed by : +$''') + +PATTERN_BREAK = r'@\s*break;?\s*$' +PATTERN_DEFAULT = r'@\s*default:?\s*$' +PATTERN_BRACE = r'@\s*}\s*$' + +PARSERS = { + PATTERN_IF: lambda r, **kw: IfCond(r['cond'], inner=r['inner'], multiline=r['multiline'], **kw), + PATTERN_ELSE: lambda r, **kw: Else(closing=r['closing'], multiline=r['multiline'], **kw), + PATTERN_FOR: lambda r, **kw: ForLoop(r['var'], r['op'], r['bound'], multiline=r['multiline'], **kw), + PATTERN_SWITCH: lambda r, **kw: Switch(r['expr'], **kw), + PATTERN_CASE: lambda r, **kw: Case(r['label'], **kw), + PATTERN_BREAK: lambda _, **kw: Break(**kw), + PATTERN_DEFAULT: lambda _, **kw: Default(**kw), + PATTERN_BRACE: lambda _, **kw: EndBrace(**kw), +} + +def parse_line(text_orig, strip, **kwargs): + # skip empty lines + text = text_orig.strip() + if not text: + return None + if text.lstrip().startswith('@'): + # try parsing as statement + for pat, fun in PARSERS.items(): + if res := re.match(pat, text): + return fun(res, **kwargs) + # return generic error for unrecognized statements + raise SyntaxError('Syntax error in directive: ' + text.lstrip()) + else: + # default to literal GLSL line + return GLSLLine(text_orig, strip, **kwargs) + +Statement.parse = parse_line diff --git a/tools/glsl_preproc/templates.py b/tools/glsl_preproc/templates.py new file mode 100644 index 0000000..b3b6c44 --- /dev/null +++ b/tools/glsl_preproc/templates.py @@ -0,0 +1,14 @@ +import jinja2 +import os.path + +TEMPLATEDIR = os.path.dirname(__file__) + '/templates' +TEMPLATES = jinja2.Environment( + loader = jinja2.FileSystemLoader(searchpath=TEMPLATEDIR), + lstrip_blocks = True, + trim_blocks = True, +) + +GLSL_BLOCK_TEMPLATE = TEMPLATES.get_template('glsl_block.c.j2') +FUNCTION_TEMPLATE = TEMPLATES.get_template('function.c.j2') +CALL_TEMPLATE = TEMPLATES.get_template('call.c.j2') +STRUCT_TEMPLATE = TEMPLATES.get_template('struct.c.j2') diff --git a/tools/glsl_preproc/templates/call.c.j2 b/tools/glsl_preproc/templates/call.c.j2 new file mode 100644 index 0000000..61ee6c0 --- /dev/null +++ b/tools/glsl_preproc/templates/call.c.j2 @@ -0,0 +1,19 @@ +{ +{% if macro.vars %} + const {{ macro.render_struct() }} {{ macro.name }}_args = { + {% for var in macro.vars %} +#line {{ var.linenr }} + .{{ var.name }} = {{ var.expr }}, + {% endfor %} + }; +#line {{ macro.linenr }} +{% endif %} + size_t {{ macro.name 
}}_fn(void *, pl_str *, const uint8_t *); +{% if macro.vars %} + pl_str_builder_append(sh->buffers[{{ macro.buf }}], {{ macro.name }}_fn, + &{{ macro.name }}_args, sizeof({{ macro.name }}_args)); +{% else %} + pl_str_builder_append(sh->buffers[{{ macro.buf }}], {{ macro.name }}_fn, NULL, 0); +{% endif %} +} + diff --git a/tools/glsl_preproc/templates/function.c.j2 b/tools/glsl_preproc/templates/function.c.j2 new file mode 100644 index 0000000..9216472 --- /dev/null +++ b/tools/glsl_preproc/templates/function.c.j2 @@ -0,0 +1,19 @@ + +size_t {{ macro.name }}_fn(void *alloc, pl_str *buf, const uint8_t *ptr); +size_t {{ macro.name }}_fn(void *alloc, pl_str *buf, const uint8_t *ptr) +{ +{% if macro.vars %} +{{ macro.render_struct() }} {{ Var.STRUCT_NAME }}; +memcpy(&{{ Var.STRUCT_NAME }}, ptr, sizeof({{ Var.STRUCT_NAME }})); +{% endif %} + +{% for statement in macro.body %} +{{ statement.render() }} +{% endfor %} + +{% if macro.vars %} +return sizeof({{ Var.STRUCT_NAME }}); +{% else %} +return 0; +{% endif %} +} diff --git a/tools/glsl_preproc/templates/glsl_block.c.j2 b/tools/glsl_preproc/templates/glsl_block.c.j2 new file mode 100644 index 0000000..aa8372d --- /dev/null +++ b/tools/glsl_preproc/templates/glsl_block.c.j2 @@ -0,0 +1,17 @@ +#line {{ block.linenr }} +{% if block.refs %} + pl_str_append_asprintf_c(alloc, buf, + {% for line in block.lines %} + {{ line.fmtstr }}{{ ',' if loop.last }} + {% endfor %} + {% for ref in block.refs %} + {{ ref }}{{ ',' if not loop.last }} + {% endfor %} + ); +{% else %} + pl_str_append(alloc, buf, pl_str0( + {% for line in block.lines %} + {{ line.rawstr }} + {% endfor %} + )); +{% endif %} diff --git a/tools/glsl_preproc/templates/struct.c.j2 b/tools/glsl_preproc/templates/struct.c.j2 new file mode 100644 index 0000000..6a6a8fb --- /dev/null +++ b/tools/glsl_preproc/templates/struct.c.j2 @@ -0,0 +1,5 @@ +struct __attribute__((__packed__)) { +{% for var in macro.vars %} + {{ var.ctype }} {{ var.name }}; +{% endfor %} +} diff --git a/tools/glsl_preproc/variables.py b/tools/glsl_preproc/variables.py new file mode 100644 index 0000000..187fd79 --- /dev/null +++ b/tools/glsl_preproc/variables.py @@ -0,0 +1,79 @@ +import re + +def slugify(value): + value = re.sub(r'[^\w]+', '_', value.lower()).strip('_') + if value[:1].isdigit(): + value = '_' + value + return value + +# A single variable (enclosed by the template) +class Var(object): + STRUCT_NAME = 'vars' + CSIZES = { + # This array doesn't have to be exact, it's only used for sorting + # struct members to save a few bytes of memory here and there + 'int': 4, + 'unsigned': 4, + 'float': 4, + 'ident_t': 2, + 'uint8_t': 1, + 'bool': 1, + } + + def __init__(self, ctype, expr, name, csize=0, linenr=0): + self.ctype = ctype + self.csize = csize or Var.CSIZES[ctype] + self.expr = expr + self.name = name + self.linenr = linenr + + def __str__(self): + return f'{Var.STRUCT_NAME}.{self.name}' + +def is_literal(expr): + return expr.isnumeric() or expr in ['true', 'false'] + +# A (deduplicated) set of variables +class VarSet(object): + def __init__(self): + self.varmap = {} # expr -> cvar + + def __iter__(self): + # Sort from largest to smallest variable to optimize struct padding + yield from sorted(self.varmap.values(), + reverse=True, + key=lambda v: v.csize, + ) + + def __bool__(self): + return True if self.varmap else False + + def add_var_raw(self, var): + # Re-use existing entry for identical expression/type pairs + if old := self.varmap.get(var.expr): + if var.ctype != old.ctype: + raise 
SyntaxError(f'Conflicting types for expression {var.expr}, ' + f'got {var.ctype}, expected {old.ctype}') + assert old.name == var.name + return old + + names = [ v.name for v in self.varmap.values() ] + while var.name in names: + var.name += '_' + self.varmap[var.expr] = var + return var + + # Returns the added variable + def add_var(self, ctype, expr, name=None, linenr=0): + assert expr + expr = expr.strip() + if is_literal(expr): + return expr + name = name or slugify(expr) + + var = Var(ctype, expr=expr, name=name, linenr=linenr) + return self.add_var_raw(var) + + def merge(self, other): + for var in other: + self.add_var_raw(var) diff --git a/tools/meson.build b/tools/meson.build new file mode 100644 index 0000000..bba9c7c --- /dev/null +++ b/tools/meson.build @@ -0,0 +1 @@ +subdir('glsl_preproc') diff --git a/win32/demos.manifest b/win32/demos.manifest new file mode 100644 index 0000000..26162e2 --- /dev/null +++ b/win32/demos.manifest @@ -0,0 +1,16 @@ +<?xml version="1.0" encoding="UTF-8" standalone="yes"?> +<assembly manifestVersion="1.0" xmlns="urn:schemas-microsoft-com:asm.v1" xmlns:asmv3="urn:schemas-microsoft-com:asm.v3"> + <compatibility xmlns="urn:schemas-microsoft-com:compatibility.v1"> + <description>libplacebo</description> + <application> + <!-- Windows 10 and Windows 11 --> + <supportedOS Id="{8e0f7a12-bfb3-4fe8-b9a5-48fd50a15a9a}"/> + <!-- Windows 8.1 --> + <supportedOS Id="{1f676c76-80e1-4239-95bb-83d0f6d0da78}"/> + <!-- Windows 8 --> + <supportedOS Id="{4a2f28e3-53b9-4441-ba9c-d69d4a4a6e38}"/> + <!-- Windows 7 --> + <supportedOS Id="{35138b9a-5d96-4fbd-8e2d-a2440225f93a}"/> + </application> + </compatibility> +</assembly> diff --git a/win32/demos.rc.in b/win32/demos.rc.in new file mode 100644 index 0000000..2357e65 --- /dev/null +++ b/win32/demos.rc.in @@ -0,0 +1,49 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma code_page(65001) + +#include <winuser.h> +#include <winver.h> +#include "version.h" + +VS_VERSION_INFO VERSIONINFO +FILEVERSION @PL_MAJOR@, @PL_MINOR@, @PL_PATCH@ +PRODUCTVERSION @PL_MAJOR@, @PL_MINOR@, @PL_PATCH@ +FILEFLAGSMASK VS_FFI_FILEFLAGSMASK +FILEFLAGS 0 +FILEOS VOS__WINDOWS32 +FILETYPE VFT_APP +FILESUBTYPE VFT2_UNKNOWN +{ + BLOCK "StringFileInfo" { + BLOCK "000004b0" { + VALUE "Comments", "libplacebo is distributed under the terms of the GNU Lesser General Public License, version 2.1" + VALUE "CompanyName", "libplacebo" + VALUE "FileDescription", "libplacebo" + VALUE "FileVersion", BUILD_VERSION + VALUE "LegalCopyright", "Copyright © 2017-2023 libplacebo project" + VALUE "ProductName", "libplacebo" + VALUE "ProductVersion", BUILD_VERSION + } + } + BLOCK "VarFileInfo" { + VALUE "Translation", 0, 1200 + } +} + +CREATEPROCESS_MANIFEST_RESOURCE_ID RT_MANIFEST "demos.manifest" diff --git a/win32/libplacebo.manifest b/win32/libplacebo.manifest new file mode 100644 index 0000000..ca49eac --- /dev/null +++ b/win32/libplacebo.manifest @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="UTF-8" standalone="yes"?> +<assembly manifestVersion="1.0" xmlns="urn:schemas-microsoft-com:asm.v1" xmlns:asmv3="urn:schemas-microsoft-com:asm.v3"> + <compatibility xmlns="urn:schemas-microsoft-com:compatibility.v1"> + <assemblyIdentity + version="1.0.0.0" + processorArchitecture="*" + name="libplacebo.dll" + type="win32" + /> + <description>libplacebo</description> + <application> + <!-- Windows 10 and Windows 11 --> + <supportedOS Id="{8e0f7a12-bfb3-4fe8-b9a5-48fd50a15a9a}"/> + <!-- Windows 8.1 --> + <supportedOS Id="{1f676c76-80e1-4239-95bb-83d0f6d0da78}"/> + <!-- Windows 8 --> + <supportedOS Id="{4a2f28e3-53b9-4441-ba9c-d69d4a4a6e38}"/> + <!-- Windows 7 --> + <supportedOS Id="{35138b9a-5d96-4fbd-8e2d-a2440225f93a}"/> + </application> + </compatibility> +</assembly> diff --git a/win32/libplacebo.rc.in b/win32/libplacebo.rc.in new file mode 100644 index 0000000..a665e47 --- /dev/null +++ b/win32/libplacebo.rc.in @@ -0,0 +1,50 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma code_page(65001) + +#include <winuser.h> +#include <winver.h> +#include "version.h" + +VS_VERSION_INFO VERSIONINFO +FILEVERSION @PL_MAJOR@, @PL_MINOR@, @PL_PATCH@ +PRODUCTVERSION @PL_MAJOR@, @PL_MINOR@, @PL_PATCH@ +FILEFLAGSMASK VS_FFI_FILEFLAGSMASK +FILEFLAGS 0 +FILEOS VOS__WINDOWS32 +FILETYPE VFT_DLL +FILESUBTYPE VFT2_UNKNOWN +{ + BLOCK "StringFileInfo" { + BLOCK "000004b0" { + VALUE "Comments", "libplacebo is distributed under the terms of the GNU Lesser General Public License, version 2.1" + VALUE "CompanyName", "libplacebo" + VALUE "FileDescription", "libplacebo" + VALUE "FileVersion", BUILD_VERSION + VALUE "LegalCopyright", "Copyright © 2017-2023 libplacebo project" + VALUE "OriginalFilename", "libplacebo-@PL_MINOR@.dll" + VALUE "ProductName", "libplacebo" + VALUE "ProductVersion", BUILD_VERSION + } + } + BLOCK "VarFileInfo" { + VALUE "Translation", 0, 1200 + } +} + +CREATEPROCESS_MANIFEST_RESOURCE_ID RT_MANIFEST "libplacebo.manifest" diff --git a/win32/meson.build b/win32/meson.build new file mode 100644 index 0000000..9226312 --- /dev/null +++ b/win32/meson.build @@ -0,0 +1,12 @@ +version_arr = meson.project_version().split('.') +version_config = configuration_data() +version_config.set('PL_MAJOR', version_arr[0]) +version_config.set('PL_MINOR', version_arr[1]) +version_config.set('PL_PATCH', version_arr[2]) + +libplacebo_rc = configure_file(input: 'libplacebo.rc.in', + output: 'libplacebo.rc', + configuration: version_config) +demos_rc = configure_file(input: 'demos.rc.in', + output: 'demos.rc', + configuration: version_config) |