20 files changed, 10168 insertions, 0 deletions
diff --git a/src/vulkan/command.c b/src/vulkan/command.c
new file mode 100644
index 0000000..5020aff
--- /dev/null
+++ b/src/vulkan/command.c
@@ -0,0 +1,571 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "command.h"
+#include "utils.h"
+
+// returns VK_SUCCESS (completed), VK_TIMEOUT (not yet completed) or an error
+static VkResult vk_cmd_poll(struct vk_cmd *cmd, uint64_t timeout)
+{
+    struct vk_ctx *vk = cmd->pool->vk;
+    return vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) {
+        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+        .semaphoreCount = 1,
+        .pSemaphores = &cmd->sync.sem,
+        .pValues = &cmd->sync.value,
+    }, timeout);
+}
+
+static void flush_callbacks(struct vk_ctx *vk)
+{
+    while (vk->num_pending_callbacks != 0) {
+        const struct vk_callback *next = vk->pending_callbacks++;
+        vk->num_pending_callbacks -= 1; // dequeue first: `run` may re-enter
+        next->run(next->priv, next->arg);
+    }
+}
+
+static void vk_cmd_reset(struct vk_cmd *cmd)
+{
+    struct vk_ctx *ctx = cmd->pool->vk;
+
+    // This reset may itself have been triggered from inside the callback of
+    // an earlier command that is still mid-reset; drain what that command
+    // left behind first, then run this command's own callbacks.
+    flush_callbacks(ctx);
+    ctx->num_pending_callbacks = cmd->callbacks.num;
+    ctx->pending_callbacks = cmd->callbacks.elem;
+    flush_callbacks(ctx);
+
+    // Only the element counts are cleared here
+    cmd->callbacks.num = 0;
+    cmd->deps.num = cmd->sigs.num = 0;
+}
+
+// Waits for `cmd` to finish, runs its callbacks and frees it (NULL is a no-op)
+static void vk_cmd_destroy(struct vk_cmd *cmd)
+{
+    if (!cmd)
+        return;
+    struct vk_ctx *vk = cmd->pool->vk;
+    if (cmd->sync.sem) // absent if vk_cmd_create() failed before creating it
+        vk_cmd_poll(cmd, UINT64_MAX);
+    vk_cmd_reset(cmd);
+    vk->DestroySemaphore(vk->dev, cmd->sync.sem, PL_VK_ALLOC);
+    vk->FreeCommandBuffers(vk->dev, cmd->pool->pool, 1, &cmd->buf);
+    pl_free(cmd);
+}
+
+// Allocate a fresh command: one primary command buffer from `pool`, paired
+// with its own timeline semaphore (initial value 0). Returns NULL on error.
+static struct vk_cmd *vk_cmd_create(struct vk_cmdpool *pool)
+{
+    struct vk_ctx *vk = pool->vk;
+    struct vk_cmd *cmd = pl_zalloc_ptr(NULL, cmd);
+    cmd->pool = pool;
+
+    static const VkSemaphoreTypeCreateInfo timeline_info = {
+        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+        .semaphoreType  = VK_SEMAPHORE_TYPE_TIMELINE,
+        .initialValue   = 0,
+    };
+
+    static const VkSemaphoreCreateInfo sem_info = {
+        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+        .pNext = &timeline_info,
+    };
+
+    const VkCommandBufferAllocateInfo buf_info = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+        .commandPool = pool->pool,
+        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+        .commandBufferCount = 1,
+    };
+
+    VK(vk->AllocateCommandBuffers(vk->dev, &buf_info, &cmd->buf));
+    VK(vk->CreateSemaphore(vk->dev, &sem_info, PL_VK_ALLOC, &cmd->sync.sem));
+    PL_VK_NAME(SEMAPHORE, cmd->sync.sem, "cmd");
+    return cmd;
+
+error:
+    vk_cmd_destroy(cmd);
+    vk->failed = true;
+    return NULL;
+}
+
+void vk_dev_callback(struct vk_ctx *vk, vk_cb callback,
+                     const void *priv, const void *arg)
+{
+    pl_mutex_lock(&vk->lock);
+    if (vk->cmds_pending.num <= 0) {
+        // Nothing is in flight, i.e. the device is idle: run it right away
+        callback((void *) priv, (void *) arg);
+    } else {
+        struct vk_cmd *newest = vk->cmds_pending.elem[vk->cmds_pending.num - 1];
+        vk_cmd_callback(newest, callback, priv, arg); // attach to last command
+    }
+    pl_mutex_unlock(&vk->lock);
+}
+
+void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback,
+                     const void *priv, const void *arg)
+{
+    // The callback list stores plain `void *`, so constness is dropped here
+    struct vk_callback entry = {
+        .run = callback, .priv = (void *) priv, .arg = (void *) arg,
+    };
+    PL_ARRAY_APPEND(cmd, cmd->callbacks, entry);
+}
+
+void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep)
+{
+    VkSemaphoreSubmitInfo wait = {
+        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+        .stageMask = stage,
+        .value = dep.value,
+        .semaphore = dep.sem,
+    };
+    PL_ARRAY_APPEND(cmd, cmd->deps, wait); // always appended, never merged
+}
+
+void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig)
+{
+    VkSemaphoreSubmitInfo signal = {
+        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+        .stageMask = stage,
+        .value = sig.value,
+        .semaphore = sig.sem,
+    };
+
+    // If this semaphore is already signalled by the command, replace the entry
+    for (int n = 0; n < cmd->sigs.num; n++) {
+        VkSemaphoreSubmitInfo *old = &cmd->sigs.elem[n];
+        if (old->semaphore != sig.sem)
+            continue;
+        pl_assert(sig.value > old->value);
+        *old = signal;
+        return;
+    }
+    PL_ARRAY_APPEND(cmd, cmd->sigs, signal);
+}
+
+#define SET(FLAG, CHECK)  \
+    if (flags2 & (CHECK)) \
+        flags |= FLAG
+
+static VkAccessFlags lower_access2(VkAccessFlags2 flags2)
+{
+    VkAccessFlags flags = flags2 & VK_ACCESS_FLAG_BITS_MAX_ENUM;
+    SET(VK_ACCESS_SHADER_READ_BIT,  VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
+                                    VK_ACCESS_2_SHADER_STORAGE_READ_BIT);
+    SET(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT);
+    return flags;
+}
+
+static VkPipelineStageFlags lower_stage2(VkPipelineStageFlags2 flags2)
+{
+    VkPipelineStageFlags flags = flags2 & VK_PIPELINE_STAGE_FLAG_BITS_MAX_ENUM;
+    SET(VK_PIPELINE_STAGE_TRANSFER_BIT,     VK_PIPELINE_STAGE_2_COPY_BIT |
+                                            VK_PIPELINE_STAGE_2_RESOLVE_BIT |
+                                            VK_PIPELINE_STAGE_2_BLIT_BIT |
+                                            VK_PIPELINE_STAGE_2_CLEAR_BIT);
+    SET(VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
+                                            VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT);
+    return flags;
+}
+
+#undef SET
+
+void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info)
+{
+    struct vk_ctx *vk = cmd->pool->vk;
+    if (vk->CmdPipelineBarrier2KHR) {
+        vk->CmdPipelineBarrier2KHR(cmd->buf, info);
+        return;
+    }
+    // Legacy fallback: handles exactly one buffer *or* one image barrier
+    pl_assert(!info->pNext);
+    pl_assert(info->memoryBarrierCount == 0);
+    pl_assert(info->bufferMemoryBarrierCount + info->imageMemoryBarrierCount == 1);
+
+    if (!info->bufferMemoryBarrierCount) {
+
+        const VkImageMemoryBarrier2 *img2 = info->pImageMemoryBarriers;
+        const VkImageMemoryBarrier img = {
+            .sType               = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+            .pNext               = img2->pNext,
+            .srcAccessMask       = lower_access2(img2->srcAccessMask),
+            .dstAccessMask       = lower_access2(img2->dstAccessMask),
+            .oldLayout           = img2->oldLayout,
+            .newLayout           = img2->newLayout,
+            .srcQueueFamilyIndex = img2->srcQueueFamilyIndex,
+            .dstQueueFamilyIndex = img2->dstQueueFamilyIndex,
+            .image               = img2->image,
+            .subresourceRange    = img2->subresourceRange,
+        };
+
+        vk->CmdPipelineBarrier(cmd->buf, lower_stage2(img2->srcStageMask),
+                               lower_stage2(img2->dstStageMask),
+                               info->dependencyFlags,
+                               0, NULL, 0, NULL, 1, &img);
+
+    } else {
+
+        const VkBufferMemoryBarrier2 *buf2 = info->pBufferMemoryBarriers;
+        const VkBufferMemoryBarrier buf = {
+            .sType               = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+            .pNext               = buf2->pNext,
+            .srcAccessMask       = lower_access2(buf2->srcAccessMask),
+            .dstAccessMask       = lower_access2(buf2->dstAccessMask),
+            .srcQueueFamilyIndex = buf2->srcQueueFamilyIndex,
+            .dstQueueFamilyIndex = buf2->dstQueueFamilyIndex,
+            .buffer              = buf2->buffer,
+            .offset              = buf2->offset,
+            .size                = buf2->size,
+        };
+
+        vk->CmdPipelineBarrier(cmd->buf, lower_stage2(buf2->srcStageMask),
+                               lower_stage2(buf2->dstStageMask),
+                               info->dependencyFlags,
+                               0, NULL, 1, &buf, 0, NULL);
+    }
+}
+
+struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem,
+                                    VkPipelineStageFlags2 stage,
+                                    VkAccessFlags2 access, bool is_trans)
+{
+    bool is_write = (access & vk_access_write) || is_trans;
+
+    // Writes need to be synchronized against the last *read* (which is
+    // transitively synchronized against the last write), reads only
+    // need to be synchronized against the last write.
+    struct vk_sync_scope last = sem->write;
+    if (is_write && sem->read.access)
+        last = sem->read;
+
+    if (last.queue != cmd->queue) {
+        if (!is_write && sem->read.queue == cmd->queue) {
+            // No semaphore needed in this case because the implicit submission
+            // order execution dependencies already transitively imply a wait
+            // for the previous write
+        } else if (last.sync.sem) {
+            // Image barrier still needs to depend on this stage for implicit
+            // ordering guarantees to apply properly
+            vk_cmd_dep(cmd, stage, last.sync);
+            last.stage = stage;
+        }
+
+        // Last access is on different queue, so no pipeline barrier needed
+        last.access = 0;
+    }
+
+    if (!is_write && sem->read.queue == cmd->queue &&
+        (sem->read.stage & stage) == stage &&
+        (sem->read.access & access) == access)
+    {
+        // A past pipeline barrier already covers this access transitively, so
+        // we don't need to emit another pipeline barrier at all
+        last.access = 0;
+    }
+    // Record this access in `sem`, so later calls synchronize against it
+    if (is_write) {
+        sem->write = (struct vk_sync_scope) {
+            .sync = cmd->sync,
+            .queue = cmd->queue,
+            .stage = stage,
+            .access = access,
+        };
+
+        sem->read = (struct vk_sync_scope) {
+            .sync = cmd->sync,
+            .queue = cmd->queue,
+            // no stage or access scope, because no reads happened yet
+        };
+    } else if (sem->read.queue == cmd->queue) {
+        // Coalesce multiple same-queue reads into a single access scope
+        sem->read.sync = cmd->sync;
+        sem->read.stage |= stage;
+        sem->read.access |= access;
+    } else {
+        // Read from a different queue: start a fresh read scope
+        sem->read = (struct vk_sync_scope) {
+            .sync = cmd->sync,
+            .queue = cmd->queue,
+            .stage = stage,
+            .access = access,
+        };
+    }
+
+    // We never need to include pipeline barriers for reads, only writes
+    last.access &= vk_access_write;
+    return last;
+}
+
+struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum,
+                                     VkQueueFamilyProperties props)
+{
+    struct vk_cmdpool *pool = pl_alloc_ptr(NULL, pool);
+    *pool = (struct vk_cmdpool) {
+        .vk         = vk,
+        .qf         = qf,
+        .props      = props,
+        .num_queues = qnum,
+        .queues     = pl_calloc(pool, qnum, sizeof(VkQueue)), // child of pool
+    };
+
+    // Look up the first `qnum` queue handles of this queue family
+    for (int idx = 0; idx < qnum; idx++)
+        vk->GetDeviceQueue(vk->dev, qf, idx, pool->queues + idx);
+    const VkCommandPoolCreateInfo pool_info = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+        .queueFamilyIndex = qf,
+        .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT |
+                 VK_COMMAND_POOL_CREATE_TRANSIENT_BIT,
+    };
+
+    VK(vk->CreateCommandPool(vk->dev, &pool_info, PL_VK_ALLOC, &pool->pool));
+    return pool;
+
+error:
+    vk_cmdpool_destroy(pool);
+    vk->failed = true;
+    return NULL;
+}
+
+void vk_cmdpool_destroy(struct vk_cmdpool *pool)
+{
+    if (!pool)
+        return;
+
+    // Tear down the idle commands first, as their buffers come from the pool
+    struct vk_ctx *vk = pool->vk;
+    for (int n = 0; n < pool->cmds.num; n++)
+        vk_cmd_destroy(pool->cmds.elem[n]);
+    vk->DestroyCommandPool(vk->dev, pool->pool, PL_VK_ALLOC);
+    pl_free(pool);
+}
+
+struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag)
+{
+    struct vk_ctx *vk = pool->vk;
+
+    // Recycle finished commands up front, so that the free list below is
+    // more likely to contain something we can reuse.
+    vk_poll_commands(vk, 0);
+
+    struct vk_cmd *cmd = NULL;
+    pl_mutex_lock(&vk->lock);
+    if (!PL_ARRAY_POP(pool->cmds, &cmd))
+        cmd = vk_cmd_create(pool); // free list empty: allocate a new one
+    if (cmd) {
+        // Pin the command to whichever queue the pool currently points at
+        cmd->qindex = pool->idx_queues;
+        cmd->queue = pool->queues[cmd->qindex];
+    }
+    pl_mutex_unlock(&vk->lock);
+    if (!cmd)
+        goto error;
+
+    const VkCommandBufferBeginInfo begin_info = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+    };
+
+    VK(vk->BeginCommandBuffer(cmd->buf, &begin_info));
+
+    debug_tag = PL_DEF(debug_tag, "vk_cmd");
+    PL_VK_NAME_HANDLE(COMMAND_BUFFER, cmd->buf, debug_tag);
+    PL_VK_NAME(SEMAPHORE, cmd->sync.sem, debug_tag);
+
+    // Each recording signals the next value of the command's own semaphore
+    cmd->sync.value++;
+    vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, cmd->sync);
+    return cmd;
+
+error:
+    // Something has to be seriously messed up if we get to this point
+    vk_cmd_destroy(cmd);
+    vk->failed = true;
+    return NULL;
+}
+
+static VkResult vk_queue_submit2(struct vk_ctx *vk, VkQueue queue,
+                                 const VkSubmitInfo2 *info2, VkFence fence)
+{
+    if (vk->QueueSubmit2KHR)
+        return vk->QueueSubmit2KHR(queue, 1, info2, fence);
+    // Fallback: translate into a legacy VkSubmitInfo + timeline value chain
+    const uint32_t num_deps = info2->waitSemaphoreInfoCount;
+    const uint32_t num_sigs = info2->signalSemaphoreInfoCount;
+    const uint32_t num_cmds = info2->commandBufferInfoCount;
+
+    void *tmp = pl_tmp(NULL);
+    VkSemaphore *deps           = pl_calloc_ptr(tmp, num_deps, deps);
+    VkPipelineStageFlags *masks = pl_calloc_ptr(tmp, num_deps, masks);
+    uint64_t *depvals           = pl_calloc_ptr(tmp, num_deps, depvals);
+    VkSemaphore *sigs           = pl_calloc_ptr(tmp, num_sigs, sigs);
+    uint64_t *sigvals           = pl_calloc_ptr(tmp, num_sigs, sigvals);
+    VkCommandBuffer *cmds       = pl_calloc_ptr(tmp, num_cmds, cmds);
+    // Legacy wait masks are 32-bit, so 64-bit-only stage bits must be lowered
+    for (uint32_t i = 0; i < num_deps; i++) {
+        deps[i] = info2->pWaitSemaphoreInfos[i].semaphore;
+        masks[i] = lower_stage2(info2->pWaitSemaphoreInfos[i].stageMask);
+        depvals[i] = info2->pWaitSemaphoreInfos[i].value;
+    }
+    for (uint32_t i = 0; i < num_sigs; i++) {
+        sigs[i] = info2->pSignalSemaphoreInfos[i].semaphore;
+        sigvals[i] = info2->pSignalSemaphoreInfos[i].value;
+    }
+    for (uint32_t i = 0; i < num_cmds; i++)
+        cmds[i] = info2->pCommandBufferInfos[i].commandBuffer;
+
+    const VkTimelineSemaphoreSubmitInfo tinfo = {
+        .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
+        .pNext = info2->pNext,
+        .waitSemaphoreValueCount = num_deps,
+        .pWaitSemaphoreValues = depvals,
+        .signalSemaphoreValueCount = num_sigs,
+        .pSignalSemaphoreValues = sigvals,
+    };
+
+    const VkSubmitInfo info = {
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+        .pNext = &tinfo,
+        .waitSemaphoreCount = num_deps,
+        .pWaitSemaphores = deps,
+        .pWaitDstStageMask = masks,
+        .commandBufferCount = num_cmds,
+        .pCommandBuffers = cmds,
+        .signalSemaphoreCount = num_sigs,
+        .pSignalSemaphores = sigs,
+    };
+
+    VkResult res = vk->QueueSubmit(queue, 1, &info, fence);
+    pl_free(tmp);
+    return res;
+}
+
+bool vk_cmd_submit(struct vk_cmd **pcmd)
+{
+    struct vk_cmd *cmd = *pcmd;
+    if (!cmd)
+        return true;
+    // Ownership is taken from the caller, whether or not submission succeeds
+    *pcmd = NULL;
+    struct vk_cmdpool *pool = cmd->pool;
+    struct vk_ctx *vk = pool->vk;
+    // NOTE(review): VK()/PL_VK_ASSERT presumably jump to `error` on failure
+    VK(vk->EndCommandBuffer(cmd->buf));
+
+    VkSubmitInfo2 sinfo = {
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
+        .waitSemaphoreInfoCount = cmd->deps.num,
+        .pWaitSemaphoreInfos = cmd->deps.elem,
+        .signalSemaphoreInfoCount = cmd->sigs.num,
+        .pSignalSemaphoreInfos = cmd->sigs.elem,
+        .commandBufferInfoCount = 1,
+        .pCommandBufferInfos = &(VkCommandBufferSubmitInfo) {
+            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+            .commandBuffer = cmd->buf,
+        },
+    };
+
+    if (pl_msg_test(vk->log, PL_LOG_TRACE)) {
+        PL_TRACE(vk, "Submitting command %p on queue %p (QF %d):",
+                 (void *) cmd->buf, (void *) cmd->queue, pool->qf);
+        for (int n = 0; n < cmd->deps.num; n++) {
+            PL_TRACE(vk, "    waits on semaphore 0x%"PRIx64" = %"PRIu64,
+                     (uint64_t) cmd->deps.elem[n].semaphore, cmd->deps.elem[n].value);
+        }
+        for (int n = 0; n < cmd->sigs.num; n++) {
+            PL_TRACE(vk, "    signals semaphore 0x%"PRIx64" = %"PRIu64,
+                    (uint64_t) cmd->sigs.elem[n].semaphore, cmd->sigs.elem[n].value);
+        }
+        if (cmd->callbacks.num)
+            PL_TRACE(vk, "    signals %d callbacks", cmd->callbacks.num);
+    }
+    // The queue is locked through the context's lock/unlock hooks while submitting
+    vk->lock_queue(vk->queue_ctx, pool->qf, cmd->qindex);
+    VkResult res = vk_queue_submit2(vk, cmd->queue, &sinfo, VK_NULL_HANDLE);
+    vk->unlock_queue(vk->queue_ctx, pool->qf, cmd->qindex);
+    PL_VK_ASSERT(res, "vkQueueSubmit2");
+    // Submitted: track it as pending until vk_poll_commands() retires it
+    pl_mutex_lock(&vk->lock);
+    PL_ARRAY_APPEND(vk->alloc, vk->cmds_pending, cmd);
+    pl_mutex_unlock(&vk->lock);
+    return true;
+
+error:
+    // Failure: run the callbacks now and put the command back on the free list
+    vk_cmd_reset(cmd);
+    pl_mutex_lock(&vk->lock);
+    PL_ARRAY_APPEND(pool, pool->cmds, cmd);
+    pl_mutex_unlock(&vk->lock);
+    vk->failed = true;
+    return false;
+}
+
+bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout)
+{
+    bool ret = false;
+    pl_mutex_lock(&vk->lock);
+    // Commands are retired strictly in submission order (head of the list)
+    while (vk->cmds_pending.num) {
+        struct vk_cmd *cmd = vk->cmds_pending.elem[0];
+        struct vk_cmdpool *pool = cmd->pool;
+        pl_mutex_unlock(&vk->lock); // don't hold mutex while blocking
+        if (vk_cmd_poll(cmd, timeout) == VK_TIMEOUT)
+            return ret; // the mutex is not held at this point
+        pl_mutex_lock(&vk->lock);
+        if (!vk->cmds_pending.num || vk->cmds_pending.elem[0] != cmd)
+            continue; // another thread modified this state while blocking
+        // Any result other than VK_TIMEOUT (even an error) retires the command
+        PL_TRACE(vk, "VkSemaphore signalled: 0x%"PRIx64" = %"PRIu64,
+                 (uint64_t) cmd->sync.sem, cmd->sync.value);
+        PL_ARRAY_REMOVE_AT(vk->cmds_pending, 0); // remove before callbacks
+        vk_cmd_reset(cmd);
+        PL_ARRAY_APPEND(pool, pool->cmds, cmd);
+        ret = true;
+
+        // If we've successfully spent some time waiting for at least one
+        // command, disable the timeout. This has the dual purpose of both
+        // making sure we don't over-wait due to repeat timeout application,
+        // but also makes sure we don't block on future commands if we've
+        // already spent time waiting for one.
+        timeout = 0;
+    }
+
+    pl_mutex_unlock(&vk->lock);
+    return ret;
+}
+
+void vk_rotate_queues(struct vk_ctx *vk)
+{
+    pl_mutex_lock(&vk->lock);
+
+    // Advance every pool to its next queue (wrapping around), so that
+    // work from successive frames is spread across the available queues
+    for (int n = 0; n < vk->pools.num; n++) {
+        struct vk_cmdpool *p = vk->pools.elem[n];
+        p->idx_queues = (p->idx_queues + 1) % p->num_queues;
+        PL_TRACE(vk, "QF %d: %d/%d", p->qf, p->idx_queues, p->num_queues);
+    }
+    pl_mutex_unlock(&vk->lock);
+}
+
+void vk_wait_idle(struct vk_ctx *vk)
+{
+    do {} while (vk_poll_commands(vk, UINT64_MAX)); // until no more progress
+}
diff --git a/src/vulkan/command.h b/src/vulkan/command.h
new file mode 100644
index 0000000..4c70482
--- /dev/null
+++ b/src/vulkan/command.h
@@ -0,0 +1,142 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include "common.h"
+
+// Since lots of vulkan operations need to be done lazily once the affected
+// resources are no longer in use, provide an abstraction for tracking these.
+// In practice, these are only checked and run when submitting new commands, so
+// the actual execution may be delayed by a frame.
+typedef void (*vk_cb)(void *p, void *arg);
+
+struct vk_callback {
+    vk_cb run;  // function to invoke
+    void *priv; // passed to `run` as its first argument (`p`)
+    void *arg;  // passed to `run` as its second argument (`arg`)
+};
+
+// Associate a callback with the completion of all currently pending commands.
+// This will essentially run once the device is completely idle.
+void vk_dev_callback(struct vk_ctx *vk, vk_cb callback,
+                     const void *priv, const void *arg);
+
+// Helper wrapper around a command buffer that also tracks its dependencies,
+// callbacks and synchronization primitives
+//
+// Thread-safety: Unsafe
+struct vk_cmd {
+    struct vk_cmdpool *pool; // pool it was allocated from
+    pl_vulkan_sem sync;      // semaphore + value signalled by this command
+    VkQueue queue;           // the submission queue (for recording/pending)
+    int qindex;              // the index of `queue` in `pool`
+    VkCommandBuffer buf;     // the command buffer itself
+    // Command dependencies and signals. Not owned by the vk_cmd.
+    PL_ARRAY(VkSemaphoreSubmitInfo) deps;
+    PL_ARRAY(VkSemaphoreSubmitInfo) sigs;
+    // "Callbacks" to fire once a command completes. These are used for
+    // multiple purposes, ranging from resource deallocation to fencing.
+    PL_ARRAY(struct vk_callback) callbacks;
+};
+
+// Associate a callback with the completion of the current command. This
+// function will be run once the command completes, or shortly thereafter.
+void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback,
+                     const void *priv, const void *arg);
+
+// Associate a raw dependency for the current command. This semaphore must
+// signal by the corresponding stage before the command may execute.
+void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep);
+
+// Associate a raw signal with the current command. This semaphore will signal
+// after the given stage completes.
+void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig);
+
+// Compatibility wrappers for vkCmdPipelineBarrier2 (works with pre-1.3)
+void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info);
+
+// Synchronization scope: describes the most recent access of one kind
+struct vk_sync_scope {
+    pl_vulkan_sem sync;         // semaphore of last access
+    VkQueue queue;              // source queue of last access
+    VkPipelineStageFlags2 stage;// stage bitmask of last access
+    VkAccessFlags2 access;      // access type bitmask of last access
+};
+
+// Synchronization primitive: the last read and last write scope of a resource
+struct vk_sem {
+    struct vk_sync_scope read, write;
+};
+
+// Updates the `vk_sem` state for a given access. If `is_trans` is set, this
+// access is treated as a write (since it alters the resource's state).
+//
+// Returns a struct describing the previous access to a resource. A pipeline
+// barrier is only required if the previous access scope is nonzero.
+struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem,
+                                    VkPipelineStageFlags2 stage,
+                                    VkAccessFlags2 access, bool is_trans);
+
+// Command pool / queue family hybrid abstraction
+struct vk_cmdpool {
+    struct vk_ctx *vk;              // context this pool was created for
+    VkQueueFamilyProperties props;  // as passed to vk_cmdpool_create
+    int qf; // queue family index
+    VkCommandPool pool;             // underlying Vulkan command pool
+    VkQueue *queues;                // `num_queues` queue handles
+    int num_queues;
+    int idx_queues;                 // queue index assigned to new commands
+    // Command buffers associated with this pool. These are available for
+    // re-recording
+    PL_ARRAY(struct vk_cmd *) cmds;
+};
+
+// Set up a vk_cmdpool corresponding to a queue family. `qnum` may be less than
+// `props.queueCount`, to restrict the number of queues in this queue family.
+struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum,
+                                     VkQueueFamilyProperties props);
+
+void vk_cmdpool_destroy(struct vk_cmdpool *pool);
+
+// Fetch a command buffer from a command pool and begin recording to it.
+// Returns NULL on failure.
+struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag);
+
+// Finish recording a command buffer and submit it for execution. This function
+// takes over ownership of **cmd, and sets *cmd to NULL in doing so.
+bool vk_cmd_submit(struct vk_cmd **cmd);
+
+// Block until some commands complete executing. This is the only function that
+// actually processes the callbacks. Will wait at most `timeout` nanoseconds
+// for the completion of any command. The timeout may also be passed as 0, in
+// which case this function will not block, but only poll for completed
+// commands. Returns whether any forward progress was made.
+//
+// This does *not* flush any queued commands, forgetting to do so may result
+// in infinite loops if waiting for the completion of callbacks that were
+// never flushed!
+bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout);
+
+// Rotate through queues in each command pool. Call this once per frame, after
+// submitting all of the command buffers for that frame. Calling this more
+// often than that is possible but bad for performance.
+void vk_rotate_queues(struct vk_ctx *vk);
+
+// Wait until all commands are complete, i.e. the device is idle. This is
+// basically equivalent to calling `vk_poll_commands` with a timeout of
+// UINT64_MAX until it returns `false`.
+void vk_wait_idle(struct vk_ctx *vk);
diff --git a/src/vulkan/common.h b/src/vulkan/common.h
new file mode 100644
index 0000000..31b309e
--- /dev/null
+++ b/src/vulkan/common.h
@@ -0,0 +1,234 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#define VK_NO_PROTOTYPES
+#define VK_ENABLE_BETA_EXTENSIONS // for VK_KHR_portability_subset
+#define VK_USE_PLATFORM_METAL_EXT
+
+#include "../common.h"
+#include "../log.h"
+#include "../pl_thread.h"
+
+#include <libplacebo/vulkan.h>
+
+#ifdef PL_HAVE_WIN32
+#include <windows.h>
+#include <vulkan/vulkan_win32.h>
+#endif
+
+// Vulkan allows the optional use of a custom allocator. We don't need one but
+// mark this parameter with a better name in case we ever decide to change this
+// in the future. (And to make the code more readable)
+#define PL_VK_ALLOC NULL
+
+// Type of a vulkan function that needs to be loaded
+#define PL_VK_FUN(name) PFN_vk##name name
+
+// Load a vulkan instance-level extension function directly (on the stack)
+#define PL_VK_LOAD_FUN(inst, name, get_addr) \
+    PL_VK_FUN(name) = (PFN_vk##name) get_addr(inst, "vk" #name);
+
+#ifndef VK_VENDOR_ID_NVIDIA
+#define VK_VENDOR_ID_NVIDIA 0x10DE
+#endif
+
+// Shared struct used to hold vulkan context information
+struct vk_ctx {
+    pl_mutex lock;
+    pl_vulkan vulkan;
+    void *alloc; // host allocations bound to the lifetime of this vk_ctx
+    struct vk_malloc *ma; // VRAM malloc layer
+    pl_vk_inst internal_instance;
+    pl_log log;
+    VkInstance inst;
+    VkPhysicalDevice physd;
+    VkPhysicalDeviceProperties props;
+    VkPhysicalDeviceFeatures2 features;
+    uint32_t api_ver; // device API version
+    VkDevice dev;
+    bool imported; // device was not created by us
+
+    // Generic error flag for catching "failed" devices
+    bool failed;
+
+    // Enabled extensions
+    PL_ARRAY(const char *) exts;
+
+    // Command pools (one per queue family)
+    PL_ARRAY(struct vk_cmdpool *) pools;
+
+    // Pointers into `pools` (always set)
+    struct vk_cmdpool *pool_graphics;
+    struct vk_cmdpool *pool_compute;
+    struct vk_cmdpool *pool_transfer;
+
+    // Queue locking functions
+    PL_ARRAY(PL_ARRAY(pl_mutex)) queue_locks;
+    void (*lock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx);
+    void (*unlock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx);
+    void *queue_ctx;
+
+    // Pending commands. These are shared for the entire vk_ctx to ensure
+    // submission and callbacks are FIFO
+    PL_ARRAY(struct vk_cmd *) cmds_pending; // submitted but not completed
+
+    // Pending callbacks that still need to be drained before processing
+    // callbacks for the next command (in case commands are recursively being
+    // polled from another callback)
+    const struct vk_callback *pending_callbacks;
+    int num_pending_callbacks;
+
+    // Instance-level function pointers
+    PL_VK_FUN(CreateDevice);
+    PL_VK_FUN(EnumerateDeviceExtensionProperties);
+    PL_VK_FUN(GetDeviceProcAddr);
+    PL_VK_FUN(GetInstanceProcAddr);
+    PL_VK_FUN(GetPhysicalDeviceExternalBufferProperties);
+    PL_VK_FUN(GetPhysicalDeviceExternalSemaphoreProperties);
+    PL_VK_FUN(GetPhysicalDeviceFeatures2KHR);
+    PL_VK_FUN(GetPhysicalDeviceFormatProperties);
+    PL_VK_FUN(GetPhysicalDeviceFormatProperties2KHR);
+    PL_VK_FUN(GetPhysicalDeviceImageFormatProperties2KHR);
+    PL_VK_FUN(GetPhysicalDeviceMemoryProperties);
+    PL_VK_FUN(GetPhysicalDeviceProperties);
+    PL_VK_FUN(GetPhysicalDeviceProperties2);
+    PL_VK_FUN(GetPhysicalDeviceQueueFamilyProperties);
+    PL_VK_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR);
+    PL_VK_FUN(GetPhysicalDeviceSurfaceFormatsKHR);
+    PL_VK_FUN(GetPhysicalDeviceSurfacePresentModesKHR);
+    PL_VK_FUN(GetPhysicalDeviceSurfaceSupportKHR);
+
+    // Device-level function pointers
+    PL_VK_FUN(AcquireNextImageKHR);
+    PL_VK_FUN(AllocateCommandBuffers);
+    PL_VK_FUN(AllocateDescriptorSets);
+    PL_VK_FUN(AllocateMemory);
+    PL_VK_FUN(BeginCommandBuffer);
+    PL_VK_FUN(BindBufferMemory);
+    PL_VK_FUN(BindImageMemory);
+    PL_VK_FUN(CmdBeginDebugUtilsLabelEXT);
+    PL_VK_FUN(CmdBeginRenderPass);
+    PL_VK_FUN(CmdBindDescriptorSets);
+    PL_VK_FUN(CmdBindIndexBuffer);
+    PL_VK_FUN(CmdBindPipeline);
+    PL_VK_FUN(CmdBindVertexBuffers);
+    PL_VK_FUN(CmdBlitImage);
+    PL_VK_FUN(CmdClearColorImage);
+    PL_VK_FUN(CmdCopyBuffer);
+    PL_VK_FUN(CmdCopyBufferToImage);
+    PL_VK_FUN(CmdCopyImage);
+    PL_VK_FUN(CmdCopyImageToBuffer);
+    PL_VK_FUN(CmdDispatch);
+    PL_VK_FUN(CmdDraw);
+    PL_VK_FUN(CmdDrawIndexed);
+    PL_VK_FUN(CmdEndDebugUtilsLabelEXT);
+    PL_VK_FUN(CmdEndRenderPass);
+    PL_VK_FUN(CmdPipelineBarrier);
+    PL_VK_FUN(CmdPipelineBarrier2KHR);
+    PL_VK_FUN(CmdPushConstants);
+    PL_VK_FUN(CmdPushDescriptorSetKHR);
+    PL_VK_FUN(CmdResetQueryPool);
+    PL_VK_FUN(CmdSetScissor);
+    PL_VK_FUN(CmdSetViewport);
+    PL_VK_FUN(CmdUpdateBuffer);
+    PL_VK_FUN(CmdWriteTimestamp);
+    PL_VK_FUN(CreateBuffer);
+    PL_VK_FUN(CreateBufferView);
+    PL_VK_FUN(CreateCommandPool);
+    PL_VK_FUN(CreateComputePipelines);
+    PL_VK_FUN(CreateDebugReportCallbackEXT);
+    PL_VK_FUN(CreateDescriptorPool);
+    PL_VK_FUN(CreateDescriptorSetLayout);
+    PL_VK_FUN(CreateFence);
+    PL_VK_FUN(CreateFramebuffer);
+    PL_VK_FUN(CreateGraphicsPipelines);
+    PL_VK_FUN(CreateImage);
+    PL_VK_FUN(CreateImageView);
+    PL_VK_FUN(CreatePipelineCache);
+    PL_VK_FUN(CreatePipelineLayout);
+    PL_VK_FUN(CreateQueryPool);
+    PL_VK_FUN(CreateRenderPass);
+    PL_VK_FUN(CreateSampler);
+    PL_VK_FUN(CreateSemaphore);
+    PL_VK_FUN(CreateShaderModule);
+    PL_VK_FUN(CreateSwapchainKHR);
+    PL_VK_FUN(DestroyBuffer);
+    PL_VK_FUN(DestroyBufferView);
+    PL_VK_FUN(DestroyCommandPool);
+    PL_VK_FUN(DestroyDebugReportCallbackEXT);
+    PL_VK_FUN(DestroyDescriptorPool);
+    PL_VK_FUN(DestroyDescriptorSetLayout);
+    PL_VK_FUN(DestroyDevice);
+    PL_VK_FUN(DestroyFence);
+    PL_VK_FUN(DestroyFramebuffer);
+    PL_VK_FUN(DestroyImage);
+    PL_VK_FUN(DestroyImageView);
+    PL_VK_FUN(DestroyInstance);
+    PL_VK_FUN(DestroyPipeline);
+    PL_VK_FUN(DestroyPipelineCache);
+    PL_VK_FUN(DestroyPipelineLayout);
+    PL_VK_FUN(DestroyQueryPool);
+    PL_VK_FUN(DestroyRenderPass);
+    PL_VK_FUN(DestroySampler);
+    PL_VK_FUN(DestroySemaphore);
+    PL_VK_FUN(DestroyShaderModule);
+    PL_VK_FUN(DestroySwapchainKHR);
+    PL_VK_FUN(DeviceWaitIdle);
+    PL_VK_FUN(EndCommandBuffer);
+    PL_VK_FUN(FlushMappedMemoryRanges);
+    PL_VK_FUN(FreeCommandBuffers);
+    PL_VK_FUN(FreeMemory);
+    PL_VK_FUN(GetBufferMemoryRequirements);
+    PL_VK_FUN(GetDeviceQueue);
+    PL_VK_FUN(GetImageDrmFormatModifierPropertiesEXT);
+    PL_VK_FUN(GetImageMemoryRequirements2);
+    PL_VK_FUN(GetImageSubresourceLayout);
+    PL_VK_FUN(GetMemoryFdKHR);
+    PL_VK_FUN(GetMemoryFdPropertiesKHR);
+    PL_VK_FUN(GetMemoryHostPointerPropertiesEXT);
+    PL_VK_FUN(GetPipelineCacheData);
+    PL_VK_FUN(GetQueryPoolResults);
+    PL_VK_FUN(GetSemaphoreFdKHR);
+    PL_VK_FUN(GetSwapchainImagesKHR);
+    PL_VK_FUN(InvalidateMappedMemoryRanges);
+    PL_VK_FUN(MapMemory);
+    PL_VK_FUN(QueuePresentKHR);
+    PL_VK_FUN(QueueSubmit);
+    PL_VK_FUN(QueueSubmit2KHR);
+    PL_VK_FUN(QueueWaitIdle);
+    PL_VK_FUN(ResetFences);
+    PL_VK_FUN(ResetQueryPool);
+    PL_VK_FUN(SetDebugUtilsObjectNameEXT);
+    PL_VK_FUN(SetHdrMetadataEXT);
+    PL_VK_FUN(UpdateDescriptorSets);
+    PL_VK_FUN(WaitForFences);
+    PL_VK_FUN(WaitSemaphores);
+
+#ifdef PL_HAVE_WIN32
+    PL_VK_FUN(GetMemoryWin32HandleKHR);
+    PL_VK_FUN(GetSemaphoreWin32HandleKHR);
+#endif
+
+#ifdef VK_EXT_metal_objects
+    PL_VK_FUN(ExportMetalObjectsEXT);
+#endif
+#ifdef VK_EXT_full_screen_exclusive
+    PL_VK_FUN(AcquireFullScreenExclusiveModeEXT);
+#endif
+};
diff --git a/src/vulkan/context.c b/src/vulkan/context.c
new file mode 100644
index 0000000..ad8a859
--- /dev/null
+++ b/src/vulkan/context.c
@@ -0,0 +1,1704 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "command.h"
+#include "utils.h"
+#include "gpu.h"
+
+#ifdef PL_HAVE_VK_PROC_ADDR // prototype of the linked-in loader entry point
+VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetInstanceProcAddr(
+    VkInstance                                  instance,
+    const char*                                 pName);
+#endif // PL_HAVE_VK_PROC_ADDR; returned by get_proc_addr_fallback() below
+
+const struct pl_vk_inst_params pl_vk_inst_default_params = {0}; // all-zero fallback handed to PL_DEF() in pl_vk_inst_create()
+
+struct vk_fun { // describes one loadable Vulkan entry point
+    const char *name; // full symbol name, built as "vk" #N by the macros below
+    size_t offset; // byte offset of the matching function pointer in struct vk_ctx
+    bool device_level; // true: load via GetDeviceProcAddr, false: GetInstanceProcAddr
+};
+
+struct vk_ext { // a device extension together with the entry points it provides
+    const char *name; // extension name string
+    const struct vk_fun *funs; // {0}-terminated list; left NULL when nothing needs loading
+};
+
+#define PL_VK_INST_FUN(N)                   \
+    { .name = "vk" #N,                      \
+      .offset = offsetof(struct vk_ctx, N), \
+    } // struct vk_fun initializer for vk<N>; device_level is left false (instance-level)
+
+#define PL_VK_DEV_FUN(N)                    \
+    { .name = "vk" #N,                      \
+      .offset = offsetof(struct vk_ctx, N), \
+      .device_level = true,                 \
+    } // struct vk_fun initializer for vk<N>, flagged as a device-level function
+
+// Table of optional Vulkan instance extensions (enabled only when available)
+static const char *vk_instance_extensions[] = {
+    VK_KHR_SURFACE_EXTENSION_NAME,
+    VK_EXT_SWAPCHAIN_COLOR_SPACE_EXTENSION_NAME,
+    VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME,
+    VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME,
+    VK_KHR_GET_SURFACE_CAPABILITIES_2_EXTENSION_NAME,
+};
+
+// List of mandatory instance-level function pointers, including functions
+// associated with mandatory instance extensions
+static const struct vk_fun vk_inst_funs[] = {
+    PL_VK_INST_FUN(CreateDevice),
+    PL_VK_INST_FUN(EnumerateDeviceExtensionProperties),
+    PL_VK_INST_FUN(GetDeviceProcAddr),
+    PL_VK_INST_FUN(GetPhysicalDeviceExternalBufferProperties),
+    PL_VK_INST_FUN(GetPhysicalDeviceExternalSemaphoreProperties),
+    PL_VK_INST_FUN(GetPhysicalDeviceFeatures2KHR),
+    PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties),
+    PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties2KHR),
+    PL_VK_INST_FUN(GetPhysicalDeviceImageFormatProperties2KHR),
+    PL_VK_INST_FUN(GetPhysicalDeviceMemoryProperties),
+    PL_VK_INST_FUN(GetPhysicalDeviceProperties),
+    PL_VK_INST_FUN(GetPhysicalDeviceProperties2),
+    PL_VK_INST_FUN(GetPhysicalDeviceQueueFamilyProperties),
+
+    // These are not actually mandatory, but they're universal enough that we
+    // just load them unconditionally (for lack of proper support for loading
+    // arbitrary instance extensions). Their use is generally guarded behind
+    // a VkSurfaceKHR value having already been provided by the API user,
+    // which implies that the surface extension is loaded.
+    PL_VK_INST_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR),
+    PL_VK_INST_FUN(GetPhysicalDeviceSurfaceFormatsKHR),
+    PL_VK_INST_FUN(GetPhysicalDeviceSurfacePresentModesKHR),
+    PL_VK_INST_FUN(GetPhysicalDeviceSurfaceSupportKHR),
+};
+
+// Table of vulkan device extensions and functions they load, including
+// functions exported by dependent instance-level extensions
+static const struct vk_ext vk_device_extensions[] = {
+    {
+        .name = VK_KHR_SWAPCHAIN_EXTENSION_NAME, // the one entry absent from pl_vulkan_recommended_extensions
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(AcquireNextImageKHR),
+            PL_VK_DEV_FUN(CreateSwapchainKHR),
+            PL_VK_DEV_FUN(DestroySwapchainKHR),
+            PL_VK_DEV_FUN(GetSwapchainImagesKHR),
+            PL_VK_DEV_FUN(QueuePresentKHR),
+            {0} // all-zero entry terminates each function list
+        },
+    }, {
+        .name = VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(CmdPushDescriptorSetKHR),
+            {0}
+        },
+    }, {
+        .name = VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(GetMemoryFdKHR),
+            {0}
+        },
+    }, {
+        .name = VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(GetMemoryFdPropertiesKHR),
+            {0}
+        },
+#ifdef PL_HAVE_WIN32
+    }, {
+        .name = VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(GetMemoryWin32HandleKHR),
+            {0}
+        },
+#endif
+    }, {
+        .name = VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(GetMemoryHostPointerPropertiesEXT),
+            {0}
+        },
+    }, {
+        .name = VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(GetSemaphoreFdKHR),
+            {0}
+        },
+#ifdef PL_HAVE_WIN32
+    }, {
+        .name = VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(GetSemaphoreWin32HandleKHR),
+            {0}
+        },
+#endif
+    }, {
+        .name = VK_EXT_PCI_BUS_INFO_EXTENSION_NAME, // no entry points listed; .funs stays NULL
+    }, {
+        .name = VK_EXT_HDR_METADATA_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(SetHdrMetadataEXT),
+            {0}
+        },
+    }, {
+        .name = VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(GetImageDrmFormatModifierPropertiesEXT),
+            {0}
+        },
+#ifdef VK_KHR_portability_subset
+    }, {
+        .name = VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, // no entry points listed; .funs stays NULL
+#endif
+#ifdef VK_EXT_metal_objects
+    }, {
+        .name = VK_EXT_METAL_OBJECTS_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(ExportMetalObjectsEXT),
+            {0}
+        },
+#endif
+#ifdef VK_EXT_full_screen_exclusive
+    }, {
+        .name = VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(AcquireFullScreenExclusiveModeEXT),
+            {0}
+        },
+#endif
+    }, {
+        .name = VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME,
+        .funs = (const struct vk_fun[]) {
+            PL_VK_DEV_FUN(CmdPipelineBarrier2KHR),
+            PL_VK_DEV_FUN(QueueSubmit2KHR),
+            {0}
+        },
+    },
+};
+
+// Keep in sync with vk_device_extensions above (minus VK_KHR_swapchain)!
+const char * const pl_vulkan_recommended_extensions[] = {
+    VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME,
+    VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
+    VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME,
+    VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME,
+    VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
+#ifdef PL_HAVE_WIN32
+    VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
+    VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
+#endif
+    VK_EXT_PCI_BUS_INFO_EXTENSION_NAME,
+    VK_EXT_HDR_METADATA_EXTENSION_NAME,
+    VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME,
+#ifdef VK_KHR_portability_subset
+    VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME,
+#endif
+#ifdef VK_EXT_metal_objects
+    VK_EXT_METAL_OBJECTS_EXTENSION_NAME,
+#endif
+#ifdef VK_EXT_full_screen_exclusive
+    VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME,
+#endif
+    VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME,
+};
+
+const int pl_vulkan_num_recommended_extensions =
+    PL_ARRAY_SIZE(pl_vulkan_recommended_extensions); // element count of the list above
+
+// Compile-time size check; +1 because VK_KHR_swapchain is not automatically pulled in
+static_assert(PL_ARRAY_SIZE(pl_vulkan_recommended_extensions) + 1 ==
+              PL_ARRAY_SIZE(vk_device_extensions),
+              "pl_vulkan_recommended_extensions out of sync with "
+              "vk_device_extensions?");
+
+// Recommended features; keep in sync with libavutil vulkan hwcontext
+static const VkPhysicalDeviceVulkan13Features recommended_vk13 = {
+    .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES,
+    .computeFullSubgroups = true,
+    .maintenance4 = true,
+    .shaderZeroInitializeWorkgroupMemory = true,
+    .synchronization2 = true,
+}; // last link of the recommended chain (its pNext is left NULL)
+
+static const VkPhysicalDeviceVulkan12Features recommended_vk12 = {
+    .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
+    .pNext = (void *) &recommended_vk13, // cast drops const to fit the pNext field
+    .bufferDeviceAddress = true,
+    .storagePushConstant8 = true,
+    .shaderInt8 = true,
+    .shaderFloat16 = true,
+    .shaderSharedInt64Atomics = true,
+    .storageBuffer8BitAccess = true,
+    .uniformAndStorageBuffer8BitAccess = true,
+    .vulkanMemoryModel = true,
+    .vulkanMemoryModelDeviceScope = true,
+};
+
+static const VkPhysicalDeviceVulkan11Features recommended_vk11 = {
+    .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES,
+    .pNext = (void *) &recommended_vk12,
+    .samplerYcbcrConversion = true,
+    .storagePushConstant16 = true,
+};
+
+const VkPhysicalDeviceFeatures2 pl_vulkan_recommended_features = {
+    .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
+    .pNext = (void *) &recommended_vk11, // head of the chain: 1.1 -> 1.2 -> 1.3
+    .features = {
+        .shaderImageGatherExtended = true,
+        .shaderStorageImageReadWithoutFormat = true,
+        .shaderStorageImageWriteWithoutFormat = true,
+
+        // Needed for GPU-assisted validation, but not harmful to enable
+        .fragmentStoresAndAtomics = true,
+        .vertexPipelineStoresAndAtomics = true,
+        .shaderInt64 = true,
+    }
+};
+
+// Required features (the same two fields are tested by check_required_features)
+static const VkPhysicalDeviceVulkan12Features required_vk12 = {
+    .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
+    .hostQueryReset = true,
+    .timelineSemaphore = true,
+}; // last link of the required chain (its pNext is left NULL)
+
+static const VkPhysicalDeviceVulkan11Features required_vk11 = {
+    .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES,
+    .pNext = (void *) &required_vk12, // no 1.1 feature is set; this only links the chain
+};
+
+const VkPhysicalDeviceFeatures2 pl_vulkan_required_features = {
+    .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
+    .pNext = (void *) &required_vk11, // head of the chain: 1.1 -> 1.2
+};
+
+// Returns whether all mandatory Vulkan 1.2 features are set in `vk->features`,
+// logging the first one that is missing (or whose feature struct is absent).
+static bool check_required_features(struct vk_ctx *vk)
+{
+    const VkPhysicalDeviceVulkan12Features *vk12 = vk_find_struct(&vk->features,
+        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES);
+
+    #define REQUIRE_VK12(feat) do {                                 \
+        if (!vk12 || !vk12->feat) {                                 \
+            PL_ERR(vk, "Missing device feature: " #feat);           \
+            return false;                                           \
+        }                                                           \
+    } while (0)
+    REQUIRE_VK12(hostQueryReset);
+    REQUIRE_VK12(timelineSemaphore);
+    #undef REQUIRE_VK12
+    return true;
+}
+
+
+// List of mandatory device-level functions
+//
+// Note: Also includes VK_EXT_debug_utils functions, even though they aren't
+// mandatory, simply because we load that extension in a special way.
+static const struct vk_fun vk_dev_funs[] = {
+    PL_VK_DEV_FUN(AllocateCommandBuffers),
+    PL_VK_DEV_FUN(AllocateDescriptorSets),
+    PL_VK_DEV_FUN(AllocateMemory),
+    PL_VK_DEV_FUN(BeginCommandBuffer),
+    PL_VK_DEV_FUN(BindBufferMemory),
+    PL_VK_DEV_FUN(BindImageMemory),
+    PL_VK_DEV_FUN(CmdBeginDebugUtilsLabelEXT),
+    PL_VK_DEV_FUN(CmdBeginRenderPass),
+    PL_VK_DEV_FUN(CmdBindDescriptorSets),
+    PL_VK_DEV_FUN(CmdBindIndexBuffer),
+    PL_VK_DEV_FUN(CmdBindPipeline),
+    PL_VK_DEV_FUN(CmdBindVertexBuffers),
+    PL_VK_DEV_FUN(CmdBlitImage),
+    PL_VK_DEV_FUN(CmdClearColorImage),
+    PL_VK_DEV_FUN(CmdCopyBuffer),
+    PL_VK_DEV_FUN(CmdCopyBufferToImage),
+    PL_VK_DEV_FUN(CmdCopyImage),
+    PL_VK_DEV_FUN(CmdCopyImageToBuffer),
+    PL_VK_DEV_FUN(CmdDispatch),
+    PL_VK_DEV_FUN(CmdDraw),
+    PL_VK_DEV_FUN(CmdDrawIndexed),
+    PL_VK_DEV_FUN(CmdEndDebugUtilsLabelEXT),
+    PL_VK_DEV_FUN(CmdEndRenderPass),
+    PL_VK_DEV_FUN(CmdPipelineBarrier),
+    PL_VK_DEV_FUN(CmdPushConstants),
+    PL_VK_DEV_FUN(CmdResetQueryPool),
+    PL_VK_DEV_FUN(CmdSetScissor),
+    PL_VK_DEV_FUN(CmdSetViewport),
+    PL_VK_DEV_FUN(CmdUpdateBuffer),
+    PL_VK_DEV_FUN(CmdWriteTimestamp),
+    PL_VK_DEV_FUN(CreateBuffer),
+    PL_VK_DEV_FUN(CreateBufferView),
+    PL_VK_DEV_FUN(CreateCommandPool),
+    PL_VK_DEV_FUN(CreateComputePipelines),
+    PL_VK_DEV_FUN(CreateDescriptorPool),
+    PL_VK_DEV_FUN(CreateDescriptorSetLayout),
+    PL_VK_DEV_FUN(CreateFence),
+    PL_VK_DEV_FUN(CreateFramebuffer),
+    PL_VK_DEV_FUN(CreateGraphicsPipelines),
+    PL_VK_DEV_FUN(CreateImage),
+    PL_VK_DEV_FUN(CreateImageView),
+    PL_VK_DEV_FUN(CreatePipelineCache),
+    PL_VK_DEV_FUN(CreatePipelineLayout),
+    PL_VK_DEV_FUN(CreateQueryPool),
+    PL_VK_DEV_FUN(CreateRenderPass),
+    PL_VK_DEV_FUN(CreateSampler),
+    PL_VK_DEV_FUN(CreateSemaphore),
+    PL_VK_DEV_FUN(CreateShaderModule),
+    PL_VK_DEV_FUN(DestroyBuffer),
+    PL_VK_DEV_FUN(DestroyBufferView),
+    PL_VK_DEV_FUN(DestroyCommandPool),
+    PL_VK_DEV_FUN(DestroyDescriptorPool),
+    PL_VK_DEV_FUN(DestroyDescriptorSetLayout),
+    PL_VK_DEV_FUN(DestroyDevice),
+    PL_VK_DEV_FUN(DestroyFence),
+    PL_VK_DEV_FUN(DestroyFramebuffer),
+    PL_VK_DEV_FUN(DestroyImage),
+    PL_VK_DEV_FUN(DestroyImageView),
+    PL_VK_DEV_FUN(DestroyInstance), // NOTE(review): takes a VkInstance yet is loaded as device-level; confirm this is intended
+    PL_VK_DEV_FUN(DestroyPipeline),
+    PL_VK_DEV_FUN(DestroyPipelineCache),
+    PL_VK_DEV_FUN(DestroyPipelineLayout),
+    PL_VK_DEV_FUN(DestroyQueryPool),
+    PL_VK_DEV_FUN(DestroyRenderPass),
+    PL_VK_DEV_FUN(DestroySampler),
+    PL_VK_DEV_FUN(DestroySemaphore),
+    PL_VK_DEV_FUN(DestroyShaderModule),
+    PL_VK_DEV_FUN(DeviceWaitIdle),
+    PL_VK_DEV_FUN(EndCommandBuffer),
+    PL_VK_DEV_FUN(FlushMappedMemoryRanges),
+    PL_VK_DEV_FUN(FreeCommandBuffers),
+    PL_VK_DEV_FUN(FreeMemory),
+    PL_VK_DEV_FUN(GetBufferMemoryRequirements),
+    PL_VK_DEV_FUN(GetDeviceQueue),
+    PL_VK_DEV_FUN(GetImageMemoryRequirements2),
+    PL_VK_DEV_FUN(GetImageSubresourceLayout),
+    PL_VK_DEV_FUN(GetPipelineCacheData),
+    PL_VK_DEV_FUN(GetQueryPoolResults),
+    PL_VK_DEV_FUN(InvalidateMappedMemoryRanges),
+    PL_VK_DEV_FUN(MapMemory),
+    PL_VK_DEV_FUN(QueueSubmit),
+    PL_VK_DEV_FUN(QueueWaitIdle),
+    PL_VK_DEV_FUN(ResetFences),
+    PL_VK_DEV_FUN(ResetQueryPool),
+    PL_VK_DEV_FUN(SetDebugUtilsObjectNameEXT),
+    PL_VK_DEV_FUN(UpdateDescriptorSets),
+    PL_VK_DEV_FUN(WaitForFences),
+    PL_VK_DEV_FUN(WaitSemaphores),
+};
+
+// Resolves `fun->name` and stores the result in the function pointer found
+// `fun->offset` bytes into `vk`, using vk->GetDeviceProcAddr for device-level
+// functions and vk->GetInstanceProcAddr otherwise. If the lookup fails, it is
+// retried once without a trailing "KHR"/"EXT"; the pointer may still end up NULL.
+static void load_vk_fun(struct vk_ctx *vk, const struct vk_fun *fun)
+{
+    PFN_vkVoidFunction *pfn = (void *) ((uintptr_t) vk + (ptrdiff_t) fun->offset);
+
+    if (fun->device_level) {
+        *pfn = vk->GetDeviceProcAddr(vk->dev, fun->name);
+    } else {
+        *pfn = vk->GetInstanceProcAddr(vk->inst, fun->name);
+    }
+    if (*pfn)
+        return;
+
+    // Some functions get their extension suffix stripped when promoted
+    // to core. As a very simple work-around to this, try loading the
+    // function a second time with the reserved suffix stripped.
+    static const char *ext_suffixes[] = { "KHR", "EXT" };
+    pl_str fun_name = pl_str0(fun->name);
+    char buf[64];
+
+    for (size_t i = 0; i < PL_ARRAY_SIZE(ext_suffixes); i++) {
+        if (!pl_str_eatend0(&fun_name, ext_suffixes[i]))
+            continue;
+        pl_assert(sizeof(buf) > fun_name.len);
+        snprintf(buf, sizeof(buf), "%.*s", PL_STR_FMT(fun_name));
+        *pfn = fun->device_level ? vk->GetDeviceProcAddr(vk->dev, buf)
+                                 : vk->GetInstanceProcAddr(vk->inst, buf);
+        return;
+    }
+}
+
+// Private struct for pl_vk_inst
+struct priv {
+    VkDebugUtilsMessengerEXT debug_utils_cb; // if set, destroyed again by pl_vk_inst_destroy()
+};
+
+// Destroys the debug messenger (if one was created) and the VkInstance itself,
+// then releases `*inst_ptr` with pl_free_ptr(). Does nothing if it is NULL.
+void pl_vk_inst_destroy(pl_vk_inst *inst_ptr)
+{
+    pl_vk_inst inst = *inst_ptr;
+    if (inst == NULL)
+        return;
+    const struct priv *state = PL_PRIV(inst);
+    if (state->debug_utils_cb) {
+        PL_VK_LOAD_FUN(inst->instance, DestroyDebugUtilsMessengerEXT, inst->get_proc_addr);
+        DestroyDebugUtilsMessengerEXT(inst->instance, state->debug_utils_cb, PL_VK_ALLOC);
+    }
+    PL_VK_LOAD_FUN(inst->instance, DestroyInstance, inst->get_proc_addr);
+    DestroyInstance(inst->instance, PL_VK_ALLOC);
+    pl_free_ptr((void **) inst_ptr);
+}
+
+// Debug messenger callback: forwards validation layer messages to the pl_log
+// passed as `priv`, skipping known false positives. Returns true (asking for
+// the offending call to be aborted) only for validation errors, after logging
+// a stack trace and calling pl_debug_abort(); returns false otherwise.
+static VkBool32 VKAPI_PTR vk_dbg_utils_cb(VkDebugUtilsMessageSeverityFlagBitsEXT sev,
+                                          VkDebugUtilsMessageTypeFlagsEXT msgType,
+                                          const VkDebugUtilsMessengerCallbackDataEXT *data,
+                                          void *priv)
+{
+    pl_log log = priv;
+
+    // Ignore errors for messages that we consider false positives. The IDs are
+    // matched as unsigned, since several of the constants exceed INT32_MAX.
+    switch ((uint32_t) data->messageIdNumber) {
+    case 0x7cd0911d: // VUID-VkSwapchainCreateInfoKHR-imageExtent-01274
+    case 0x8928392f: // UNASSIGNED-BestPractices-NonSuccess-Result
+    case 0xdc18ad6b: // UNASSIGNED-BestPractices-vkAllocateMemory-small-allocation
+    case 0xb3d4346b: // UNASSIGNED-BestPractices-vkBindMemory-small-dedicated-allocation
+    case 0x6cfe18a5: // UNASSIGNED-BestPractices-SemaphoreCount
+    case 0x48a09f6c: // UNASSIGNED-BestPractices-pipeline-stage-flags
+    // profile chain expectations
+    case 0x30f4ac70: // VUID-VkImageCreateInfo-pNext-06811
+        return false;
+    case 0x5f379b89: // UNASSIGNED-BestPractices-Error-Result
+        if (strstr(data->pMessage, "VK_ERROR_FORMAT_NOT_SUPPORTED"))
+            return false;
+        break;
+    case 0xf6a37cfa: // VUID-vkGetImageSubresourceLayout-format-04461
+        // Work around https://github.com/KhronosGroup/Vulkan-Docs/issues/2109
+        return false;
+    }
+
+    enum pl_log_level lev;
+    switch (sev) {
+    case VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT:     lev = PL_LOG_ERR;   break;
+    case VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT:   lev = PL_LOG_WARN;  break;
+    case VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT:      lev = PL_LOG_DEBUG; break;
+    case VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT:   lev = PL_LOG_TRACE; break;
+    default:                                                lev = PL_LOG_INFO;  break;
+    }
+
+    pl_msg(log, lev, "vk %s", data->pMessage);
+
+    for (uint32_t i = 0; i < data->queueLabelCount; i++)
+        pl_msg(log, lev, "    during %s", data->pQueueLabels[i].pLabelName);
+    for (uint32_t i = 0; i < data->cmdBufLabelCount; i++)
+        pl_msg(log, lev, "    inside %s", data->pCmdBufLabels[i].pLabelName);
+    for (uint32_t i = 0; i < data->objectCount; i++) {
+        const VkDebugUtilsObjectNameInfoEXT *obj = &data->pObjects[i];
+        pl_msg(log, lev, "    using %s: %s (0x%llx)",
+               vk_obj_type(obj->objectType),
+               obj->pObjectName ? obj->pObjectName : "anon",
+               (unsigned long long) obj->objectHandle);
+    }
+
+    // The return value of this function determines whether the call will
+    // be explicitly aborted (to prevent GPU errors) or not. In this case,
+    // we generally want this to be on for the validation errors, but nothing
+    // else (e.g. performance warnings)
+    bool is_error = (sev & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) &&
+                    (msgType & VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT);
+    if (!is_error)
+        return false;
+    pl_log_stack_trace(log, lev);
+    pl_debug_abort();
+    return true;
+}
+
+// Returns `get_proc_addr` if non-NULL, else the linked-in vkGetInstanceProcAddr.
+// Without PL_HAVE_VK_PROC_ADDR there is no fallback: logs fatal and returns NULL.
+static PFN_vkGetInstanceProcAddr get_proc_addr_fallback(pl_log log,
+                                    PFN_vkGetInstanceProcAddr get_proc_addr)
+{
+#ifdef PL_HAVE_VK_PROC_ADDR
+    return get_proc_addr ? get_proc_addr : vkGetInstanceProcAddr;
+#else
+    if (!get_proc_addr)
+        pl_fatal(log, "No `vkGetInstanceProcAddr` function provided, and "
+                 "libplacebo built without linking against this function!");
+    return get_proc_addr;
+#endif
+}
+
+#define PRINTF_VER(ver) \
+    (int) VK_API_VERSION_MAJOR(ver), \
+    (int) VK_API_VERSION_MINOR(ver), \
+    (int) VK_API_VERSION_PATCH(ver) // expands to three int arguments for a "%d.%d.%d" format
+
+pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params)
+{
+    void *tmp = pl_tmp(NULL);
+    params = PL_DEF(params, &pl_vk_inst_default_params);
+    VkInstance inst = NULL;
+    pl_clock_t start;
+
+    PL_ARRAY(const char *) exts = {0};
+
+    PFN_vkGetInstanceProcAddr get_addr;
+    if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr)))
+        goto error;
+
+    // Query instance version support
+    uint32_t api_ver = VK_API_VERSION_1_0;
+    PL_VK_LOAD_FUN(NULL, EnumerateInstanceVersion, get_addr);
+    if (EnumerateInstanceVersion && EnumerateInstanceVersion(&api_ver) != VK_SUCCESS)
+        goto error;
+
+    pl_debug(log, "Available instance version: %d.%d.%d", PRINTF_VER(api_ver));
+
+    if (params->max_api_version) {
+        api_ver = PL_MIN(api_ver, params->max_api_version);
+        pl_info(log, "Restricting API version to %d.%d.%d... new version %d.%d.%d",
+                PRINTF_VER(params->max_api_version), PRINTF_VER(api_ver));
+    }
+
+    if (api_ver < PL_VK_MIN_VERSION) {
+        pl_fatal(log, "Instance API version %d.%d.%d is lower than the minimum "
+                 "required version of %d.%d.%d, cannot proceed!",
+                 PRINTF_VER(api_ver), PRINTF_VER(PL_VK_MIN_VERSION));
+        goto error;
+    }
+
+    VkInstanceCreateInfo info = {
+        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
+        .pApplicationInfo = &(VkApplicationInfo) {
+            .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
+            .apiVersion = api_ver,
+        },
+    };
+
+    // Enumerate all supported layers
+    start = pl_clock_now();
+    PL_VK_LOAD_FUN(NULL, EnumerateInstanceLayerProperties, get_addr);
+    uint32_t num_layers_avail = 0;
+    EnumerateInstanceLayerProperties(&num_layers_avail, NULL);
+    VkLayerProperties *layers_avail = pl_calloc_ptr(tmp, num_layers_avail, layers_avail);
+    EnumerateInstanceLayerProperties(&num_layers_avail, layers_avail);
+    pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance layers");
+
+    pl_debug(log, "Available layers:");
+    for (int i = 0; i < num_layers_avail; i++) {
+        pl_debug(log, "    %s (v%d.%d.%d)", layers_avail[i].layerName,
+                 PRINTF_VER(layers_avail[i].specVersion));
+    }
+
+    PL_ARRAY(const char *) layers = {0};
+
+    // Sorted by priority
+    static const char *debug_layers[] = {
+        "VK_LAYER_KHRONOS_validation",
+        "VK_LAYER_LUNARG_standard_validation",
+    };
+
+    // This layer has to be initialized first, otherwise all sorts of weirdness
+    // happens (random segfaults, yum)
+    bool debug = params->debug;
+    uint32_t debug_layer = 0; // layer idx of debug layer
+    uint32_t debug_layer_version = 0;
+    if (debug) {
+        for (int i = 0; i < PL_ARRAY_SIZE(debug_layers); i++) {
+            for (int n = 0; n < num_layers_avail; n++) {
+                if (strcmp(debug_layers[i], layers_avail[n].layerName) != 0)
+                    continue;
+
+                debug_layer = n;
+                debug_layer_version = layers_avail[n].specVersion;
+                pl_info(log, "Enabling debug meta layer: %s (v%d.%d.%d)",
+                        debug_layers[i], PRINTF_VER(debug_layer_version));
+                PL_ARRAY_APPEND(tmp, layers, debug_layers[i]);
+                goto debug_layers_done;
+            }
+        }
+
+        // No layer found..
+        pl_warn(log, "API debugging requested but no debug meta layers present... ignoring");
+        debug = false;
+    }
+
+debug_layers_done: ;
+
+    for (int i = 0; i < params->num_layers; i++)
+        PL_ARRAY_APPEND(tmp, layers, params->layers[i]);
+
+    for (int i = 0; i < params->num_opt_layers; i++) {
+        const char *layer = params->opt_layers[i];
+        for (int n = 0; n < num_layers_avail; n++) {
+            if (strcmp(layer, layers_avail[n].layerName) == 0) {
+                PL_ARRAY_APPEND(tmp, layers, layer);
+                break;
+            }
+        }
+    }
+
+    // Enumerate all supported extensions
+    start = pl_clock_now();
+    PL_VK_LOAD_FUN(NULL, EnumerateInstanceExtensionProperties, get_addr);
+    uint32_t num_exts_avail = 0;
+    EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, NULL);
+    VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail);
+    EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, exts_avail);
+
+    struct {
+        VkExtensionProperties *exts;
+        uint32_t num_exts;
+    } *layer_exts = pl_calloc_ptr(tmp, num_layers_avail, layer_exts);
+
+    // Enumerate extensions from layers
+    for (int i = 0; i < num_layers_avail; i++) {
+        VkExtensionProperties **lexts = &layer_exts[i].exts;
+        uint32_t *num = &layer_exts[i].num_exts;
+
+        EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, NULL);
+        *lexts = pl_calloc_ptr(tmp, *num, *lexts);
+        EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, *lexts);
+
+        // Replace all extensions that are already available globally by {0}
+        for (int j = 0; j < *num; j++) {
+            for (int k = 0; k < num_exts_avail; k++) {
+                if (strcmp((*lexts)[j].extensionName, exts_avail[k].extensionName) == 0)
+                    (*lexts)[j] = (VkExtensionProperties) {0};
+            }
+        }
+    }
+
+    pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance extensions");
+    pl_debug(log, "Available instance extensions:");
+    for (int i = 0; i < num_exts_avail; i++)
+        pl_debug(log, "    %s", exts_avail[i].extensionName);
+    for (int i = 0; i < num_layers_avail; i++) {
+        for (int j = 0; j < layer_exts[i].num_exts; j++) {
+            if (!layer_exts[i].exts[j].extensionName[0])
+                continue;
+
+            pl_debug(log, "    %s (via %s)",
+                     layer_exts[i].exts[j].extensionName,
+                     layers_avail[i].layerName);
+        }
+    }
+
+    // Add mandatory extensions
+    PL_ARRAY_APPEND(tmp, exts, VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
+
+    // Add optional extensions
+    for (int i = 0; i < PL_ARRAY_SIZE(vk_instance_extensions); i++) {
+        const char *ext = vk_instance_extensions[i];
+        for (int n = 0; n < num_exts_avail; n++) {
+            if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(tmp, exts, ext);
+                break;
+            }
+        }
+    }
+
+#ifdef VK_KHR_portability_enumeration
+    // Required for macOS ( MoltenVK ) compatibility
+    for (int n = 0; n < num_exts_avail; n++) {
+        if (strcmp(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME, exts_avail[n].extensionName) == 0) {
+            PL_ARRAY_APPEND(tmp, exts, VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME);
+            info.flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR;
+            break;
+        }
+    }
+#endif
+
+    // Add extra user extensions
+    for (int i = 0; i < params->num_extensions; i++) {
+        const char *ext = params->extensions[i];
+        PL_ARRAY_APPEND(tmp, exts, ext);
+
+        // Enable any additional layers that are required for this extension
+        for (int n = 0; n < num_layers_avail; n++) {
+            for (int j = 0; j < layer_exts[n].num_exts; j++) {
+                if (!layer_exts[n].exts[j].extensionName[0])
+                    continue;
+                if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) {
+                    PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName);
+                    goto next_user_ext;
+                }
+            }
+        }
+
+next_user_ext: ;
+    }
+
+    // Add extra optional user extensions
+    for (int i = 0; i < params->num_opt_extensions; i++) {
+        const char *ext = params->opt_extensions[i];
+        for (int n = 0; n < num_exts_avail; n++) {
+            if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(tmp, exts, ext);
+                goto next_opt_user_ext;
+            }
+        }
+
+        for (int n = 0; n < num_layers_avail; n++) {
+            for (int j = 0; j < layer_exts[n].num_exts; j++) {
+                if (!layer_exts[n].exts[j].extensionName[0])
+                    continue;
+                if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) {
+                    PL_ARRAY_APPEND(tmp, exts, ext);
+                    PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName);
+                    goto next_opt_user_ext;
+                }
+            }
+        }
+
+next_opt_user_ext: ;
+    }
+
+    // If debugging is enabled, load the necessary debug utils extension
+    if (debug) {
+        const char * const ext = VK_EXT_DEBUG_UTILS_EXTENSION_NAME;
+        for (int n = 0; n < num_exts_avail; n++) {
+            if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(tmp, exts, ext);
+                goto debug_ext_done;
+            }
+        }
+
+        for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) {
+            if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(tmp, exts, ext);
+                goto debug_ext_done;
+            }
+        }
+
+        // No extension found
+        pl_warn(log, "API debug layers enabled but no debug report extension "
+                "found... ignoring. Debug messages may be spilling to "
+                "stdout/stderr!");
+        debug = false;
+    }
+
+debug_ext_done: ;
+
+    // Limit this to 1.3.250+ because of bugs in older versions.
+    if (debug && params->debug_extra &&
+        debug_layer_version >= VK_MAKE_API_VERSION(0, 1, 3, 259))
+    {
+        // Try enabling as many validation features as possible
+        static const VkValidationFeatureEnableEXT validation_features[] = {
+            VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT,
+            VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT,
+            VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT,
+            VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT,
+        };
+
+        static const VkValidationFeaturesEXT vinfo = {
+            .sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT,
+            .pEnabledValidationFeatures = validation_features,
+            .enabledValidationFeatureCount = PL_ARRAY_SIZE(validation_features),
+        };
+
+        const char * const ext = VK_EXT_VALIDATION_FEATURES_EXTENSION_NAME;
+        for (int n = 0; n < num_exts_avail; n++) {
+            if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(tmp, exts, ext);
+                vk_link_struct(&info, &vinfo);
+                goto debug_extra_ext_done;
+            }
+        }
+
+        for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) {
+            if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(tmp, exts, ext);
+                vk_link_struct(&info, &vinfo);
+                goto debug_extra_ext_done;
+            }
+        }
+
+        pl_warn(log, "GPU-assisted validation enabled but not supported by "
+                "instance, disabling...");
+    }
+
+debug_extra_ext_done: ;
+
+    info.ppEnabledExtensionNames = exts.elem;
+    info.enabledExtensionCount = exts.num;
+    info.ppEnabledLayerNames = layers.elem;
+    info.enabledLayerCount = layers.num;
+
+    pl_info(log, "Creating vulkan instance%s", exts.num ? " with extensions:" : "");
+    for (int i = 0; i < exts.num; i++)
+        pl_info(log, "    %s", exts.elem[i]);
+
+    if (layers.num) {
+        pl_info(log, "  and layers:");
+        for (int i = 0; i < layers.num; i++)
+            pl_info(log, "    %s", layers.elem[i]);
+    }
+
+    start = pl_clock_now();
+    PL_VK_LOAD_FUN(NULL, CreateInstance, get_addr);
+    VkResult res = CreateInstance(&info, PL_VK_ALLOC, &inst);
+    pl_log_cpu_time(log, start, pl_clock_now(), "creating vulkan instance");
+    if (res != VK_SUCCESS) {
+        pl_fatal(log, "Failed creating instance: %s", vk_res_str(res));
+        goto error;
+    }
+
+    struct pl_vk_inst_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct priv);
+    struct priv *p = PL_PRIV(pl_vk);
+    *pl_vk = (struct pl_vk_inst_t) {
+        .instance = inst,
+        .api_version = api_ver,
+        .get_proc_addr = get_addr,
+        .extensions = pl_steal(pl_vk, exts.elem),
+        .num_extensions = exts.num,
+        .layers = pl_steal(pl_vk, layers.elem),
+        .num_layers = layers.num,
+    };
+
+    // Set up a debug callback to catch validation messages
+    if (debug) {
+        VkDebugUtilsMessengerCreateInfoEXT dinfo = {
+            .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT,
+            .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT |
+                               VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT |
+                               VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT |
+                               VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT,
+            .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT |
+                           VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT |
+                           VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT,
+            .pfnUserCallback = vk_dbg_utils_cb,
+            .pUserData = (void *) log,
+        };
+
+        PL_VK_LOAD_FUN(inst, CreateDebugUtilsMessengerEXT, get_addr);
+        CreateDebugUtilsMessengerEXT(inst, &dinfo, PL_VK_ALLOC, &p->debug_utils_cb);
+    }
+
+    pl_free(tmp);
+    return pl_vk;
+
+error:
+    pl_fatal(log, "Failed initializing vulkan instance");
+    if (inst) {
+        PL_VK_LOAD_FUN(inst, DestroyInstance, get_addr);
+        DestroyInstance(inst, PL_VK_ALLOC);
+    }
+    pl_free(tmp);
+    return NULL;
+}
+/*
+ * Default `pl_vulkan_params`, expanded from PL_VULKAN_DEFAULTS.
+ * pl_vulkan_create() substitutes this (through PL_DEF) for its `params`
+ * argument - presumably whenever that argument is NULL.
+ */
+const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS };
+
+// Tears down a `pl_vulkan` and releases `*pl_vk` through `pl_free_ptr`.
+// Does nothing if `*pl_vk` is NULL. A device flagged as `imported` is left
+// alive; only a device that is not imported gets destroyed here.
+void pl_vulkan_destroy(pl_vulkan *pl_vk)
+{
+    pl_vulkan pub = *pl_vk;
+    if (!pub)
+        return;
+
+    struct vk_ctx *vk = PL_PRIV(pub);
+    if (vk->dev) {
+        if (pub->gpu) {
+            // Let all in-flight work finish before the GPU object goes away
+            PL_DEBUG(vk, "Waiting for remaining commands...");
+            pl_gpu_finish(pub->gpu);
+            pl_assert(vk->cmds_pending.num == 0);
+
+            pl_gpu_destroy(pub->gpu);
+        }
+
+        vk_malloc_destroy(&vk->ma);
+        for (int p = 0; p < vk->pools.num; p++)
+            vk_cmdpool_destroy(vk->pools.elem[p]);
+
+        // The VkDevice is only ours to destroy if it was not imported
+        if (!vk->imported)
+            vk->DestroyDevice(vk->dev, PL_VK_ALLOC);
+    }
+
+    // Destroy the per-queue mutexes of every queue family (may be none)
+    for (int qf = 0; qf < vk->queue_locks.num; qf++) {
+        pl_mutex *locks = vk->queue_locks.elem[qf].elem;
+        for (int q = 0; q < vk->queue_locks.elem[qf].num; q++)
+            pl_mutex_destroy(&locks[q]);
+    }
+
+    pl_vk_inst_destroy(&vk->internal_instance);
+    pl_mutex_destroy(&vk->lock);
+    pl_free_ptr((void **) pl_vk);
+}
+
+// Reports whether at least one queue family of `physd` can present to `surf`.
+// Returns false when none can; a failing VK() query also ends up at the
+// `error` label and yields false.
+static bool supports_surf(pl_log log, VkInstance inst,
+                          PFN_vkGetInstanceProcAddr get_addr,
+                          VkPhysicalDevice physd, VkSurfaceKHR surf)
+{
+    // Minimal stand-in context exposing only `log`, so the VK macro can log
+    struct { pl_log log; } *vk = (void *) &log;
+
+    PL_VK_LOAD_FUN(inst, GetPhysicalDeviceQueueFamilyProperties, get_addr);
+    PL_VK_LOAD_FUN(inst, GetPhysicalDeviceSurfaceSupportKHR, get_addr);
+
+    // Only the number of queue families is needed, not their properties
+    uint32_t num_families = 0;
+    GetPhysicalDeviceQueueFamilyProperties(physd, &num_families, NULL);
+
+    int family = 0;
+    while (family < num_families) {
+        VkBool32 can_present = false;
+        VK(GetPhysicalDeviceSurfaceSupportKHR(physd, family, surf, &can_present));
+        if (can_present)
+            return true;
+        family++;
+    }
+
+error:
+    return false;
+}
+/*
+ * Enumerates the physical devices of `params->instance`, logs each of them,
+ * and picks one.
+ *
+ * Selection rules, applied per device in enumeration order:
+ *  - If `params->surface` is set, devices for which supports_surf() fails
+ *    are skipped.
+ *  - If `params->device_uuid` is not all-zero, only a device with exactly
+ *    that UUID is accepted; otherwise, if `params->device_name` is non-empty,
+ *    only a device with exactly that name is accepted. Such an explicit match
+ *    is taken as-is, without the software / API version checks.
+ *  - Otherwise, software (CPU) devices are skipped unless
+ *    `params->allow_software` is set, devices reporting an API version below
+ *    PL_VK_MIN_VERSION are skipped, and the device with the highest type
+ *    priority (see `types`) is kept.
+ *
+ * Returns the selected device, or a null handle if no loader entry point
+ * could be obtained or no device qualified.
+ */
+VkPhysicalDevice pl_vulkan_choose_device(pl_log log,
+                                const struct pl_vulkan_device_params *params)
+{
+    // Stand-in context exposing only `log`, so the PL_*/VK logging macros work
+    struct { pl_log log; } *vk = (void *) &log;
+    PL_INFO(vk, "Probing for vulkan devices:");
+
+    pl_assert(params->instance);
+    VkInstance inst = params->instance;
+    VkPhysicalDevice dev = VK_NULL_HANDLE; // result; stays null if nothing qualifies
+
+    PFN_vkGetInstanceProcAddr get_addr;
+    if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr)))
+        return NULL;
+
+    PL_VK_LOAD_FUN(inst, EnumeratePhysicalDevices, get_addr);
+    PL_VK_LOAD_FUN(inst, GetPhysicalDeviceProperties2, get_addr);
+    pl_assert(GetPhysicalDeviceProperties2);
+
+    pl_clock_t start = pl_clock_now();
+    VkPhysicalDevice *devices = NULL;
+    uint32_t num = 0;
+    VK(EnumeratePhysicalDevices(inst, &num, NULL)); // first call: query the count
+    devices = pl_calloc_ptr(NULL, num, devices);
+    VK(EnumeratePhysicalDevices(inst, &num, devices)); // second call: fill the array
+    pl_log_cpu_time(log, start, pl_clock_now(), "enumerating physical devices");
+
+    static const struct { const char *name; int priority; } types[] = { // indexed by VkPhysicalDeviceType
+        [VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU]   = {"discrete",   5},
+        [VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU] = {"integrated", 4},
+        [VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU]    = {"virtual",    3},
+        [VK_PHYSICAL_DEVICE_TYPE_CPU]            = {"software",   2},
+        [VK_PHYSICAL_DEVICE_TYPE_OTHER]          = {"other",      1},
+    };
+
+    static const uint8_t nil[VK_UUID_SIZE] = {0};
+    bool uuid_set = memcmp(params->device_uuid, nil, VK_UUID_SIZE) != 0; // all-zero UUID means "not requested"
+
+    int best = -1; // highest type priority accepted so far
+    for (int i = 0; i < num; i++) {
+        VkPhysicalDeviceIDPropertiesKHR id_props = {
+            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+        };
+
+        VkPhysicalDeviceProperties2 prop = {
+            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+            .pNext = &id_props, // chained so the same query also fills in the UUID
+        };
+
+        GetPhysicalDeviceProperties2(devices[i], &prop);
+        VkPhysicalDeviceType t = prop.properties.deviceType;
+        const char *dtype = t < PL_ARRAY_SIZE(types) ? types[t].name : "unknown?";
+        PL_INFO(vk, "    GPU %d: %s v%d.%d.%d (%s)", i, prop.properties.deviceName,
+                PRINTF_VER(prop.properties.apiVersion), dtype);
+        PL_INFO(vk, "           uuid: %s", PRINT_UUID(id_props.deviceUUID));
+
+        if (params->surface) {
+            if (!supports_surf(log, inst, get_addr, devices[i], params->surface)) {
+                PL_DEBUG(vk, "      -> excluding due to lack of surface support");
+                continue;
+            }
+        }
+
+        if (uuid_set) {
+            if (memcmp(id_props.deviceUUID, params->device_uuid, VK_UUID_SIZE) == 0) {
+                dev = devices[i]; // explicit match: skips the software/version checks below
+                continue;
+            } else {
+                PL_DEBUG(vk, "     -> excluding due to UUID mismatch");
+                continue;
+            }
+        } else if (params->device_name && params->device_name[0] != '\0') {
+            if (strcmp(params->device_name, prop.properties.deviceName) == 0) {
+                dev = devices[i]; // explicit match: skips the software/version checks below
+                continue;
+            } else {
+                PL_DEBUG(vk, "      -> excluding due to name mismatch");
+                continue;
+            }
+        }
+
+        if (!params->allow_software && t == VK_PHYSICAL_DEVICE_TYPE_CPU) {
+            PL_DEBUG(vk, "      -> excluding due to !params->allow_software");
+            continue;
+        }
+
+        if (prop.properties.apiVersion < PL_VK_MIN_VERSION) {
+            PL_DEBUG(vk, "      -> excluding due to too low API version");
+            continue;
+        }
+
+        int priority = t < PL_ARRAY_SIZE(types) ? types[t].priority : 0; // unknown types rank lowest
+        if (priority > best) { // strict comparison: on ties the earlier device is kept
+            dev = devices[i];
+            best = priority;
+        }
+    }
+
+error:
+    pl_free(devices);
+    return dev;
+}
+
+// Built-in `lock_queue` callback: `priv` is the `vk_ctx`, and the mutex that
+// belongs to queue `qidx` of queue family `qf` is locked.
+static void lock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+    struct vk_ctx *ctx = priv;
+    pl_mutex *mutex = &ctx->queue_locks.elem[qf].elem[qidx];
+    pl_mutex_lock(mutex);
+}
+
+// Built-in `unlock_queue` callback: `priv` is the `vk_ctx`, and the mutex that
+// belongs to queue `qidx` of queue family `qf` is unlocked.
+static void unlock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+    struct vk_ctx *ctx = priv;
+    pl_mutex *mutex = &ctx->queue_locks.elem[qf].elem[qidx];
+    pl_mutex_unlock(mutex);
+}
+
+// Allocates and initializes one mutex per queue of each of the `qfnum` queue
+// families described by `qfs`, then installs the built-in lock/unlock
+// callbacks with `vk` itself as their context.
+static void init_queue_locks(struct vk_ctx *vk, uint32_t qfnum,
+                             const VkQueueFamilyProperties *qfs)
+{
+    vk->queue_locks.num = qfnum;
+    vk->queue_locks.elem = pl_calloc_ptr(vk->alloc, qfnum, vk->queue_locks.elem);
+
+    for (int qf = 0; qf < qfnum; qf++) {
+        const uint32_t count = qfs[qf].queueCount;
+        pl_mutex *mutexes = pl_calloc(vk->alloc, count, sizeof(pl_mutex));
+        for (int q = 0; q < count; q++)
+            pl_mutex_init(&mutexes[q]);
+
+        vk->queue_locks.elem[qf].elem = mutexes;
+        vk->queue_locks.elem[qf].num = count;
+    }
+
+    vk->queue_ctx = vk;
+    vk->lock_queue = lock_queue_internal;
+    vk->unlock_queue = unlock_queue_internal;
+}
+
+// Find the most specialized queue family supporting a combination of flags.
+// When several families have identical flags, the one with the most queues
+// is chosen. Returns -1 if no queue family supports all of `flags`.
+static int find_qf(VkQueueFamilyProperties *qfs, int qfnum, VkQueueFlags flags)
+{
+    // Only these capability bits take part in the specialization ranking;
+    // other bits (like SPARSE_BIT) are not of interest and are masked out.
+    const VkQueueFlags mask = VK_QUEUE_GRAPHICS_BIT |
+                              VK_QUEUE_TRANSFER_BIT |
+                              VK_QUEUE_COMPUTE_BIT;
+
+    int best = -1;
+    for (int i = 0; i < qfnum; i++) {
+        const VkQueueFamilyProperties *cand = &qfs[i];
+        if ((cand->queueFlags & flags) != flags)
+            continue; // lacks at least one requested capability
+
+        // A numerically smaller masked flag value counts as more specialized
+        if (best < 0 || (cand->queueFlags & mask) < (qfs[best].queueFlags & mask))
+            best = i;
+
+        // Identical (unmasked) flags: prefer the family with more queues
+        if (cand->queueFlags == qfs[best].queueFlags &&
+            cand->queueCount > qfs[best].queueCount)
+            best = i;
+    }
+
+    return best;
+}
+/*
+ * Creates the logical device for `vk->physd` together with the command pools
+ * that are used internally.
+ *
+ * In order, this: sets up the internal queue locks; picks queue families for
+ * graphics, compute and transfer; if `params->surface` is set, checks that
+ * the graphics family can present to it; collects the device extensions to
+ * enable; reduces the requested feature chain to what the driver reports as
+ * supported; creates the VkDevice; loads the device-level function pointers;
+ * and creates one command pool per queue family that is used internally.
+ *
+ * Returns true on success. On failure a fatal message is logged,
+ * `vk->failed` is set, and false is returned.
+ */
+static bool device_init(struct vk_ctx *vk, const struct pl_vulkan_params *params)
+{
+    pl_assert(vk->physd);
+    void *tmp = pl_tmp(NULL); // scratch allocation parent, freed on every exit path
+
+    // Enumerate the queue families and find suitable families for each task
+    uint32_t qfnum = 0;
+    vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL);
+    VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs);
+    vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs);
+    init_queue_locks(vk, qfnum, qfs);
+
+    PL_DEBUG(vk, "Queue families supported by device:");
+    for (int i = 0; i < qfnum; i++) {
+        PL_DEBUG(vk, "    %d: flags 0x%"PRIx32" num %"PRIu32, i,
+                 qfs[i].queueFlags, qfs[i].queueCount);
+    }
+
+    VkQueueFlagBits gfx_flags = VK_QUEUE_GRAPHICS_BIT;
+    if (!params->async_compute)
+        gfx_flags |= VK_QUEUE_COMPUTE_BIT; // graphics QF must then also handle compute
+
+    int idx_gfx  = find_qf(qfs, qfnum, gfx_flags);
+    int idx_comp = find_qf(qfs, qfnum, VK_QUEUE_COMPUTE_BIT);
+    int idx_tf   = find_qf(qfs, qfnum, VK_QUEUE_TRANSFER_BIT);
+    if (idx_tf < 0)
+        idx_tf = idx_comp; // no QF advertises TRANSFER: fall back to the compute QF
+
+    if (!params->async_compute)
+        idx_comp = idx_gfx;
+    if (!params->async_transfer)
+        idx_tf = idx_gfx;
+
+    PL_DEBUG(vk, "Using graphics queue %d", idx_gfx);
+    if (idx_tf != idx_gfx)
+        PL_INFO(vk, "Using async transfer (queue %d)", idx_tf);
+    if (idx_comp != idx_gfx)
+        PL_INFO(vk, "Using async compute (queue %d)", idx_comp);
+
+    // Vulkan requires at least one GRAPHICS+COMPUTE queue, so if this fails
+    // something is horribly wrong.
+    pl_assert(idx_gfx >= 0 && idx_comp >= 0 && idx_tf >= 0);
+
+    // If needed, ensure we can actually present to the surface using this queue
+    if (params->surface) {
+        VkBool32 sup = false;
+        VK(vk->GetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx_gfx,
+                                                  params->surface, &sup));
+        if (!sup) {
+            PL_FATAL(vk, "Queue family does not support surface presentation!");
+            goto error;
+        }
+    }
+
+    // Enumerate all supported extensions
+    pl_clock_t start = pl_clock_now();
+    uint32_t num_exts_avail = 0;
+    VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, NULL));
+    VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail);
+    VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, exts_avail));
+    pl_log_cpu_time(vk->log, start, pl_clock_now(), "enumerating device extensions");
+
+    PL_DEBUG(vk, "Available device extensions:");
+    for (int i = 0; i < num_exts_avail; i++)
+        PL_DEBUG(vk, "    %s", exts_avail[i].extensionName);
+
+    // Add all extensions we need
+    if (params->surface)
+        PL_ARRAY_APPEND(vk->alloc, vk->exts, VK_KHR_SWAPCHAIN_EXTENSION_NAME);
+
+    // Keep track of all optional function pointers associated with extensions
+    PL_ARRAY(const struct vk_fun *) ext_funs = {0};
+
+    // Add all optional device-level extensions
+    for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) {
+        const struct vk_ext *ext = &vk_device_extensions[i];
+        uint32_t core_ver = vk_ext_promoted_ver(ext->name);
+        if (core_ver && vk->api_ver >= core_ver) {
+            // Extension is already implicitly enabled by the API version
+            for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+                PL_ARRAY_APPEND(tmp, ext_funs,  f);
+            continue;
+        }
+
+        for (int n = 0; n < num_exts_avail; n++) {
+            if (strcmp(ext->name, exts_avail[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(vk->alloc, vk->exts, ext->name);
+                for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+                    PL_ARRAY_APPEND(tmp, ext_funs, f);
+                break;
+            }
+        }
+    }
+
+    // Add extra user extensions (unconditionally, without an availability check)
+    for (int i = 0; i < params->num_extensions; i++)
+        PL_ARRAY_APPEND(vk->alloc, vk->exts, params->extensions[i]);
+
+    // Add optional extra user extensions (only those the device reports)
+    for (int i = 0; i < params->num_opt_extensions; i++) {
+        const char *ext = params->opt_extensions[i];
+        for (int n = 0; n < num_exts_avail; n++) {
+            if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(vk->alloc, vk->exts, ext);
+                break;
+            }
+        }
+    }
+
+    VkPhysicalDeviceFeatures2 features = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR
+    };
+
+    vk_features_normalize(tmp, &pl_vulkan_required_features, vk->api_ver, &features);
+    vk_features_normalize(tmp, &pl_vulkan_recommended_features, vk->api_ver, &features);
+    vk_features_normalize(tmp, params->features, vk->api_ver, &features);
+
+    // Explicitly clear the features struct before querying feature support
+    // from the driver. This way, we don't mistakenly mark as supported
+    // features coming from structs the driver doesn't have support for.
+    VkPhysicalDeviceFeatures2 *features_sup = vk_chain_memdup(tmp, &features);;
+    for (VkBaseOutStructure *out = (void *) features_sup; out; out = out->pNext) {
+        const size_t size = vk_struct_size(out->sType);
+        memset(&out[1], 0, size - sizeof(out[0])); // zero everything after the sType/pNext header
+    }
+
+    vk->GetPhysicalDeviceFeatures2KHR(vk->physd, features_sup);
+
+    // Filter out unsupported features (each struct body is treated as VkBool32[])
+    for (VkBaseOutStructure *f = (VkBaseOutStructure *) &features; f; f = f->pNext) {
+        const VkBaseInStructure *sup = vk_find_struct(features_sup, f->sType);
+        VkBool32 *flags = (VkBool32 *) &f[1];
+        const VkBool32 *flags_sup = (const VkBool32 *) &sup[1];
+        const size_t size = vk_struct_size(f->sType) - sizeof(VkBaseOutStructure);
+        for (int i = 0; i < size / sizeof(VkBool32); i++)
+            flags[i] &= flags_sup[i]; // keep a feature only if the driver reports it too
+    }
+
+    // Construct normalized output chain
+    vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+    vk_features_normalize(vk->alloc, &features, 0, &vk->features);
+    if (!check_required_features(vk)) {
+        PL_FATAL(vk, "Vulkan device does not support all required features!");
+        goto error;
+    }
+
+    // Enable every queue of each selected family at device creation time, to
+    // maximize compatibility with other API users (e.g. FFmpeg)
+    PL_ARRAY(VkDeviceQueueCreateInfo) qinfos = {0};
+    for (int i = 0; i < qfnum; i++) {
+        bool use_qf = i == idx_gfx || i == idx_comp || i == idx_tf;
+        use_qf |= qfs[i].queueFlags & params->extra_queues; // also QFs the user asked for by capability
+        if (!use_qf)
+            continue;
+        PL_ARRAY_APPEND(tmp, qinfos, (VkDeviceQueueCreateInfo) {
+            .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+            .queueFamilyIndex = i,
+            .queueCount = qfs[i].queueCount,
+            .pQueuePriorities = pl_calloc(tmp, qfs[i].queueCount, sizeof(float)),
+        });
+    }
+
+    VkDeviceCreateInfo dinfo = {
+        .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
+        .pNext = &features, // the filtered feature chain built above
+        .pQueueCreateInfos = qinfos.elem,
+        .queueCreateInfoCount = qinfos.num,
+        .ppEnabledExtensionNames = vk->exts.elem,
+        .enabledExtensionCount = vk->exts.num,
+    };
+
+    PL_INFO(vk, "Creating vulkan device%s", vk->exts.num ? " with extensions:" : "");
+    for (int i = 0; i < vk->exts.num; i++)
+        PL_INFO(vk, "    %s", vk->exts.elem[i]);
+
+    start = pl_clock_now();
+    VK(vk->CreateDevice(vk->physd, &dinfo, PL_VK_ALLOC, &vk->dev));
+    pl_log_cpu_time(vk->log, start, pl_clock_now(), "creating vulkan device");
+
+    // Load all mandatory device-level functions
+    for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++)
+        load_vk_fun(vk, &vk_dev_funs[i]);
+
+    // Load all of the optional functions from the extensions we enabled
+    for (int i = 0; i < ext_funs.num; i++)
+        load_vk_fun(vk, ext_funs.elem[i]);
+
+    // Create the command pools for the queues we care about
+    const uint32_t qmax = PL_DEF(params->queue_count, UINT32_MAX); // presumably 0 means "no limit"
+    for (int i = 0; i < qfnum; i++) {
+        if (i != idx_gfx && i != idx_tf && i != idx_comp)
+            continue; // ignore QFs not used internally
+
+        int qnum = qfs[i].queueCount;
+        if (qmax < qnum) {
+            PL_DEBUG(vk, "Restricting QF %d from %d queues to %d", i, qnum, qmax);
+            qnum = qmax;
+        }
+
+        struct vk_cmdpool *pool = vk_cmdpool_create(vk, i, qnum, qfs[i]);
+        if (!pool)
+            goto error;
+        PL_ARRAY_APPEND(vk->alloc, vk->pools, pool);
+
+        // Update the pool_* pointers; for a shared QF the last matching name wins
+        const char *qf_name = NULL;
+        if (i == idx_tf) {
+            vk->pool_transfer = pool;
+            qf_name = "transfer";
+        }
+        if (i == idx_comp) {
+            vk->pool_compute = pool;
+            qf_name = "compute";
+        }
+        if (i == idx_gfx) {
+            vk->pool_graphics = pool;
+            qf_name = "graphics";
+        }
+
+        for (int n = 0; n < pool->num_queues; n++)
+            PL_VK_NAME_HANDLE(QUEUE, pool->queues[n], qf_name);
+    }
+
+    pl_free(tmp);
+    return true;
+
+error:
+    PL_FATAL(vk, "Failed creating logical device!");
+    pl_free(tmp);
+    vk->failed = true;
+    return false;
+}
+
+// Public `lock_queue` entry point: forwards to the callback stored in the
+// context, passing along the stored `queue_ctx`.
+static void lock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx)
+{
+    struct vk_ctx *ctx = PL_PRIV(pl_vk);
+    ctx->lock_queue(ctx->queue_ctx, qf, qidx);
+}
+
+// Public `unlock_queue` entry point: forwards to the callback stored in the
+// context, passing along the stored `queue_ctx`.
+static void unlock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx)
+{
+    struct vk_ctx *ctx = PL_PRIV(pl_vk);
+    ctx->unlock_queue(ctx->queue_ctx, qf, qidx);
+}
+
+// Completes a context whose device and command pools already exist: creates
+// the memory allocator and the `pl_gpu`, optionally caps the reported GLSL
+// version at `max_glsl_version` (0 leaves it untouched), and publishes the
+// resulting objects on the public struct.
+// Returns false if the allocator or the GPU could not be created.
+static bool finalize_context(struct pl_vulkan_t *pl_vk, int max_glsl_version)
+{
+    struct vk_ctx *vk = PL_PRIV(pl_vk);
+
+    // The caller must have populated all three pool pointers
+    pl_assert(vk->pool_graphics);
+    pl_assert(vk->pool_compute);
+    pl_assert(vk->pool_transfer);
+
+    if (!(vk->ma = vk_malloc_create(vk)))
+        return false;
+
+    if (!(pl_vk->gpu = pl_gpu_create_vk(vk)))
+        return false;
+
+    // Blacklist / restrict features
+    if (max_glsl_version) {
+        // The public `glsl` field is const-qualified, hence the cast
+        struct pl_glsl_version *glsl = (struct pl_glsl_version *) &pl_vk->gpu->glsl;
+        glsl->version = PL_MIN(glsl->version, max_glsl_version);
+        glsl->version = PL_MAX(glsl->version, 140); // required for GL_KHR_vulkan_glsl
+        PL_INFO(vk, "Restricting GLSL version to %d... new version is %d",
+                max_glsl_version, glsl->version);
+    }
+
+    // Publish the core vulkan handles and device information
+    pl_vk->get_proc_addr = vk->GetInstanceProcAddr;
+    pl_vk->instance = vk->inst;
+    pl_vk->phys_device = vk->physd;
+    pl_vk->device = vk->dev;
+    pl_vk->api_version = vk->api_ver;
+    pl_vk->features = &vk->features;
+    pl_vk->extensions = vk->exts.elem;
+    pl_vk->num_extensions = vk->exts.num;
+    pl_vk->lock_queue = lock_queue;
+    pl_vk->unlock_queue = unlock_queue;
+
+    // Describe every command pool as a public queue entry, and remember
+    // which entries back the graphics / compute / transfer roles
+    struct pl_vulkan_queue *queues = pl_calloc_ptr(vk->alloc, vk->pools.num, queues);
+    for (int i = 0; i < vk->pools.num; i++) {
+        const struct vk_cmdpool *pool = vk->pools.elem[i];
+        queues[i] = (struct pl_vulkan_queue) {
+            .index = pool->qf,
+            .count = pool->num_queues,
+        };
+
+        if (pool == vk->pool_graphics)
+            pl_vk->queue_graphics = queues[i];
+        if (pool == vk->pool_compute)
+            pl_vk->queue_compute = queues[i];
+        if (pool == vk->pool_transfer)
+            pl_vk->queue_transfer = queues[i];
+    }
+    pl_vk->queues = queues;
+    pl_vk->num_queues = vk->pools.num;
+
+    pl_assert(vk->lock_queue);
+    pl_assert(vk->unlock_queue);
+    return true;
+}
+/*
+ * Creates a new `pl_vulkan`, including its logical device.
+ *
+ * `params` is passed through PL_DEF with `pl_vulkan_default_params` as the
+ * fallback. If `params->instance` is unset, an instance is created
+ * internally (in which case neither `params->surface` nor `params->device`
+ * may be set). The physical device is `params->device` if given, otherwise
+ * the result of pl_vulkan_choose_device(). The effective API version is the
+ * device's version, optionally capped by `params->max_api_version`, and must
+ * be at least PL_VK_MIN_VERSION.
+ *
+ * Returns the new object, or NULL on failure; on failure a fatal message is
+ * logged and the partially initialized object is passed to
+ * pl_vulkan_destroy().
+ */
+pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params)
+{
+    params = PL_DEF(params, &pl_vulkan_default_params);
+    struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx);
+    struct vk_ctx *vk = PL_PRIV(pl_vk);
+    *vk = (struct vk_ctx) {
+        .vulkan = pl_vk,
+        .alloc = pl_vk, // allocations parented to the public object
+        .log = log,
+        .inst = params->instance,
+        .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr),
+    };
+
+    pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE); // initialized before any `goto error`, which destroys it
+    if (!vk->GetInstanceProcAddr)
+        goto error;
+
+    if (!vk->inst) {
+        pl_assert(!params->surface);
+        pl_assert(!params->device);
+        PL_DEBUG(vk, "No VkInstance provided, creating one...");
+
+        // Mirror the instance params here to set `get_proc_addr` correctly
+        struct pl_vk_inst_params iparams;
+        iparams = *PL_DEF(params->instance_params, &pl_vk_inst_default_params);
+        iparams.get_proc_addr = params->get_proc_addr;
+        vk->internal_instance = pl_vk_inst_create(log, &iparams);
+        if (!vk->internal_instance)
+            goto error;
+        vk->inst = vk->internal_instance->instance;
+    }
+
+    // Directly load all mandatory instance-level function pointers, since
+    // these will be required for all further device creation logic
+    for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++)
+        load_vk_fun(vk, &vk_inst_funs[i]);
+
+    // Choose the physical device
+    if (params->device) {
+        PL_DEBUG(vk, "Using specified VkPhysicalDevice");
+        vk->physd = params->device;
+    } else {
+        struct pl_vulkan_device_params dparams = {
+            .instance       = vk->inst,
+            .get_proc_addr  = params->get_proc_addr,
+            .surface        = params->surface,
+            .device_name    = params->device_name,
+            .allow_software = params->allow_software,
+        };
+        memcpy(dparams.device_uuid, params->device_uuid, VK_UUID_SIZE); // array member, so copied separately
+
+        vk->physd = pl_vulkan_choose_device(log, &dparams);
+        if (!vk->physd) {
+            PL_FATAL(vk, "Found no suitable device, giving up.");
+            goto error;
+        }
+    }
+
+    VkPhysicalDeviceIDPropertiesKHR id_props = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+    };
+
+    VkPhysicalDeviceProperties2KHR prop = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+        .pNext = &id_props, // chained so the same query also fills in the UUID
+    };
+
+    vk->GetPhysicalDeviceProperties2(vk->physd, &prop);
+    vk->props = prop.properties;
+
+    PL_INFO(vk, "Vulkan device properties:");
+    PL_INFO(vk, "    Device Name: %s", prop.properties.deviceName);
+    PL_INFO(vk, "    Device ID: %"PRIx32":%"PRIx32, prop.properties.vendorID,
+            prop.properties.deviceID);
+    PL_INFO(vk, "    Device UUID: %s", PRINT_UUID(id_props.deviceUUID));
+    PL_INFO(vk, "    Driver version: %"PRIx32, prop.properties.driverVersion);
+    PL_INFO(vk, "    API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion));
+
+    // Needed by device_init
+    vk->api_ver = prop.properties.apiVersion;
+    if (params->max_api_version) {
+        vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version);
+        PL_INFO(vk, "Restricting API version to %d.%d.%d... new version %d.%d.%d",
+                PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver));
+    }
+
+    if (vk->api_ver < PL_VK_MIN_VERSION) {
+        PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum "
+                 "required version of %d.%d.%d, cannot proceed!",
+                 PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION));
+        goto error;
+    }
+
+    // Finally, initialize the logical device and the rest of the vk_ctx
+    if (!device_init(vk, params))
+        goto error;
+
+    if (!finalize_context(pl_vk, params->max_glsl_version))
+        goto error;
+
+    return pl_vk;
+
+error:
+    PL_FATAL(vk, "Failed initializing vulkan device");
+    pl_vulkan_destroy((pl_vulkan *) &pl_vk);
+    return NULL;
+}
+/*
+ * Wraps an externally created Vulkan instance / physical device / logical
+ * device (taken from `params`) in a new `pl_vulkan`.
+ *
+ * The context is flagged as `imported`, so pl_vulkan_destroy() does not
+ * destroy the VkDevice. Queue locking uses the callbacks from `params` if
+ * `params->lock_queue` is set; otherwise internal per-queue mutexes are
+ * created. The enabled features are taken from `params->features` and must
+ * pass check_required_features(). Optional extension functions are loaded
+ * for every known extension that is either covered by the effective API
+ * version or listed in `params->extensions`. One command pool is created per
+ * distinct queue family among `queue_graphics`, `queue_compute` and
+ * `queue_transfer` (entries with a zero `count` are skipped), and a graphics
+ * pool must result.
+ *
+ * Returns the new object, or NULL on failure; on failure a fatal message is
+ * logged and the partially initialized object is passed to
+ * pl_vulkan_destroy().
+ */
+pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params)
+{
+    void *tmp = pl_tmp(NULL); // scratch allocation parent, freed on every exit path
+
+    struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx);
+    struct vk_ctx *vk = PL_PRIV(pl_vk);
+    *vk = (struct vk_ctx) {
+        .vulkan = pl_vk,
+        .alloc = pl_vk,
+        .log = log,
+        .imported = true, // pl_vulkan_destroy() then leaves `dev` alone
+        .inst = params->instance,
+        .physd = params->phys_device,
+        .dev = params->device,
+        .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr),
+        .lock_queue = params->lock_queue,
+        .unlock_queue = params->unlock_queue,
+        .queue_ctx = params->queue_ctx,
+    };
+
+    pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE); // initialized before any `goto error`, which destroys it
+    if (!vk->GetInstanceProcAddr)
+        goto error;
+
+    for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++)
+        load_vk_fun(vk, &vk_inst_funs[i]);
+
+    VkPhysicalDeviceIDPropertiesKHR id_props = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+    };
+
+    VkPhysicalDeviceProperties2KHR prop = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+        .pNext = &id_props, // chained so the same query also fills in the UUID
+    };
+
+    pl_assert(vk->GetPhysicalDeviceProperties2);
+    vk->GetPhysicalDeviceProperties2(vk->physd, &prop);
+    vk->props = prop.properties;
+
+    PL_INFO(vk, "Imported vulkan device properties:");
+    PL_INFO(vk, "    Device Name: %s", prop.properties.deviceName);
+    PL_INFO(vk, "    Device ID: %"PRIx32":%"PRIx32, prop.properties.vendorID,
+            prop.properties.deviceID);
+    PL_INFO(vk, "    Device UUID: %s", PRINT_UUID(id_props.deviceUUID));
+    PL_INFO(vk, "    Driver version: %"PRIx32, prop.properties.driverVersion);
+    PL_INFO(vk, "    API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion));
+
+    vk->api_ver = prop.properties.apiVersion;
+    if (params->max_api_version) {
+        vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version);
+        PL_INFO(vk, "Restricting API version to %d.%d.%d... new version %d.%d.%d",
+                PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver));
+    }
+
+    if (vk->api_ver < PL_VK_MIN_VERSION) {
+        PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum "
+                 "required version of %d.%d.%d, cannot proceed!",
+                 PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION));
+        goto error;
+    }
+
+    vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+    vk_features_normalize(vk->alloc, params->features, 0, &vk->features);
+    if (!check_required_features(vk)) {
+        PL_FATAL(vk, "Imported Vulkan device was not created with all required "
+                 "features!");
+        goto error;
+    }
+
+    // Load all mandatory device-level functions
+    for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++)
+        load_vk_fun(vk, &vk_dev_funs[i]);
+
+    // Load all of the optional functions from the extensions enabled
+    for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) {
+        const struct vk_ext *ext = &vk_device_extensions[i];
+        uint32_t core_ver = vk_ext_promoted_ver(ext->name);
+        if (core_ver && vk->api_ver >= core_ver) { // covered by the API version, no extension needed
+            for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+                load_vk_fun(vk, f);
+            continue;
+        }
+        for (int n = 0; n < params->num_extensions; n++) {
+            if (strcmp(ext->name, params->extensions[n]) == 0) {
+                for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+                    load_vk_fun(vk, f);
+                break;
+            }
+        }
+    }
+
+    uint32_t qfnum = 0;
+    vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL);
+    VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs);
+    vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs);
+    if (!params->lock_queue)
+        init_queue_locks(vk, qfnum, qfs); // no user callback: fall back to internal mutexes
+
+    // Create the command pools for each unique qf that exists
+    struct {
+        const struct pl_vulkan_queue *info;
+        struct vk_cmdpool **pool;
+        VkQueueFlagBits flags; // *any* of these flags provide the cap
+    } qinfos[] = {
+        {
+            .info = &params->queue_graphics,
+            .pool = &vk->pool_graphics,
+            .flags = VK_QUEUE_GRAPHICS_BIT,
+        }, {
+            .info = &params->queue_compute,
+            .pool = &vk->pool_compute,
+            .flags = VK_QUEUE_COMPUTE_BIT,
+        }, {
+            .info = &params->queue_transfer,
+            .pool = &vk->pool_transfer,
+            .flags = VK_QUEUE_TRANSFER_BIT |
+                     VK_QUEUE_GRAPHICS_BIT |
+                     VK_QUEUE_COMPUTE_BIT,
+        }
+    };
+
+    for (int i = 0; i < PL_ARRAY_SIZE(qinfos); i++) {
+        int qf = qinfos[i].info->index;
+        struct vk_cmdpool **pool = qinfos[i].pool;
+        if (!qinfos[i].info->count)
+            continue; // this queue role was not provided
+
+        // API sanity check: the QF must have at least one of the expected flags
+        pl_assert(qfs[qf].queueFlags & qinfos[i].flags);
+
+        // See if we already created a pool for this queue family
+        for (int j = 0; j < i; j++) {
+            if (qinfos[j].info->count && qinfos[j].info->index == qf) {
+                *pool = *qinfos[j].pool;
+                goto next_qf;
+            }
+        }
+
+        *pool = vk_cmdpool_create(vk, qf, qinfos[i].info->count, qfs[qf]);
+        if (!*pool)
+            goto error;
+        PL_ARRAY_APPEND(vk->alloc, vk->pools, *pool);
+
+        // Pre-emptively set "lower priority" pools as well
+        for (int j = i+1; j < PL_ARRAY_SIZE(qinfos); j++) {
+            if (qfs[qf].queueFlags & qinfos[j].flags)
+                *qinfos[j].pool = *pool; // replaced later if that role gets its own pool
+        }
+
+next_qf: ;
+    }
+
+    if (!vk->pool_graphics) {
+        PL_ERR(vk, "No valid queues provided?");
+        goto error;
+    }
+
+    if (!finalize_context(pl_vk, params->max_glsl_version))
+        goto error;
+
+    pl_free(tmp);
+    return pl_vk;
+
+error:
+    PL_FATAL(vk, "Failed importing vulkan device");
+    pl_vulkan_destroy((pl_vulkan *) &pl_vk);
+    pl_free(tmp);
+    return NULL;
+}
diff --git a/src/vulkan/formats.c b/src/vulkan/formats.c
new file mode 100644
index 0000000..f0eb0fb
--- /dev/null
+++ b/src/vulkan/formats.c
@@ -0,0 +1,616 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "formats.h"
+
+#define FMT(_name, num, size, ftype, bits, idx) /* generic non-opaque pl_fmt_t template */ \
+    (struct pl_fmt_t) {                         \
+        .name = _name,                          \
+        .type = PL_FMT_##ftype,                 \
+        .num_components  = num,                 \
+        .component_depth = bits,                \
+        .internal_size   = size,                \
+        .opaque          = false,               \
+        .texel_size      = size, /* host texel size equals the internal size */ \
+        .texel_align     = size,                \
+        .host_bits       = bits, /* host depth equals the component depth */ \
+        .sample_order    = idx,                 \
+    }
+
+#define IDX(...)  {__VA_ARGS__} // brace-list used for .sample_order
+#define BITS(...) {__VA_ARGS__} // brace-list used for .component_depth / .host_bits
+
+#define REGFMT(name, num, bits, type) /* `num` equal-depth components, identity order */ \
+    FMT(name, num, (num) * (bits) / 8, type,    \
+        BITS(bits, bits, bits, bits),           \
+        IDX(0, 1, 2, 3))
+
+#define EMUFMT(_name, in, en, ib, eb, ftype) /* in/ib: internal comps/bits, en/eb: host comps/bits */ \
+    (struct pl_fmt_t) {                         \
+        .name = _name,                          \
+        .type = PL_FMT_##ftype,                 \
+        .num_components  = en,                  \
+        .component_depth = BITS(ib, ib, ib, ib),\
+        .internal_size   = (in) * (ib) / 8,     \
+        .opaque          = false,               \
+        .emulated        = true,                \
+        .texel_size      = (en) * (eb) / 8,     \
+        .texel_align     = (eb) / 8,            \
+        .host_bits       = BITS(eb, eb, eb, eb),\
+        .sample_order    = IDX(0, 1, 2, 3),     \
+    }
+
+#define PACKED16FMT(_name, num, b) /* UNORM, depth `b`, 16 host bits (2 bytes) per component */ \
+    (struct pl_fmt_t) {                         \
+        .name            = _name,               \
+        .type            = PL_FMT_UNORM,        \
+        .num_components  = num,                 \
+        .component_depth = BITS(b, b, b, b),    \
+        .internal_size   = (num) * 2,           \
+        .texel_size      = (num) * 2,           \
+        .texel_align     = (num) * 2,           \
+        .host_bits       = BITS(16, 16, 16, 16),\
+        .sample_order    = IDX(0, 1, 2, 3),     \
+    }
+
+#define PLANARFMT(_name, planes, size, bits) /* opaque UNORM format with `planes` planes, always 3 components */ \
+    (struct pl_fmt_t) {                         \
+        .name            = _name,               \
+        .type            = PL_FMT_UNORM,        \
+        .num_planes      = planes,              \
+        .num_components  = 3,                   \
+        .component_depth = {bits, bits, bits},  \
+        .internal_size   = size,                \
+        .opaque          = true,                \
+    }
+
+static const struct vk_format rgb8e = {       // emulation fallback referenced by the "rgb8" table entry
+    .tfmt   = VK_FORMAT_R8G8B8A8_UNORM,       // texture side uses the 4-component format
+    .fmt    = EMUFMT("rgb8", 4, 3, 8, 8, UNORM),
+    .icomps = 4,
+    .bfmt   = VK_FORMAT_R8G8B8_UNORM,         // buffer side keeps the 3-component format
+};
+
+static const struct vk_format rgb16e = {      // emulation fallback referenced by the "rgb16" table entry
+    .tfmt   = VK_FORMAT_R16G16B16A16_UNORM,   // texture side uses the 4-component format
+    .fmt    = EMUFMT("rgb16", 4, 3, 16, 16, UNORM),
+    .icomps = 4,
+    .bfmt   = VK_FORMAT_R16G16B16_UNORM,      // buffer side keeps the 3-component format
+};
+
+static const struct vk_format vk_formats[] = { // entries are {tfmt, fmt template, ...}; list ends with {0}
+    // Regular, byte-aligned integer formats
+    {VK_FORMAT_R8_UNORM,              REGFMT("r8",       1,  8, UNORM)},
+    {VK_FORMAT_R8G8_UNORM,            REGFMT("rg8",      2,  8, UNORM)},
+    {VK_FORMAT_R8G8B8_UNORM,          REGFMT("rgb8",     3,  8, UNORM), .emufmt = &rgb8e},
+    {VK_FORMAT_R8G8B8A8_UNORM,        REGFMT("rgba8",    4,  8, UNORM)},
+    {VK_FORMAT_R16_UNORM,             REGFMT("r16",      1, 16, UNORM)},
+    {VK_FORMAT_R16G16_UNORM,          REGFMT("rg16",     2, 16, UNORM)},
+    {VK_FORMAT_R16G16B16_UNORM,       REGFMT("rgb16",    3, 16, UNORM), .emufmt = &rgb16e},
+    {VK_FORMAT_R16G16B16A16_UNORM,    REGFMT("rgba16",   4, 16, UNORM)},
+
+    {VK_FORMAT_R8_SNORM,              REGFMT("r8s",      1,  8, SNORM)},
+    {VK_FORMAT_R8G8_SNORM,            REGFMT("rg8s",     2,  8, SNORM)},
+    {VK_FORMAT_R8G8B8_SNORM,          REGFMT("rgb8s",    3,  8, SNORM)},
+    {VK_FORMAT_R8G8B8A8_SNORM,        REGFMT("rgba8s",   4,  8, SNORM)},
+    {VK_FORMAT_R16_SNORM,             REGFMT("r16s",     1, 16, SNORM)},
+    {VK_FORMAT_R16G16_SNORM,          REGFMT("rg16s",    2, 16, SNORM)},
+    {VK_FORMAT_R16G16B16_SNORM,       REGFMT("rgb16s",   3, 16, SNORM)},
+    {VK_FORMAT_R16G16B16A16_SNORM,    REGFMT("rgba16s",  4, 16, SNORM)},
+
+    // Float formats (native formats: hf = 16-bit half float, f = 32-bit float)
+    {VK_FORMAT_R16_SFLOAT,            REGFMT("r16hf",    1, 16, FLOAT)},
+    {VK_FORMAT_R16G16_SFLOAT,         REGFMT("rg16hf",   2, 16, FLOAT)},
+    {VK_FORMAT_R16G16B16_SFLOAT,      REGFMT("rgb16hf",  3, 16, FLOAT)},
+    {VK_FORMAT_R16G16B16A16_SFLOAT,   REGFMT("rgba16hf", 4, 16, FLOAT)},
+    {VK_FORMAT_R32_SFLOAT,            REGFMT("r32f",     1, 32, FLOAT)},
+    {VK_FORMAT_R32G32_SFLOAT,         REGFMT("rg32f",    2, 32, FLOAT)},
+    {VK_FORMAT_R32G32B32_SFLOAT,      REGFMT("rgb32f",   3, 32, FLOAT)},
+    {VK_FORMAT_R32G32B32A32_SFLOAT,   REGFMT("rgba32f",  4, 32, FLOAT)},
+
+    // Float formats (emulated upload/download: 16 bits internally, 32 host bits)
+    {VK_FORMAT_R16_SFLOAT,            EMUFMT("r16f",     1, 1, 16, 32, FLOAT)},
+    {VK_FORMAT_R16G16_SFLOAT,         EMUFMT("rg16f",    2, 2, 16, 32, FLOAT)},
+    {VK_FORMAT_R16G16B16_SFLOAT,      EMUFMT("rgb16f",   3, 3, 16, 32, FLOAT)},
+    {VK_FORMAT_R16G16B16A16_SFLOAT,   EMUFMT("rgba16f",  4, 4, 16, 32, FLOAT)},
+
+    // Integer-sampled formats
+    {VK_FORMAT_R8_UINT,               REGFMT("r8u",      1,  8, UINT)},
+    {VK_FORMAT_R8G8_UINT,             REGFMT("rg8u",     2,  8, UINT)},
+    {VK_FORMAT_R8G8B8_UINT,           REGFMT("rgb8u",    3,  8, UINT)},
+    {VK_FORMAT_R8G8B8A8_UINT,         REGFMT("rgba8u",   4,  8, UINT)},
+    {VK_FORMAT_R16_UINT,              REGFMT("r16u",     1, 16, UINT)},
+    {VK_FORMAT_R16G16_UINT,           REGFMT("rg16u",    2, 16, UINT)},
+    {VK_FORMAT_R16G16B16_UINT,        REGFMT("rgb16u",   3, 16, UINT)},
+    {VK_FORMAT_R16G16B16A16_UINT,     REGFMT("rgba16u",  4, 16, UINT)},
+    {VK_FORMAT_R32_UINT,              REGFMT("r32u",     1, 32, UINT)},
+    {VK_FORMAT_R32G32_UINT,           REGFMT("rg32u",    2, 32, UINT)},
+    {VK_FORMAT_R32G32B32_UINT,        REGFMT("rgb32u",   3, 32, UINT)},
+    {VK_FORMAT_R32G32B32A32_UINT,     REGFMT("rgba32u",  4, 32, UINT)},
+
+    {VK_FORMAT_R8_SINT,               REGFMT("r8i",      1,  8, SINT)},
+    {VK_FORMAT_R8G8_SINT,             REGFMT("rg8i",     2,  8, SINT)},
+    {VK_FORMAT_R8G8B8_SINT,           REGFMT("rgb8i",    3,  8, SINT)},
+    {VK_FORMAT_R8G8B8A8_SINT,         REGFMT("rgba8i",   4,  8, SINT)},
+    {VK_FORMAT_R16_SINT,              REGFMT("r16i",     1, 16, SINT)},
+    {VK_FORMAT_R16G16_SINT,           REGFMT("rg16i",    2, 16, SINT)},
+    {VK_FORMAT_R16G16B16_SINT,        REGFMT("rgb16i",   3, 16, SINT)},
+    {VK_FORMAT_R16G16B16A16_SINT,     REGFMT("rgba16i",  4, 16, SINT)},
+    {VK_FORMAT_R32_SINT,              REGFMT("r32i",     1, 32, SINT)},
+    {VK_FORMAT_R32G32_SINT,           REGFMT("rg32i",    2, 32, SINT)},
+    {VK_FORMAT_R32G32B32_SINT,        REGFMT("rgb32i",   3, 32, SINT)},
+    {VK_FORMAT_R32G32B32A32_SINT,     REGFMT("rgba32i",  4, 32, SINT)},
+
+    // "Swapped" component order formats
+    {VK_FORMAT_B8G8R8_UNORM,             FMT("bgr8",     3,  3, UNORM, BITS(8,  8,  8),     IDX(2, 1, 0))},
+    {VK_FORMAT_B8G8R8A8_UNORM,           FMT("bgra8",    4,  4, UNORM, BITS(8,  8,  8,  8), IDX(2, 1, 0, 3))},
+
+    {VK_FORMAT_B8G8R8_UINT,              FMT("bgr8u",    3,  3, UINT,  BITS(8,  8,  8),     IDX(2, 1, 0))},
+    {VK_FORMAT_B8G8R8A8_UINT,            FMT("bgra8u",   4,  4, UINT,  BITS(8,  8,  8,  8), IDX(2, 1, 0, 3))},
+
+    {VK_FORMAT_B8G8R8_SINT,              FMT("bgr8i",    3,  3, SINT,  BITS(8,  8,  8),     IDX(2, 1, 0))},
+    {VK_FORMAT_B8G8R8A8_SINT,            FMT("bgra8i",   4,  4, SINT,  BITS(8,  8,  8,  8), IDX(2, 1, 0, 3))},
+
+    // "Packed" integer formats
+    //
+    // Note: These have the component order reversed from what the vulkan name
+    // implies, because we order our IDX from LSB to MSB (consistent with the
+    // usual ordering from lowest byte to highest byte, on little endian
+    // platforms), but Vulkan names them from MSB to LSB.
+    {VK_FORMAT_R4G4_UNORM_PACK8,         FMT("gr4",      2,  1, UNORM, BITS(4,  4),         IDX(1, 0))},
+    {VK_FORMAT_B4G4R4A4_UNORM_PACK16,    FMT("argb4",    4,  2, UNORM, BITS(4,  4,  4,  4), IDX(3, 0, 1, 2))},
+    {VK_FORMAT_R4G4B4A4_UNORM_PACK16,    FMT("abgr4",    4,  2, UNORM, BITS(4,  4,  4,  4), IDX(3, 2, 1, 0))},
+
+    {VK_FORMAT_R5G6B5_UNORM_PACK16,      FMT("bgr565",   3,  2, UNORM, BITS(5,  6,  5),     IDX(2, 1, 0))},
+    {VK_FORMAT_B5G6R5_UNORM_PACK16,      FMT("rgb565",   3,  2, UNORM, BITS(5,  6,  5),     IDX(0, 1, 2))},
+
+    {VK_FORMAT_R5G5B5A1_UNORM_PACK16,    FMT("a1bgr5",   4,  2, UNORM, BITS(1,  5,  5,  5), IDX(3, 2, 1, 0))},
+    {VK_FORMAT_B5G5R5A1_UNORM_PACK16,    FMT("a1rgb5",   4,  2, UNORM, BITS(1,  5,  5,  5), IDX(3, 0, 1, 2))},
+    {VK_FORMAT_A1R5G5B5_UNORM_PACK16,    FMT("bgr5a1",   4,  2, UNORM, BITS(5,  5,  5,  1), IDX(2, 1, 0, 3))},
+
+    {VK_FORMAT_A2B10G10R10_UNORM_PACK32, FMT("rgb10a2",  4,  4, UNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+    {VK_FORMAT_A2R10G10B10_UNORM_PACK32, FMT("bgr10a2",  4,  4, UNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+    {VK_FORMAT_A2B10G10R10_SNORM_PACK32, FMT("rgb10a2s", 4,  4, SNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+    {VK_FORMAT_A2R10G10B10_SNORM_PACK32, FMT("bgr10a2s", 4,  4, SNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+    {VK_FORMAT_A2B10G10R10_UINT_PACK32,  FMT("rgb10a2u", 4,  4, UINT,  BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+    {VK_FORMAT_A2R10G10B10_UINT_PACK32,  FMT("bgr10a2u", 4,  4, UINT,  BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+    {VK_FORMAT_A2B10G10R10_SINT_PACK32,  FMT("rgb10a2i", 4,  4, SINT,  BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+    {VK_FORMAT_A2R10G10B10_SINT_PACK32,  FMT("bgr10a2i", 4,  4, SINT,  BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+
+
+    // Packed 16 bit formats
+    {VK_FORMAT_R10X6_UNORM_PACK16,                  PACKED16FMT("rx10",         1, 10)},
+    {VK_FORMAT_R10X6G10X6_UNORM_2PACK16,            PACKED16FMT("rxgx10",       2, 10)},
+    {VK_FORMAT_R12X4_UNORM_PACK16,                  PACKED16FMT("rx12",         1, 12)},
+    {VK_FORMAT_R12X4G12X4_UNORM_2PACK16,            PACKED16FMT("rxgx12",       2, 12)},
+
+    // FIXME: enabling these requires VK_EXT_rgba10x6_formats or equivalent
+    // {VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16,  PACKED16FMT("rxgxbxax10",   4, 10)},
+    // {VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16,  PACKED16FMT("rxgxbxax12",   4, 12)},
+
+    // Planar formats (.pfmt gives each plane's format plus its x/y shift)
+    {VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, PLANARFMT("g8_b8_r8_420", 3, 12, 8),
+        .pfmt = {
+            {VK_FORMAT_R8_UNORM},
+            {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1},
+            {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1},
+        },
+    },
+    {VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, PLANARFMT("g8_b8_r8_422", 3, 16, 8),
+        .pfmt = {
+            {VK_FORMAT_R8_UNORM},
+            {VK_FORMAT_R8_UNORM, .sx = 1},
+            {VK_FORMAT_R8_UNORM, .sx = 1},
+        },
+    },
+    {VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, PLANARFMT("g8_b8_r8_444", 3, 24, 8),
+        .pfmt = {
+            {VK_FORMAT_R8_UNORM},
+            {VK_FORMAT_R8_UNORM},
+            {VK_FORMAT_R8_UNORM},
+        },
+    },
+
+    {VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, PLANARFMT("g16_b16_r16_420", 3, 24, 16),
+        .pfmt = {
+            {VK_FORMAT_R16_UNORM},
+            {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1},
+            {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1},
+        },
+    },
+    {VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, PLANARFMT("g16_b16_r16_422", 3, 32, 16),
+        .pfmt = {
+            {VK_FORMAT_R16_UNORM},
+            {VK_FORMAT_R16_UNORM, .sx = 1},
+            {VK_FORMAT_R16_UNORM, .sx = 1},
+        },
+    },
+    {VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, PLANARFMT("g16_b16_r16_444", 3, 48, 16),
+        .pfmt = {
+            {VK_FORMAT_R16_UNORM},
+            {VK_FORMAT_R16_UNORM},
+            {VK_FORMAT_R16_UNORM},
+        },
+    },
+
+    {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_420", 3, 24, 10),
+        .pfmt = {
+            {VK_FORMAT_R10X6_UNORM_PACK16},
+            {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 1},
+            {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 1},
+        },
+    },
+    {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_422", 3, 32, 10),
+        .pfmt = {
+            {VK_FORMAT_R10X6_UNORM_PACK16},
+            {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1},
+            {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1},
+        },
+    },
+    {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_444", 3, 48, 10),
+        .pfmt = {
+            {VK_FORMAT_R10X6_UNORM_PACK16},
+            {VK_FORMAT_R10X6_UNORM_PACK16},
+            {VK_FORMAT_R10X6_UNORM_PACK16},
+        },
+    },
+
+    {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_420", 3, 24, 12),
+        .pfmt = {
+            {VK_FORMAT_R12X4_UNORM_PACK16},
+            {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1},
+            {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1},
+        },
+    },
+    {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_422", 3, 32, 12),
+        .pfmt = {
+            {VK_FORMAT_R12X4_UNORM_PACK16},
+            {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1},
+            {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1},
+        },
+    },
+    {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_444", 3, 48, 12),
+        .pfmt = {
+            {VK_FORMAT_R12X4_UNORM_PACK16},
+            {VK_FORMAT_R12X4_UNORM_PACK16},
+            {VK_FORMAT_R12X4_UNORM_PACK16},
+        },
+    },
+
+    {VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, PLANARFMT("g8_br8_420", 2, 12, 8),
+        .pfmt = {
+            {VK_FORMAT_R8_UNORM},
+            {VK_FORMAT_R8G8_UNORM, .sx = 1, .sy = 1},
+        },
+    },
+    {VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, PLANARFMT("g8_br8_422", 2, 16, 8),
+        .pfmt = {
+            {VK_FORMAT_R8_UNORM},
+            {VK_FORMAT_R8G8_UNORM, .sx = 1},
+        },
+    },
+    {VK_FORMAT_G8_B8R8_2PLANE_444_UNORM, PLANARFMT("g8_br8_444", 2, 24, 8),
+        .min_ver = VK_API_VERSION_1_3,
+        .pfmt = {
+            {VK_FORMAT_R8_UNORM},
+            {VK_FORMAT_R8G8_UNORM},
+        },
+    },
+
+    {VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, PLANARFMT("g16_br16_420", 2, 24, 16),
+        .pfmt = {
+            {VK_FORMAT_R16_UNORM},
+            {VK_FORMAT_R16G16_UNORM, .sx = 1, .sy = 1},
+        },
+    },
+    {VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, PLANARFMT("g16_br16_422", 2, 32, 16),
+        .pfmt = {
+            {VK_FORMAT_R16_UNORM},
+            {VK_FORMAT_R16G16_UNORM, .sx = 1},
+        },
+    },
+    {VK_FORMAT_G16_B16R16_2PLANE_444_UNORM, PLANARFMT("g16_br16_444", 2, 48, 16),
+        .min_ver = VK_API_VERSION_1_3,
+        .pfmt = {
+            {VK_FORMAT_R16_UNORM},
+            {VK_FORMAT_R16G16_UNORM},
+        },
+    },
+
+    {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_420", 2, 24, 10),
+        .pfmt = {
+            {VK_FORMAT_R10X6_UNORM_PACK16},
+            {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1, .sy = 1},
+        },
+    },
+    {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_422", 2, 32, 10),
+        .pfmt = {
+            {VK_FORMAT_R10X6_UNORM_PACK16},
+            {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1},
+        },
+    },
+    {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_444", 2, 48, 10),
+        .min_ver = VK_API_VERSION_1_3,
+        .pfmt = {
+            {VK_FORMAT_R10X6_UNORM_PACK16},
+            {VK_FORMAT_R10X6G10X6_UNORM_2PACK16},
+        },
+    },
+
+    {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_420", 2, 24, 12),
+        .pfmt = {
+            {VK_FORMAT_R12X4_UNORM_PACK16},
+            {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1, .sy = 1},
+        },
+    },
+    {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_422", 2, 32, 12),
+        .pfmt = {
+            {VK_FORMAT_R12X4_UNORM_PACK16},
+            {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1},
+        },
+    },
+    {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_444", 2, 48, 12),
+        .min_ver = VK_API_VERSION_1_3,
+        .pfmt = {
+            {VK_FORMAT_R12X4_UNORM_PACK16},
+            {VK_FORMAT_R12X4G12X4_UNORM_2PACK16},
+        },
+    },
+
+    {0} // sentinel: vk_setup_formats() stops at the first entry with tfmt == 0
+};
+
+#undef BITS
+#undef IDX
+#undef REGFMT
+#undef FMT
+
+// Walks `vk_formats`, queries the physical device for each entry that passes
+// the version/emulation checks, and stores the resulting `pl_fmt` objects
+// (caps, DRM modifiers, plane formats) in `gpu->formats` / `gpu->num_formats`.
+void vk_setup_formats(struct pl_gpu_t *gpu)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    PL_ARRAY(pl_fmt) formats = {0};
+
+    // Texture format emulation requires at least support for texel buffers
+    bool has_emu = gpu->glsl.compute && gpu->limits.max_buffer_texels;
+
+    for (const struct vk_format *pvk_fmt = vk_formats; pvk_fmt->tfmt; pvk_fmt++) {
+        const struct vk_format *vk_fmt = pvk_fmt;
+
+        // Skip formats that require a too new version of Vulkan
+        if (vk_fmt->min_ver > vk->api_ver)
+            continue;
+
+        // Skip formats with innately emulated representation if unsupported
+        if (vk_fmt->fmt.emulated && !has_emu)
+            continue;
+
+        // Suppress some errors/warnings spit out by the format probing code
+        pl_log_level_cap(vk->log, PL_LOG_INFO);
+
+        bool has_drm_mods = vk->GetImageDrmFormatModifierPropertiesEXT;
+        VkDrmFormatModifierPropertiesEXT modifiers[16] = {0};
+        VkDrmFormatModifierPropertiesListEXT drm_props = {
+            .sType = VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT,
+            .drmFormatModifierCount = PL_ARRAY_SIZE(modifiers),
+            .pDrmFormatModifierProperties = modifiers,
+        };
+
+        VkFormatProperties2KHR prop2 = {
+            .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2,
+            .pNext = has_drm_mods ? &drm_props : NULL,
+        };
+
+        vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2);
+
+        // If wholly unsupported, try falling back to the emulation formats
+        // for texture operations
+        VkFormatProperties *prop = &prop2.formatProperties;
+        while (has_emu && !prop->optimalTilingFeatures && vk_fmt->emufmt) {
+            vk_fmt = vk_fmt->emufmt;
+            // The count is in/out: the previous query overwrote the capacity
+            drm_props.drmFormatModifierCount = PL_ARRAY_SIZE(modifiers);
+            vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2);
+        }
+
+        VkFormatFeatureFlags texflags = prop->optimalTilingFeatures;
+        VkFormatFeatureFlags bufflags = prop->bufferFeatures;
+        if (vk_fmt->fmt.emulated) {
+            // Emulated formats might have a different buffer representation
+            // than their texture representation. If they don't, assume their
+            // buffer representation is nonsensical (e.g. r16f)
+            if (vk_fmt->bfmt) {
+                vk->GetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->bfmt, prop);
+                bufflags = prop->bufferFeatures;
+            } else {
+                bufflags = 0;
+            }
+        } else if (vk_fmt->fmt.num_planes) {
+            // Planar textures cannot be used directly
+            texflags = bufflags = 0;
+        }
+
+        pl_log_level_cap(vk->log, PL_LOG_NONE);
+
+        struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, struct pl_fmt_vk);
+        struct pl_fmt_vk *fmtp = PL_PRIV(fmt);
+        *fmt = vk_fmt->fmt;
+        *fmtp = (struct pl_fmt_vk) {
+            .vk_fmt = vk_fmt
+        };
+
+        // Always set the signature to the actual texture format, so we can use
+        // it to guarantee renderpass compatibility.
+        fmt->signature = (uint64_t) vk_fmt->tfmt;
+
+        // For sanity, clear the superfluous fields
+        for (int i = fmt->num_components; i < 4; i++) {
+            fmt->component_depth[i] = 0;
+            fmt->sample_order[i] = 0;
+            fmt->host_bits[i] = 0;
+        }
+
+        // We can set this universally
+        fmt->fourcc = pl_fmt_fourcc(fmt);
+
+        if (has_drm_mods) {
+            if (drm_props.drmFormatModifierCount == PL_ARRAY_SIZE(modifiers)) {
+                PL_WARN(gpu, "DRM modifier list for format %s possibly truncated",
+                        fmt->name);
+            }
+
+            // Query the list of supported DRM modifiers from the driver
+            PL_ARRAY(uint64_t) modlist = {0};
+            for (uint32_t i = 0; i < drm_props.drmFormatModifierCount; i++) {
+                if (modifiers[i].drmFormatModifierPlaneCount > 1) {
+                    PL_TRACE(gpu, "Ignoring format modifier %s of "
+                             "format %s because its plane count %"PRIu32" > 1",
+                             PRINT_DRM_MOD(modifiers[i].drmFormatModifier),
+                             fmt->name, modifiers[i].drmFormatModifierPlaneCount);
+                    continue;
+                }
+
+                // Only warn about texture format features relevant to us
+                const VkFormatFeatureFlags flag_mask =
+                    VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT |
+                    VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
+                    VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
+                    VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT |
+                    VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
+                    VK_FORMAT_FEATURE_BLIT_SRC_BIT |
+                    VK_FORMAT_FEATURE_BLIT_DST_BIT;
+
+                VkFormatFeatureFlags flags = modifiers[i].drmFormatModifierTilingFeatures;
+                if ((flags & flag_mask) != (texflags & flag_mask)) {
+                    PL_DEBUG(gpu, "DRM format modifier %s of format %s "
+                            "supports fewer caps (0x%"PRIx32") than optimal tiling "
+                            "(0x%"PRIx32"), may result in limited capability!",
+                            PRINT_DRM_MOD(modifiers[i].drmFormatModifier),
+                            fmt->name, flags, texflags);
+                }
+
+                PL_ARRAY_APPEND(fmt, modlist, modifiers[i].drmFormatModifier);
+            }
+
+            fmt->num_modifiers = modlist.num;
+            fmt->modifiers = modlist.elem;
+        } else if (gpu->export_caps.tex & PL_HANDLE_DMA_BUF) {
+            // Hard-code a list of static mods that we're likely to support
+            static const uint64_t static_mods[2] = {
+                DRM_FORMAT_MOD_INVALID,
+                DRM_FORMAT_MOD_LINEAR,
+            };
+
+            fmt->num_modifiers = PL_ARRAY_SIZE(static_mods);
+            fmt->modifiers = static_mods;
+        }
+
+        // Buffer capabilities: derived from `bufflags` only
+        struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bufbits[] = {
+            {VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT,        PL_FMT_CAP_VERTEX},
+            {VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_UNIFORM},
+            {VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_STORAGE},
+        };
+
+        for (int i = 0; i < PL_ARRAY_SIZE(bufbits); i++) {
+            if ((bufflags & bufbits[i].flags) == bufbits[i].flags)
+                fmt->caps |= bufbits[i].caps;
+        }
+
+        // At this point `caps` can only contain buffer capabilities
+        if (fmt->caps) {
+            fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, ""));
+            pl_assert(fmt->glsl_type);
+        }
+
+        // Texture capabilities: derived from `texflags` only
+        struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bits[] = {
+            {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT,      PL_FMT_CAP_BLENDABLE},
+            {VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, PL_FMT_CAP_LINEAR},
+            {VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT,               PL_FMT_CAP_SAMPLEABLE},
+            {VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT,               PL_FMT_CAP_STORABLE},
+            {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT,            PL_FMT_CAP_RENDERABLE},
+
+            // We don't distinguish between the two blit modes for pl_fmt_caps
+            {VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT,
+                PL_FMT_CAP_BLITTABLE},
+        };
+
+        for (int i = 0; i < PL_ARRAY_SIZE(bits); i++) {
+            if ((texflags & bits[i].flags) == bits[i].flags)
+                fmt->caps |= bits[i].caps;
+        }
+
+        // For blit emulation via compute shaders
+        if (!(fmt->caps & PL_FMT_CAP_BLITTABLE) && (fmt->caps & PL_FMT_CAP_STORABLE)) {
+            fmt->caps |= PL_FMT_CAP_BLITTABLE;
+            fmtp->blit_emulated = true;
+        }
+
+        // This is technically supported for all textures, but the semantics
+        // of pl_gpu require it only be listed for non-opaque ones
+        if (!fmt->opaque)
+            fmt->caps |= PL_FMT_CAP_HOST_READABLE;
+
+        // Vulkan requires a minimum GLSL version that supports textureGather()
+        if (fmt->caps & PL_FMT_CAP_SAMPLEABLE)
+            fmt->gatherable = true;
+
+        // Disable implied capabilities where the dependencies are unavailable
+        enum pl_fmt_caps storable = PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE;
+        if (!(fmt->caps & PL_FMT_CAP_SAMPLEABLE))
+            fmt->caps &= ~PL_FMT_CAP_LINEAR;
+        if (!gpu->glsl.compute)
+            fmt->caps &= ~storable;
+
+        bool has_nofmt = vk->features.features.shaderStorageImageReadWithoutFormat &&
+                         vk->features.features.shaderStorageImageWriteWithoutFormat;
+
+        if (fmt->caps & storable) {
+            int real_comps = PL_DEF(vk_fmt->icomps, fmt->num_components);
+            fmt->glsl_format = pl_fmt_glsl_format(fmt, real_comps);
+            if (!fmt->glsl_format && !has_nofmt) {
+                PL_DEBUG(gpu, "Storable format '%s' has no matching GLSL "
+                         "format qualifier but read/write without format "
+                         "is not supported.. disabling", fmt->name);
+                fmt->caps &= ~storable;
+            }
+        }
+
+        if (fmt->caps & storable)
+            fmt->caps |= PL_FMT_CAP_READWRITE;
+
+        // Pick sub-plane formats for planar formats, by matching the plane's
+        // vulkan format against the signature of the formats added so far
+        for (int n = 0; n < fmt->num_planes; n++) {
+            for (int i = 0; i < formats.num; i++) {
+                if (formats.elem[i]->signature == vk_fmt->pfmt[n].fmt) {
+                    fmt->planes[n].format = formats.elem[i];
+                    fmt->planes[n].shift_x = vk_fmt->pfmt[n].sx;
+                    fmt->planes[n].shift_y = vk_fmt->pfmt[n].sy;
+                    break;
+                }
+            }
+
+            pl_assert(fmt->planes[n].format);
+        }
+
+        PL_ARRAY_APPEND(gpu, formats, fmt);
+    }
+
+    gpu->formats = formats.elem;
+    gpu->num_formats = formats.num;
+}
diff --git a/src/vulkan/formats.h b/src/vulkan/formats.h
new file mode 100644
index 0000000..b1408fd
--- /dev/null
+++ b/src/vulkan/formats.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+#include "gpu.h"
+
+struct vk_format {
+    VkFormat tfmt;      // internal vulkan format enum (textures); 0 marks the end of the format table
+    struct pl_fmt_t fmt;// pl_fmt template (features will be auto-detected)
+    int icomps;         // internal component count (or 0 to infer from `fmt`)
+    VkFormat bfmt;      // vulkan format for use as buffers (or 0 to use `tfmt`); for emulated formats, 0 makes vk_setup_formats() report no buffer caps
+    const struct vk_format *emufmt; // alternate format for emulation, tried when `tfmt` reports no optimal tiling features
+    uint32_t min_ver;   // minimum vulkan API version for this format to exist
+    struct { VkFormat fmt; int sx, sy; } pfmt[4]; // plane formats (for planar textures); sx/sy become the plane's shift_x/shift_y
+};
+
+// Probe the device and set `gpu->formats` / `gpu->num_formats` to the list of supported formats
+void vk_setup_formats(struct pl_gpu_t *gpu);
diff --git a/src/vulkan/gpu.c b/src/vulkan/gpu.c
new file mode 100644
index 0000000..69aca67
--- /dev/null
+++ b/src/vulkan/gpu.c
@@ -0,0 +1,924 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "formats.h"
+#include "glsl/spirv.h"
+
+#ifdef PL_HAVE_UNIX
+#include <unistd.h>
+#endif
+
+// Gives us enough queries for 8 results
+#define QUERY_POOL_SIZE 16
+
+struct pl_timer_t {
+    VkQueryPool qpool; // timestamp pool of QUERY_POOL_SIZE queries; even=start, odd=stop
+    int index_write; // next (even) index to write to, wraps at QUERY_POOL_SIZE
+    int index_read; // next (even) index to read from, wraps at QUERY_POOL_SIZE
+    uint64_t pending; // bitmask of query pairs that are still running, see timer_bit()
+};
+
+static inline uint64_t timer_bit(int index)
+{
+    return 1llu << (index / 2); // a start/stop query pair shares one `pending` bit
+}
+
+static void timer_destroy_cb(pl_gpu gpu, pl_timer timer)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    // Actual teardown of a timer: no query may still be marked as running
+    pl_assert(!timer->pending);
+    vk->DestroyQueryPool(vk->dev, timer->qpool, PL_VK_ALLOC);
+    pl_free(timer);
+}
+
+static pl_timer vk_timer_create(pl_gpu gpu)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    // Allocates a timer and its timestamp query pool; returns NULL on failure
+    pl_timer timer = pl_alloc_ptr(NULL, timer);
+    *timer = (struct pl_timer_t) {0}; // indices and pending mask start out at zero
+
+    struct VkQueryPoolCreateInfo qinfo = {
+        .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
+        .queryType = VK_QUERY_TYPE_TIMESTAMP,
+        .queryCount = QUERY_POOL_SIZE,
+    };
+
+    VK(vk->CreateQueryPool(vk->dev, &qinfo, PL_VK_ALLOC, &timer->qpool)); // VK() presumably jumps to `error` on failure
+    return timer;
+
+error:
+    timer_destroy_cb(gpu, timer); // qpool is still the zeroed (null) handle on this path
+    return NULL;
+}
+
+static void vk_timer_destroy(pl_gpu gpu, pl_timer timer)
+{
+    vk_gpu_idle_callback(gpu, (vk_cb) timer_destroy_cb, gpu, timer); // teardown happens inside the callback
+}
+
+static uint64_t vk_timer_query(pl_gpu gpu, pl_timer timer)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    // Returns the next unread duration, or 0 if no result is available (yet)
+    if (timer->index_read == timer->index_write)
+        return 0; // no more unprocessed results
+
+    vk_poll_commands(vk, 0); // presumably runs callbacks of finished commands (clearing `pending` bits) - confirm
+    if (timer->pending & timer_bit(timer->index_read))
+        return 0; // still waiting for results
+
+    VkResult res;
+    uint64_t ts[2] = {0}; // ts[0] = start timestamp, ts[1] = stop timestamp
+    res = vk->GetQueryPoolResults(vk->dev, timer->qpool, timer->index_read, 2,
+                                  sizeof(ts), &ts[0], sizeof(uint64_t),
+                                  VK_QUERY_RESULT_64_BIT);
+
+    switch (res) {
+    case VK_SUCCESS:
+        timer->index_read = (timer->index_read + 2) % QUERY_POOL_SIZE; // consume this pair
+        return (ts[1] - ts[0]) * vk->props.limits.timestampPeriod; // ticks scaled by the device's timestamp period
+    case VK_NOT_READY:
+        return 0;
+    default:
+        PL_VK_ASSERT(res, "Retrieving query pool results"); // presumably logs and jumps to `error`
+    }
+
+error:
+    return 0;
+}
+
+static void timer_begin(pl_gpu gpu, struct vk_cmd *cmd, pl_timer timer)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    // Records the start timestamp of `timer` into `cmd`, if that is possible
+    if (!timer)
+        return; // no timing requested for this command
+
+    if (!cmd->pool->props.timestampValidBits) {
+        PL_TRACE(gpu, "QF %d does not support timestamp queries", cmd->pool->qf);
+        return;
+    }
+
+    vk_poll_commands(vk, 0);
+    if (timer->pending & timer_bit(timer->index_write))
+        return; // next query is still running, skip this timer
+
+    VkQueueFlags reset_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
+    if (cmd->pool->props.queueFlags & reset_flags) {
+        // Use direct command buffer resets
+        vk->CmdResetQueryPool(cmd->buf, timer->qpool, timer->index_write, 2);
+    } else {
+        // Use host query reset
+        vk->ResetQueryPool(vk->dev, timer->qpool, timer->index_write, 2);
+    }
+
+    vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                          timer->qpool, timer->index_write); // even slot = start
+
+    p->cmd_timer = timer; // tells _end_cmd to write the matching stop timestamp
+}
+
+static inline bool supports_marks(struct vk_cmd *cmd) {
+    // Debug labels may only be recorded on graphics- or compute-capable queues
+    const VkQueueFlags marker_queues = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
+    return cmd->pool->props.queueFlags & marker_queues;
+}
+
+struct vk_cmd *_begin_cmd(pl_gpu gpu, enum queue_type type, const char *label,
+                          pl_timer timer)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    pl_mutex_lock(&p->recording); // stays held on success; _end_cmd releases it
+    // Pick the command pool; ANY keeps the current command's pool if there is one
+    struct vk_cmdpool *pool;
+    switch (type) {
+    case ANY:      pool = p->cmd ? p->cmd->pool : vk->pool_graphics; break;
+    case GRAPHICS: pool = vk->pool_graphics; break;
+    case COMPUTE:  pool = vk->pool_compute;  break;
+    case TRANSFER: pool = vk->pool_transfer; break;
+    default: pl_unreachable();
+    }
+
+    if (!p->cmd || p->cmd->pool != pool) { // nothing recording, or wrong pool: start a new command
+        vk_cmd_submit(&p->cmd);
+        p->cmd = vk_cmd_begin(pool, label);
+        if (!p->cmd) {
+            pl_mutex_unlock(&p->recording); // failure: do not leave the lock held
+            return NULL;
+        }
+    }
+
+    if (vk->CmdBeginDebugUtilsLabelEXT && supports_marks(p->cmd)) { // entry point may be unavailable
+        vk->CmdBeginDebugUtilsLabelEXT(p->cmd->buf, &(VkDebugUtilsLabelEXT) {
+            .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,
+            .pLabelName = label,
+        });
+    }
+
+    timer_begin(gpu, p->cmd, timer); // no-op if `timer` is NULL
+    return p->cmd;
+}
+
+static void timer_end_cb(void *ptimer, void *pindex)
+{
+    // Command callback: `pindex` is the query index, passed through a
+    // pointer-sized integer; its pair is no longer pending after this.
+    pl_timer done = ptimer;
+    done->pending &= ~timer_bit((int) (uintptr_t) pindex);
+}
+
+bool _end_cmd(pl_gpu gpu, struct vk_cmd **pcmd, bool submit)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    bool ret = true;
+    if (!pcmd) { // nothing to end: optionally just submit whatever is currently recording
+        if (submit) {
+            pl_mutex_lock(&p->recording);
+            ret = vk_cmd_submit(&p->cmd);
+            pl_mutex_unlock(&p->recording);
+        }
+        return ret;
+    }
+
+    struct vk_cmd *cmd = *pcmd;
+    pl_assert(p->cmd == cmd); // must be the command handed out by _begin_cmd
+
+    if (p->cmd_timer) { // set by timer_begin when a start timestamp was recorded
+        pl_timer timer = p->cmd_timer;
+        vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                              timer->qpool, timer->index_write + 1); // odd slot = stop
+
+        timer->pending |= timer_bit(timer->index_write);
+        vk_cmd_callback(cmd, (vk_cb) timer_end_cb, timer,
+                        (void *) (uintptr_t) timer->index_write); // clears the pending bit again
+
+        timer->index_write = (timer->index_write + 2) % QUERY_POOL_SIZE;
+        if (timer->index_write == timer->index_read) {
+            // forcibly drop the least recent result to make space
+            timer->index_read = (timer->index_read + 2) % QUERY_POOL_SIZE;
+        }
+
+        p->cmd_timer = NULL;
+    }
+
+    if (vk->CmdEndDebugUtilsLabelEXT && supports_marks(cmd))
+        vk->CmdEndDebugUtilsLabelEXT(cmd->buf); // closes the label opened in _begin_cmd
+
+    if (submit)
+        ret = vk_cmd_submit(&p->cmd);
+
+    pl_mutex_unlock(&p->recording); // pairs with the lock taken in _begin_cmd
+    return ret;
+}
+
+void vk_gpu_idle_callback(pl_gpu gpu, vk_cb cb, const void *priv, const void *arg)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+
+    // Attach the callback to the command currently being recorded if there
+    // is one; otherwise pass it on to vk_dev_callback.
+    pl_mutex_lock(&p->recording);
+    if (!p->cmd)
+        vk_dev_callback(p->vk, cb, priv, arg);
+    else
+        vk_cmd_callback(p->cmd, cb, priv, arg);
+    pl_mutex_unlock(&p->recording);
+}
+
+static void vk_gpu_destroy(pl_gpu gpu)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    // Submit whatever is still being recorded, then wait for the device
+    vk_cmd_submit(&p->cmd);
+    vk_wait_idle(vk);
+    // Also reached from pl_gpu_create_vk's error path, where samplers may still be unset
+    for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) {
+        for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++)
+            vk->DestroySampler(vk->dev, p->samplers[s][a], PL_VK_ALLOC);
+    }
+
+    pl_spirv_destroy(&p->spirv);
+    pl_mutex_destroy(&p->recording);
+    pl_free((void *) gpu); // cast drops the const of the pl_gpu handle
+}
+
+pl_vulkan pl_vulkan_get(pl_gpu gpu)
+{
+    // Only a GPU whose `destroy` entry is ours wraps a `struct pl_vk`
+    const struct pl_gpu_fns *fns = PL_PRIV(gpu);
+    if (fns->destroy != vk_gpu_destroy)
+        return NULL;
+
+    struct pl_vk *p = (struct pl_vk *) fns;
+    return p->vk->vulkan;
+}
+
+static pl_handle_caps vk_sync_handle_caps(struct vk_ctx *vk)
+{
+    pl_handle_caps caps = 0;
+    // Collect every handle type from vk_sync_handle_list that semaphores can be exported as
+    for (int i = 0; vk_sync_handle_list[i]; i++) { // list is terminated by a zero entry
+        enum pl_handle_type type = vk_sync_handle_list[i];
+
+        VkPhysicalDeviceExternalSemaphoreInfo info = {
+            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR,
+            .handleType = vk_sync_handle_type(type),
+        };
+
+        VkExternalSemaphoreProperties props = {
+            .sType = VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR,
+        };
+
+        vk->GetPhysicalDeviceExternalSemaphoreProperties(vk->physd, &info, &props);
+        VkExternalSemaphoreFeatureFlags flags = props.externalSemaphoreFeatures;
+        if ((props.compatibleHandleTypes & info.handleType) &&
+            (flags & VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR))
+        {
+            caps |= type; // handle types are used as bit flags here
+        }
+    }
+
+    return caps;
+}
+
+static pl_handle_caps vk_tex_handle_caps(struct vk_ctx *vk, bool import)
+{
+    pl_handle_caps caps = 0;
+    // Collect the handle types usable for texture import (`import`) or export (`!import`)
+    for (int i = 0; vk_mem_handle_list[i]; i++) { // list is terminated by a zero entry
+        enum pl_handle_type handle_type = vk_mem_handle_list[i];
+        if (handle_type == PL_HANDLE_DMA_BUF && !vk->GetImageDrmFormatModifierPropertiesEXT) {
+            PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: no DRM modifiers",
+                     vk_handle_name(vk_mem_handle_type(PL_HANDLE_DMA_BUF)),
+                     (unsigned int) PL_HANDLE_DMA_BUF);
+            continue;
+        }
+
+        // Query whether creation of a "basic" dummy texture would work
+        VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = { // only chained in for DMA_BUF, below
+            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
+            .drmFormatModifier = DRM_FORMAT_MOD_LINEAR,
+            .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+        };
+
+        VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = {
+            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR,
+            .handleType = vk_mem_handle_type(handle_type),
+        };
+
+        VkPhysicalDeviceImageFormatInfo2KHR pinfo = {
+            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR,
+            .pNext = &ext_pinfo,
+            .format = VK_FORMAT_R8_UNORM,
+            .type = VK_IMAGE_TYPE_2D,
+            .tiling = VK_IMAGE_TILING_OPTIMAL,
+            .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+        };
+
+        if (handle_type == PL_HANDLE_DMA_BUF) {
+            vk_link_struct(&pinfo, &drm_pinfo);
+            pinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; // overrides OPTIMAL from above
+        }
+
+        VkExternalImageFormatPropertiesKHR ext_props = {
+            .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
+        };
+
+        VkImageFormatProperties2KHR props = {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR,
+            .pNext = &ext_props,
+        };
+
+        VkResult res;
+        res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props);
+        if (res != VK_SUCCESS) { // treated as "not supported" rather than as an error
+            PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: %s",
+                     vk_handle_name(ext_pinfo.handleType),
+                     (unsigned int) handle_type,
+                     vk_res_str(res));
+            continue;
+        }
+
+        if (vk_external_mem_check(vk, &ext_props.externalMemoryProperties,
+                                  handle_type, import))
+        {
+            caps |= handle_type;
+        }
+    }
+
+#ifdef VK_EXT_metal_objects
+    if (vk->ExportMetalObjectsEXT && import) // metal handles are only reported for import
+        caps |= PL_HANDLE_MTL_TEX | PL_HANDLE_IOSURFACE;
+#endif
+
+    return caps;
+}
+
+static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = { // indexed by enum pl_tex_sample_mode
+    [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST,
+    [PL_TEX_SAMPLE_LINEAR]  = VK_FILTER_LINEAR,
+};
+
+static inline struct pl_spirv_version get_spirv_version(const struct vk_ctx *vk)
+{
+    if (vk->api_ver >= VK_API_VERSION_1_3) { // 1.3 with maintenance4 enabled => SPIR-V 1.6
+        const VkPhysicalDeviceMaintenance4Features *device_maintenance4;
+        device_maintenance4 = vk_find_struct(&vk->features,
+            VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES);
+
+        if (device_maintenance4 && device_maintenance4->maintenance4) {
+            return (struct pl_spirv_version) {
+                .env_version = VK_API_VERSION_1_3,
+                .spv_version = PL_SPV_VERSION(1, 6),
+            };
+        }
+    }
+    // Otherwise fall back to Vulkan 1.2 / SPIR-V 1.5; older API versions are not accepted
+    pl_assert(vk->api_ver >= VK_API_VERSION_1_2);
+    return (struct pl_spirv_version) {
+        .env_version = VK_API_VERSION_1_2,
+        .spv_version = PL_SPV_VERSION(1, 5),
+    };
+}
+
+static const struct pl_gpu_fns pl_fns_vk;
+
+pl_gpu pl_gpu_create_vk(struct vk_ctx *vk)
+{
+    pl_assert(vk->dev);
+    // Builds the `pl_gpu` for an existing vulkan device; returns NULL on failure
+    struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_vk); // private data is a struct pl_vk
+    gpu->log = vk->log;
+
+    struct pl_vk *p = PL_PRIV(gpu);
+    pl_mutex_init(&p->recording);
+    p->vk = vk;
+    p->impl = pl_fns_vk; // vtable copy; pl_vulkan_get recognizes this backend through it
+    p->spirv = pl_spirv_create(vk->log, get_spirv_version(vk));
+    if (!p->spirv)
+        goto error;
+
+    // Query all device properties (the structs below are chained through pNext)
+    VkPhysicalDevicePCIBusInfoPropertiesEXT pci_props = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT,
+    };
+
+    VkPhysicalDeviceIDPropertiesKHR id_props = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+        .pNext = &pci_props,
+    };
+
+    VkPhysicalDevicePushDescriptorPropertiesKHR pushd_props = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR,
+        .pNext = &id_props,
+    };
+
+    VkPhysicalDeviceSubgroupProperties group_props = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES,
+        .pNext = &pushd_props,
+    };
+
+    VkPhysicalDeviceExternalMemoryHostPropertiesEXT host_props = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT,
+        .pNext = &group_props,
+    };
+
+    VkPhysicalDeviceProperties2KHR props = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+        .pNext = &host_props,
+    };
+
+    bool is_portability = false; // becomes true if the portability subset extension is listed in vk->exts
+
+#ifdef VK_KHR_portability_subset
+    VkPhysicalDevicePortabilitySubsetPropertiesKHR port_props = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PORTABILITY_SUBSET_PROPERTIES_KHR,
+        .minVertexInputBindingStrideAlignment = 1, // default used if the struct is never linked in
+    };
+
+    for (int i = 0; i < vk->exts.num; i++) {
+        if (!strcmp(vk->exts.elem[i], VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME)) {
+            vk_link_struct(&props, &port_props);
+            is_portability = true;
+            break;
+        }
+    }
+#endif
+
+    vk->GetPhysicalDeviceProperties2(vk->physd, &props);
+    VkPhysicalDeviceLimits limits = props.properties.limits;
+
+    // Determine GLSL features and limits
+    gpu->glsl = (struct pl_glsl_version) {
+        .version = 450,
+        .vulkan = true,
+        .compute = true,
+        .max_shmem_size = limits.maxComputeSharedMemorySize,
+        .max_group_threads = limits.maxComputeWorkGroupInvocations,
+        .max_group_size = {
+            limits.maxComputeWorkGroupSize[0],
+            limits.maxComputeWorkGroupSize[1],
+            limits.maxComputeWorkGroupSize[2],
+        },
+    };
+
+    VkShaderStageFlags req_stages = VK_SHADER_STAGE_FRAGMENT_BIT |
+                                    VK_SHADER_STAGE_COMPUTE_BIT;
+    VkSubgroupFeatureFlags req_flags = VK_SUBGROUP_FEATURE_BASIC_BIT |
+                                       VK_SUBGROUP_FEATURE_VOTE_BIT |
+                                       VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
+                                       VK_SUBGROUP_FEATURE_BALLOT_BIT |
+                                       VK_SUBGROUP_FEATURE_SHUFFLE_BIT;
+    // Only report a subgroup size if all required stages and operations are supported
+    if ((group_props.supportedStages & req_stages) == req_stages &&
+        (group_props.supportedOperations & req_flags) == req_flags)
+    {
+        gpu->glsl.subgroup_size = group_props.subgroupSize;
+    }
+
+    if (vk->features.features.shaderImageGatherExtended) {
+        gpu->glsl.min_gather_offset = limits.minTexelGatherOffset;
+        gpu->glsl.max_gather_offset = limits.maxTexelGatherOffset;
+    }
+
+    const size_t max_size = vk_malloc_avail(vk->ma, 0); // 0 = no memory property flags required
+    gpu->limits = (struct pl_gpu_limits) {
+        // pl_gpu
+        .thread_safe        = true,
+        .callbacks          = true,
+        // pl_buf
+        .max_buf_size       = max_size,
+        .max_ubo_size       = PL_MIN(limits.maxUniformBufferRange, max_size),
+        .max_ssbo_size      = PL_MIN(limits.maxStorageBufferRange, max_size),
+        .max_vbo_size       = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT),
+        .max_mapped_size    = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT),
+        .max_buffer_texels  = PL_MIN(limits.maxTexelBufferElements, max_size),
+        .align_host_ptr     = host_props.minImportedHostPointerAlignment,
+        .host_cached        = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_CACHED_BIT),
+        // pl_tex
+        .max_tex_1d_dim     = limits.maxImageDimension1D,
+        .max_tex_2d_dim     = limits.maxImageDimension2D,
+        .max_tex_3d_dim     = limits.maxImageDimension3D,
+        .blittable_1d_3d    = true,
+        .buf_transfer       = true,
+        .align_tex_xfer_pitch  = limits.optimalBufferCopyRowPitchAlignment,
+        .align_tex_xfer_offset = pl_lcm(limits.optimalBufferCopyOffsetAlignment, 4),
+        // pl_pass
+        .max_variable_comps = 0, // vulkan doesn't support these at all
+        .max_constants      = SIZE_MAX,
+        .array_size_constants = !is_portability,
+        .max_pushc_size     = limits.maxPushConstantsSize,
+#ifdef VK_KHR_portability_subset
+        .align_vertex_stride = port_props.minVertexInputBindingStrideAlignment,
+#else
+        .align_vertex_stride = 1,
+#endif
+        .max_dispatch = {
+            limits.maxComputeWorkGroupCount[0],
+            limits.maxComputeWorkGroupCount[1],
+            limits.maxComputeWorkGroupCount[2],
+        },
+        .fragment_queues    = vk->pool_graphics->num_queues,
+        .compute_queues     = vk->pool_compute->num_queues,
+    };
+
+    gpu->export_caps.buf = vk_malloc_handle_caps(vk->ma, false);
+    gpu->import_caps.buf = vk_malloc_handle_caps(vk->ma, true);
+    gpu->export_caps.tex = vk_tex_handle_caps(vk, false);
+    gpu->import_caps.tex = vk_tex_handle_caps(vk, true);
+    gpu->export_caps.sync = vk_sync_handle_caps(vk);
+    gpu->import_caps.sync = 0; // Not supported yet
+
+    if (pl_gpu_supports_interop(gpu)) { // UUID and PCI info are only filled in when interop is possible
+        pl_static_assert(sizeof(gpu->uuid) == VK_UUID_SIZE);
+        memcpy(gpu->uuid, id_props.deviceUUID, sizeof(gpu->uuid));
+
+        gpu->pci.domain = pci_props.pciDomain;
+        gpu->pci.bus = pci_props.pciBus;
+        gpu->pci.device = pci_props.pciDevice;
+        gpu->pci.function = pci_props.pciFunction;
+    }
+
+    if (vk->CmdPushDescriptorSetKHR) // left at 0 when the entry point is unavailable
+        p->max_push_descriptors = pushd_props.maxPushDescriptors;
+
+    vk_setup_formats(gpu);
+
+    // Compute the correct minimum texture alignment
+    p->min_texel_alignment = 1;
+    for (int i = 0; i < gpu->num_formats; i++) {
+        if (gpu->formats[i]->emulated || gpu->formats[i]->opaque)
+            continue;
+        size_t texel_size = gpu->formats[i]->texel_size;
+        p->min_texel_alignment = pl_lcm(p->min_texel_alignment, texel_size); // LCM over all remaining formats
+    }
+    PL_DEBUG(gpu, "Minimum texel alignment: %zu", p->min_texel_alignment);
+
+    // Initialize the samplers
+    for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) {
+        for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++) {
+            static const VkSamplerAddressMode modes[PL_TEX_ADDRESS_MODE_COUNT] = {
+                [PL_TEX_ADDRESS_CLAMP]  = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
+                [PL_TEX_ADDRESS_REPEAT] = VK_SAMPLER_ADDRESS_MODE_REPEAT,
+                [PL_TEX_ADDRESS_MIRROR] = VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT,
+            };
+
+            VkSamplerCreateInfo sinfo = {
+                .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
+                .magFilter = filters[s],
+                .minFilter = filters[s],
+                .addressModeU = modes[a],
+                .addressModeV = modes[a],
+                .addressModeW = modes[a],
+                .maxAnisotropy = 1.0,
+            };
+
+            VK(vk->CreateSampler(vk->dev, &sinfo, PL_VK_ALLOC, &p->samplers[s][a])); // VK() presumably jumps to `error` on failure
+        }
+    }
+
+    return pl_gpu_finalize(gpu);
+
+error:
+    vk_gpu_destroy(gpu); // tears down the partially initialized gpu
+    return NULL;
+}
+
+static void vk_sync_destroy(pl_gpu gpu, pl_sync sync)
+{
+    if (!sync)
+        return;
+
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+    // Close the exported OS handles (those that were actually obtained) first
+#ifdef PL_HAVE_UNIX
+    if (sync->handle_type == PL_HANDLE_FD) {
+        if (sync->wait_handle.fd > -1) // -1 marks "never exported", see vk_sync_create
+            close(sync->wait_handle.fd);
+        if (sync->signal_handle.fd > -1)
+            close(sync->signal_handle.fd);
+    }
+#endif
+#ifdef PL_HAVE_WIN32
+    if (sync->handle_type == PL_HANDLE_WIN32) {
+        if (sync->wait_handle.handle != NULL)
+            CloseHandle(sync->wait_handle.handle);
+        if (sync->signal_handle.handle != NULL)
+            CloseHandle(sync->signal_handle.handle);
+    }
+    // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
+#endif
+
+    vk->DestroySemaphore(vk->dev, sync_vk->wait, PL_VK_ALLOC);
+    vk->DestroySemaphore(vk->dev, sync_vk->signal, PL_VK_ALLOC);
+
+    pl_free((void *) sync); // cast drops the const of the pl_sync handle
+}
+
+void vk_sync_deref(pl_gpu gpu, pl_sync sync)
+{
+    // Drop one reference (NULL is ignored); destroy once pl_rc_deref says so
+    if (sync) {
+        struct pl_sync_vk *priv = PL_PRIV(sync);
+        if (pl_rc_deref(&priv->rc))
+            vk_sync_destroy(gpu, sync);
+    }
+}
+
+static pl_sync vk_sync_create(pl_gpu gpu, enum pl_handle_type handle_type)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    // Creates a wait/signal semaphore pair and exports a `handle_type` handle for each
+    struct pl_sync_t *sync = pl_zalloc_obj(NULL, sync, struct pl_sync_vk);
+    sync->handle_type = handle_type;
+
+    struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+    pl_rc_init(&sync_vk->rc);
+
+    VkExportSemaphoreCreateInfoKHR einfo = {
+        .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR,
+        .handleTypes = vk_sync_handle_type(handle_type),
+    };
+    // Pre-set both handles to their "invalid" value so vk_sync_destroy knows what to close
+    switch (handle_type) {
+    case PL_HANDLE_FD:
+        sync->wait_handle.fd = -1;
+        sync->signal_handle.fd = -1;
+        break;
+    case PL_HANDLE_WIN32:
+    case PL_HANDLE_WIN32_KMT:
+        sync->wait_handle.handle = NULL;
+        sync->signal_handle.handle = NULL;
+        break;
+    case PL_HANDLE_DMA_BUF:
+    case PL_HANDLE_HOST_PTR:
+    case PL_HANDLE_MTL_TEX:
+    case PL_HANDLE_IOSURFACE:
+        pl_unreachable(); // not valid handle types for sync objects
+    }
+
+    const VkSemaphoreCreateInfo sinfo = {
+        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+        .pNext = &einfo,
+    };
+
+    VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->wait)); // VK() presumably jumps to `error` on failure
+    VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->signal));
+    PL_VK_NAME(SEMAPHORE, sync_vk->wait, "sync wait");
+    PL_VK_NAME(SEMAPHORE, sync_vk->signal, "sync signal");
+
+#ifdef PL_HAVE_UNIX
+    if (handle_type == PL_HANDLE_FD) {
+        VkSemaphoreGetFdInfoKHR finfo = {
+            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
+            .semaphore = sync_vk->wait,
+            .handleType = einfo.handleTypes,
+        };
+
+        VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->wait_handle.fd));
+
+        finfo.semaphore = sync_vk->signal; // reuse the same info struct for the second semaphore
+        VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->signal_handle.fd));
+    }
+#endif
+
+#ifdef PL_HAVE_WIN32
+    if (handle_type == PL_HANDLE_WIN32 ||
+        handle_type == PL_HANDLE_WIN32_KMT)
+    {
+        VkSemaphoreGetWin32HandleInfoKHR handle_info = {
+            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR,
+            .semaphore = sync_vk->wait,
+            .handleType = einfo.handleTypes,
+        };
+
+        VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info,
+                                          &sync->wait_handle.handle));
+
+        handle_info.semaphore = sync_vk->signal; // reuse the same info struct for the second semaphore
+        VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info,
+                                          &sync->signal_handle.handle));
+    }
+#endif
+
+    return sync;
+
+error:
+    vk_sync_destroy(gpu, sync); // closes obtained handles, destroys both semaphores, frees `sync`
+    return NULL;
+}
+
+void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore)
+{
+    // Nothing to do if the caller's handle is already null
+    if (!*semaphore)
+        return;
+
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    vk->DestroySemaphore(vk->dev, *semaphore, PL_VK_ALLOC);
+    *semaphore = VK_NULL_HANDLE; // the caller's handle is cleared after destruction
+}
+
+VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    // Creates a semaphore per `params`, optionally exporting a handle; VK_NULL_HANDLE on failure
+    pl_assert(PL_ISPOT(params->export_handle)); // presumably: at most one handle type bit - confirm PL_ISPOT
+    if ((params->export_handle & gpu->export_caps.sync) != params->export_handle) {
+        PL_ERR(gpu, "Invalid handle type 0x%"PRIx64" specified for "
+               "`pl_vulkan_sem_create`!", (uint64_t) params->export_handle);
+        return VK_NULL_HANDLE;
+    }
+    // Reset the output handle first, so the error path only closes what was exported
+    switch (params->export_handle) {
+    case PL_HANDLE_FD:
+        params->out_handle->fd = -1;
+        break;
+    case PL_HANDLE_WIN32:
+    case PL_HANDLE_WIN32_KMT:
+        params->out_handle->handle = NULL;
+        break;
+    case PL_HANDLE_DMA_BUF:
+    case PL_HANDLE_HOST_PTR:
+    case PL_HANDLE_MTL_TEX:
+    case PL_HANDLE_IOSURFACE:
+        pl_unreachable(); // should already have been rejected by the export_caps.sync check
+    }
+
+    const VkExportSemaphoreCreateInfoKHR einfo = {
+        .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR,
+        .handleTypes = vk_sync_handle_type(params->export_handle),
+    };
+
+    const VkSemaphoreTypeCreateInfo stinfo = {
+        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+        .pNext = params->export_handle ? &einfo : NULL, // only request export if a handle type was given
+        .semaphoreType = params->type,
+        .initialValue = params->initial_value,
+    };
+
+    const VkSemaphoreCreateInfo sinfo = {
+        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+        .pNext = &stinfo,
+    };
+
+    VkSemaphore sem = VK_NULL_HANDLE;
+    VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sem)); // VK() presumably jumps to `error` on failure
+    PL_VK_NAME(SEMAPHORE, sem, PL_DEF(params->debug_tag, "pl_vulkan_sem"));
+
+#ifdef PL_HAVE_UNIX
+    if (params->export_handle == PL_HANDLE_FD) {
+        VkSemaphoreGetFdInfoKHR finfo = {
+            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
+            .handleType = einfo.handleTypes,
+            .semaphore = sem,
+        };
+
+        VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &params->out_handle->fd));
+    }
+#endif
+
+#ifdef PL_HAVE_WIN32
+    if (params->export_handle == PL_HANDLE_WIN32 ||
+        params->export_handle == PL_HANDLE_WIN32_KMT)
+    {
+        VkSemaphoreGetWin32HandleInfoKHR handle_info = {
+            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR,
+            .handleType = einfo.handleTypes,
+            .semaphore = sem,
+        };
+
+        VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info,
+                                          &params->out_handle->handle));
+    }
+#endif
+
+    return sem;
+
+error:
+#ifdef PL_HAVE_UNIX
+    if (params->export_handle == PL_HANDLE_FD) {
+        if (params->out_handle->fd > -1) // only close a descriptor that was actually exported
+            close(params->out_handle->fd);
+    }
+#endif
+#ifdef PL_HAVE_WIN32
+    if (params->export_handle == PL_HANDLE_WIN32) {
+        if (params->out_handle->handle != NULL)
+            CloseHandle(params->out_handle->handle);
+    }
+    // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
+#endif
+    vk->DestroySemaphore(vk->dev, sem, PL_VK_ALLOC); // `sem` may still be VK_NULL_HANDLE here
+    return VK_NULL_HANDLE;
+}
+
+static void vk_gpu_flush(pl_gpu gpu)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    CMD_SUBMIT(NULL); // macro defined elsewhere; presumably submits the currently recording command - confirm
+    vk_rotate_queues(vk);
+    vk_malloc_garbage_collect(vk->ma);
+}
+
+static void vk_gpu_finish(pl_gpu gpu)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    CMD_SUBMIT(NULL); // macro defined elsewhere; presumably submits the currently recording command - confirm
+    vk_wait_idle(vk); // unlike vk_gpu_flush, this waits rather than just rotating queues
+}
+
+static bool vk_gpu_is_failed(pl_gpu gpu)
+{
+    // Forward the `failed` flag of the underlying vulkan context
+    const struct pl_vk *priv = PL_PRIV(gpu);
+    return priv->vk->failed;
+}
+
+struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_cmdpool *gfx = p->vk->pool_graphics;
+
+    // Detach the recording command from the GPU while holding the lock
+    pl_mutex_lock(&p->recording);
+    struct vk_cmd *stolen = p->cmd;
+    p->cmd = NULL;
+    pl_mutex_unlock(&p->recording);
+
+    // A command that already belongs to the graphics pool is returned as-is
+    if (stolen && stolen->pool == gfx)
+        return stolen;
+
+    vk_cmd_submit(&stolen);
+    return vk_cmd_begin(gfx, NULL);
+}
+
+void pl_vk_print_heap(pl_gpu gpu, enum pl_log_level lev)
+{
+    // Hand the vulkan context's allocator to vk_malloc_print_stats with `lev`
+    const struct pl_vk *priv = PL_PRIV(gpu);
+    vk_malloc_print_stats(priv->vk->ma, lev);
+}
+
+static const struct pl_gpu_fns pl_fns_vk = { // vulkan backend vtable; copied into pl_vk.impl by pl_gpu_create_vk
+    .destroy                = vk_gpu_destroy,
+    .tex_create             = vk_tex_create,
+    .tex_destroy            = vk_tex_deref,
+    .tex_invalidate         = vk_tex_invalidate,
+    .tex_clear_ex           = vk_tex_clear_ex,
+    .tex_blit               = vk_tex_blit,
+    .tex_upload             = vk_tex_upload,
+    .tex_download           = vk_tex_download,
+    .tex_poll               = vk_tex_poll,
+    .tex_export             = vk_tex_export,
+    .buf_create             = vk_buf_create,
+    .buf_destroy            = vk_buf_deref,
+    .buf_write              = vk_buf_write,
+    .buf_read               = vk_buf_read,
+    .buf_copy               = vk_buf_copy,
+    .buf_export             = vk_buf_export,
+    .buf_poll               = vk_buf_poll,
+    .desc_namespace         = vk_desc_namespace,
+    .pass_create            = vk_pass_create,
+    .pass_destroy           = vk_pass_destroy,
+    .pass_run               = vk_pass_run,
+    .sync_create            = vk_sync_create,
+    .sync_destroy           = vk_sync_deref, // destruction is reference counted
+    .timer_create           = vk_timer_create,
+    .timer_destroy          = vk_timer_destroy,
+    .timer_query            = vk_timer_query,
+    .gpu_flush              = vk_gpu_flush,
+    .gpu_finish             = vk_gpu_finish,
+    .gpu_is_failed          = vk_gpu_is_failed,
+};
diff --git a/src/vulkan/gpu.h b/src/vulkan/gpu.h
new file mode 100644
index 0000000..041de13
--- /dev/null
+++ b/src/vulkan/gpu.h
@@ -0,0 +1,175 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+#include "command.h"
+#include "formats.h"
+#include "malloc.h"
+#include "utils.h"
+
+#include "../gpu.h"
+#include "../glsl/spirv.h"
+#include "../pl_thread.h"
+
+pl_gpu pl_gpu_create_vk(struct vk_ctx *vk);
+
+// Detaches the currently recording command from the GPU and returns it, so
+// the caller can do custom vk_cmd_ calls on it. If it is not a graphics command
+// it is submitted and a new one is begun. The caller must submit the result.
+struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu);
+
+// Print memory usage statistics of the Vulkan allocator at the given log level
+void pl_vk_print_heap(pl_gpu, enum pl_log_level);
+
+// --- pl_gpu internal structs and helpers
+
+struct pl_fmt_vk { // private data of a pl_fmt (retrieved with PL_PRIV)
+    const struct vk_format *vk_fmt; // Vulkan-side description of the format
+    bool blit_emulated; // NOTE(review): presumably "blits need emulation" - confirm
+};
+
+enum queue_type { // kind of queue requested when beginning a command
+    GRAPHICS,
+    COMPUTE,
+    TRANSFER,
+    ANY, // NOTE(review): presumably "no preference" - confirm in _begin_cmd
+};
+
+struct pl_vk { // private data of a Vulkan pl_gpu (retrieved with PL_PRIV)
+    struct pl_gpu_fns impl;
+    struct vk_ctx *vk; // Vulkan context: device, entry points, allocator
+    pl_spirv spirv; // GLSL -> SPIR-V compiler, used by vk_compile_glsl
+
+    // Some additional cached device limits and features checks
+    uint32_t max_push_descriptors;
+    size_t min_texel_alignment;
+
+    // The "currently recording" command. This will be queued and replaced by
+    // a new command every time we need to "switch" between queue families.
+    pl_mutex recording; // held while reading or replacing `cmd`
+    struct vk_cmd *cmd;
+    pl_timer cmd_timer;
+
+    // Array of VkSamplers for every combination of sample/address modes
+    VkSampler samplers[PL_TEX_SAMPLE_MODE_COUNT][PL_TEX_ADDRESS_MODE_COUNT];
+
+    // To avoid spamming warnings
+    bool warned_modless;
+};
+
+struct vk_cmd *_begin_cmd(pl_gpu, enum queue_type, const char *label, pl_timer);
+bool _end_cmd(pl_gpu, struct vk_cmd **, bool submit);
+
+#define CMD_BEGIN(type)              _begin_cmd(gpu, type, __func__, NULL)
+#define CMD_BEGIN_TIMED(type, timer) _begin_cmd(gpu, type, __func__, timer)
+#define CMD_FINISH(cmd) _end_cmd(gpu, cmd, false) // all four use the caller's `gpu`
+#define CMD_SUBMIT(cmd) _end_cmd(gpu, cmd, true) // same, but with submit = true
+
+// Helper to fire a callback the next time the `pl_gpu` is in an idle state
+//
+// Use this instead of `vk_dev_callback` when you need to clean up after
+// resources that might possibly still be in use by the `pl_gpu` at the time of
+// creating the callback.
+void vk_gpu_idle_callback(pl_gpu, vk_cb, const void *priv, const void *arg);
+
+struct pl_tex_vk { // Vulkan-specific state of a texture
+    pl_rc_t rc; // reference count
+    bool external_img; // NOTE(review): presumably an image we did not create - confirm
+    enum queue_type transfer_queue;
+    VkImageType type;
+    VkImage img;
+    VkImageAspectFlags aspect;
+    struct vk_memslice mem;
+    // cached properties
+    VkFormat img_fmt;
+    VkImageUsageFlags usage_flags;
+    // for sampling
+    VkImageView view;
+    // for rendering
+    VkFramebuffer framebuffer;
+    // for vk_tex_upload/download fallback code
+    pl_fmt texel_fmt;
+    // for planar textures (as a convenience)
+    int num_planes;
+    struct pl_tex_vk *planes[4];
+
+    // synchronization and current state (planes only)
+    struct vk_sem sem;
+    VkImageLayout layout;
+    PL_ARRAY(pl_vulkan_sem) ext_deps; // external semaphores, not owned by the pl_tex
+    pl_sync ext_sync; // indicates an exported image
+    uint32_t qf; // last queue family to access this texture (for barriers)
+    bool may_invalidate;
+    bool held; // NOTE(review): semantics not visible in this header - confirm
+};
+
+pl_tex vk_tex_create(pl_gpu, const struct pl_tex_params *);
+void vk_tex_deref(pl_gpu, pl_tex); // installed as pl_gpu_fns.tex_destroy
+void vk_tex_invalidate(pl_gpu, pl_tex);
+void vk_tex_clear_ex(pl_gpu, pl_tex, const union pl_clear_color);
+void vk_tex_blit(pl_gpu, const struct pl_tex_blit_params *);
+bool vk_tex_upload(pl_gpu, const struct pl_tex_transfer_params *);
+bool vk_tex_download(pl_gpu, const struct pl_tex_transfer_params *);
+bool vk_tex_poll(pl_gpu, pl_tex, uint64_t timeout);
+bool vk_tex_export(pl_gpu, pl_tex, pl_sync);
+void vk_tex_barrier(pl_gpu, struct vk_cmd *, pl_tex, VkPipelineStageFlags2,
+                    VkAccessFlags2, VkImageLayout, uint32_t qf); // qf: queue family
+
+struct pl_buf_vk { // private data of a pl_buf (retrieved with PL_PRIV)
+    pl_rc_t rc; // reference count; pending commands hold extra references
+    struct vk_memslice mem; // backing memory, obtained from vk_malloc_slice
+    enum queue_type update_queue; // queue used by vk_buf_write / vk_buf_copy
+    VkBufferView view; // for texel buffers
+
+    // synchronization and current state
+    struct vk_sem sem;
+    bool exported; // last barrier released the buffer to the external queue family
+    bool needs_flush; // host wrote through the mapping since the last barrier
+};
+
+pl_buf vk_buf_create(pl_gpu, const struct pl_buf_params *); // NULL on failure
+void vk_buf_deref(pl_gpu, pl_buf); // accepts NULL; frees on the last reference
+void vk_buf_write(pl_gpu, pl_buf, size_t offset, const void *src, size_t size);
+bool vk_buf_read(pl_gpu, pl_buf, size_t offset, void *dst, size_t size);
+void vk_buf_copy(pl_gpu, pl_buf dst, size_t dst_offset,
+                 pl_buf src, size_t src_offset, size_t size);
+bool vk_buf_export(pl_gpu, pl_buf);
+bool vk_buf_poll(pl_gpu, pl_buf, uint64_t timeout); // true = still in use
+
+// Helper to ease buffer barrier creation. (`offset` is relative to pl_buf)
+void vk_buf_barrier(pl_gpu, struct vk_cmd *, pl_buf, VkPipelineStageFlags2,
+                    VkAccessFlags2, size_t offset, size_t size, bool export);
+
+// Make GPU writes to a buffer visible to the host, if the host can access it
+void vk_buf_flush(pl_gpu, struct vk_cmd *, pl_buf, size_t offset, size_t size);
+
+struct pl_pass_vk; // opaque here; defined in gpu_pass.c
+
+int vk_desc_namespace(pl_gpu, enum pl_desc_type); // always returns 0
+pl_pass vk_pass_create(pl_gpu, const struct pl_pass_params *);
+void vk_pass_destroy(pl_gpu, pl_pass); // destruction is deferred until idle
+void vk_pass_run(pl_gpu, const struct pl_pass_run_params *);
+
+struct pl_sync_vk { // Vulkan-specific state of a pl_sync
+    pl_rc_t rc; // reference count
+    VkSemaphore wait;
+    VkSemaphore signal;
+};
+
+void vk_sync_deref(pl_gpu, pl_sync); // installed as pl_gpu_fns.sync_destroy
diff --git a/src/vulkan/gpu_buf.c b/src/vulkan/gpu_buf.c
new file mode 100644
index 0000000..2f317bc
--- /dev/null
+++ b/src/vulkan/gpu_buf.c
@@ -0,0 +1,470 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+
+void vk_buf_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
+                    VkPipelineStageFlags2 stage, VkAccessFlags2 access,
+                    size_t offset, size_t size, bool export)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+    pl_assert(!export || !buf_vk->exported); // can't re-export exported buffers
+    pl_rc_ref(&buf_vk->rc); // released by the vk_cmd_callback registered below
+
+    bool needs_flush = buf_vk->needs_flush || buf->params.host_mapped ||
+                       buf->params.import_handle == PL_HANDLE_HOST_PTR;
+    bool noncoherent = buf_vk->mem.data && !buf_vk->mem.coherent;
+    if (needs_flush && noncoherent) { // push host writes out of the mapping
+        VK(vk->FlushMappedMemoryRanges(vk->dev, 1, &(struct VkMappedMemoryRange) {
+            .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+            .memory = buf_vk->mem.vkmem,
+            .offset = buf_vk->mem.map_offset,
+            .size = buf_vk->mem.map_size,
+        }));
+
+        // Just ignore errors, not much we can do about them other than
+        // logging them and moving on...
+    error: ;
+    }
+
+    struct vk_sync_scope last;
+    last = vk_sem_barrier(cmd, &buf_vk->sem, stage, access, export);
+
+    // CONCURRENT buffers require transitioning to/from IGNORED, EXCLUSIVE
+    // buffers require transitioning to/from the concrete QF index
+    uint32_t qf = vk->pools.num > 1 ? VK_QUEUE_FAMILY_IGNORED : cmd->pool->qf;
+    uint32_t src_qf = buf_vk->exported ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
+    uint32_t dst_qf = export ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
+
+    if (last.access || src_qf != dst_qf) { // otherwise there is nothing to order
+        vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+            .bufferMemoryBarrierCount = 1,
+            .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+                .srcStageMask = last.stage,
+                .srcAccessMask = last.access,
+                .dstStageMask = stage,
+                .dstAccessMask = access,
+                .srcQueueFamilyIndex = src_qf,
+                .dstQueueFamilyIndex = dst_qf,
+                .buffer = buf_vk->mem.buf,
+                .offset = buf_vk->mem.offset + offset,
+                .size = size,
+            },
+        });
+    }
+
+    buf_vk->needs_flush = false; // pending host writes were handled above
+    buf_vk->exported = export; // remembered as the source QF of the next barrier
+    vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, gpu, buf);
+}
+
+void vk_buf_deref(pl_gpu gpu, pl_buf buf)
+{
+    // Drop one reference (NULL is accepted); the last one destroys the buffer
+    if (!buf)
+        return;
+
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+    if (!pl_rc_deref(&buf_vk->rc))
+        return; // still referenced elsewhere
+    struct pl_vk *priv = PL_PRIV(gpu);
+    struct vk_ctx *vk = priv->vk;
+    vk->DestroyBufferView(vk->dev, buf_vk->view, PL_VK_ALLOC);
+    vk_malloc_free(vk->ma, &buf_vk->mem);
+    pl_free((void *) buf);
+}
+
+pl_buf vk_buf_create(pl_gpu gpu, const struct pl_buf_params *params)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+
+    struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_vk);
+    buf->params = *params;
+    buf->params.initial_data = NULL; // not retained; uploaded once further below
+
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+    pl_rc_init(&buf_vk->rc);
+
+    struct vk_malloc_params mparams = {
+        .reqs = {
+            .size = PL_ALIGN2(params->size, 4), // for vk_buf_write
+            .memoryTypeBits = UINT32_MAX,
+            .alignment = 1,
+        },
+        // these are always set, because `vk_buf_copy` can always be used
+        .buf_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+                     VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+        .export_handle = params->export_handle,
+        .import_handle = params->import_handle,
+        .shared_mem = params->shared_mem,
+        .debug_tag = params->debug_tag,
+    };
+
+    // Mandatory/optimal buffer offset alignment
+    VkDeviceSize *align = &mparams.reqs.alignment;
+    VkDeviceSize extra_align = vk->props.limits.optimalBufferCopyOffsetAlignment;
+
+    // Try and align all buffers to the minimum texel alignment, to make sure
+    // tex_upload/tex_download always gets aligned buffer copies if possible
+    extra_align = pl_lcm(extra_align, p->min_texel_alignment);
+
+    enum pl_buf_mem_type mem_type = params->memory_type;
+    bool is_texel = false;
+
+    if (params->uniform) {
+        mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+        *align = pl_lcm(*align, vk->props.limits.minUniformBufferOffsetAlignment);
+        mem_type = PL_BUF_MEM_DEVICE;
+        if (params->format) {
+            mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
+            is_texel = true;
+        }
+    }
+
+    if (params->storable) {
+        mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+        *align = pl_lcm(*align, vk->props.limits.minStorageBufferOffsetAlignment);
+        buf_vk->update_queue = COMPUTE;
+        mem_type = PL_BUF_MEM_DEVICE;
+        if (params->format) {
+            mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
+            is_texel = true;
+        }
+    }
+
+    if (is_texel) {
+        *align = pl_lcm(*align, vk->props.limits.minTexelBufferOffsetAlignment);
+        *align = pl_lcm(*align, params->format->texel_size);
+    }
+
+    if (params->drawable) {
+        mparams.buf_usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
+                             VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
+        mem_type = PL_BUF_MEM_DEVICE;
+    }
+
+    if (params->host_writable || params->initial_data) {
+        // Buffers should be written using mapped memory if possible
+        mparams.optimal = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+        // Use the transfer queue for updates on very large buffers (1 MB)
+        if (params->size > 1024*1024)
+            buf_vk->update_queue = TRANSFER;
+    }
+
+    if (params->host_mapped || params->host_readable) {
+        mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+
+        if (params->size > 1024) {
+            // Prefer cached memory for large buffers (1 kB) which may be read
+            // from, because uncached reads are extremely slow
+            mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+        }
+    }
+
+    switch (mem_type) {
+    case PL_BUF_MEM_AUTO:
+        // We generally prefer VRAM since it's faster than RAM, but any number
+        // of other requirements could potentially exclude it, so just mark it
+        // as optimal by default.
+        if (!(mparams.optimal & VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
+            mparams.optimal |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+        break;
+    case PL_BUF_MEM_DEVICE:
+        // Force device local memory.
+        mparams.required |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+        break;
+    case PL_BUF_MEM_HOST:
+        // This isn't a true guarantee, but actually trying to restrict the
+        // device-local bit locks out all memory heaps on iGPUs. Requiring
+        // the memory be host-mapped is the easiest compromise.
+        mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+        mparams.optimal  |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+        break;
+    case PL_BUF_MEM_TYPE_COUNT:
+        pl_unreachable();
+    }
+
+    if (params->import_handle) {
+        size_t offset = params->shared_mem.offset;
+        if (PL_ALIGN(offset, *align) != offset) {
+            PL_ERR(gpu, "Imported memory offset %zu violates minimum alignment "
+                   "requirement of enabled usage flags (%zu)!",
+                   offset, (size_t) *align);
+            goto error;
+        }
+    } else {
+        *align = pl_lcm(*align, extra_align); // optional; only for our own memory
+    }
+
+    if (!vk_malloc_slice(vk->ma, &buf_vk->mem, &mparams))
+        goto error;
+
+    if (params->host_mapped)
+        buf->data = buf_vk->mem.data;
+
+    if (params->export_handle) {
+        buf->shared_mem = buf_vk->mem.shared_mem;
+        buf->shared_mem.drm_format_mod = DRM_FORMAT_MOD_LINEAR;
+        buf_vk->exported = true; // first vk_buf_barrier acquires from EXTERNAL
+    }
+
+    if (is_texel) {
+        struct pl_fmt_vk *fmtp = PL_PRIV(params->format);
+        VkBufferViewCreateInfo vinfo = {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
+            .buffer = buf_vk->mem.buf,
+            .format = PL_DEF(fmtp->vk_fmt->bfmt, fmtp->vk_fmt->tfmt),
+            .offset = buf_vk->mem.offset,
+            .range = buf_vk->mem.size,
+        };
+
+        VK(vk->CreateBufferView(vk->dev, &vinfo, PL_VK_ALLOC, &buf_vk->view));
+        PL_VK_NAME(BUFFER_VIEW, buf_vk->view, PL_DEF(params->debug_tag, "texel"));
+    }
+
+    if (params->initial_data)
+        vk_buf_write(gpu, buf, 0, params->initial_data, params->size);
+
+    return buf;
+
+error:
+    vk_buf_deref(gpu, buf); // drops the reference from pl_rc_init above
+    return NULL;
+}
+
+static void invalidate_buf(pl_gpu gpu, pl_buf buf)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+    if (buf_vk->mem.data && !buf_vk->mem.coherent) { // mapped, but not coherent
+        VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
+            .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+            .memory = buf_vk->mem.vkmem,
+            .offset = buf_vk->mem.map_offset,
+            .size = buf_vk->mem.map_size,
+        }));
+    }
+
+    // Ignore errors (after logging), nothing useful we can do anyway
+error: ;
+    vk_buf_deref(gpu, buf); // drops the reference taken in vk_buf_flush
+}
+
+void vk_buf_flush(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
+                  size_t offset, size_t size)
+{
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+    // We need to perform a flush if the host is capable of reading back from
+    // the buffer, or if we intend to overwrite it using mapped memory
+    bool can_read = buf->params.host_readable;
+    bool can_write = buf_vk->mem.data && buf->params.host_writable;
+    if (buf->params.host_mapped || buf->params.import_handle == PL_HANDLE_HOST_PTR)
+        can_read = can_write = true;
+
+    if (!can_read && !can_write)
+        return; // the host never touches this buffer, nothing to do
+
+    vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+            .srcStageMask = buf_vk->sem.write.stage,
+            .srcAccessMask = buf_vk->sem.write.access,
+            .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
+            .dstAccessMask = (can_read ? VK_ACCESS_2_HOST_READ_BIT : 0)
+                           | (can_write ? VK_ACCESS_2_HOST_WRITE_BIT : 0),
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .buffer = buf_vk->mem.buf,
+            .offset = buf_vk->mem.offset + offset,
+            .size = size,
+        },
+    });
+
+    // We need to hold on to the buffer until this barrier completes
+    vk_cmd_callback(cmd, (vk_cb) invalidate_buf, gpu, buf);
+    pl_rc_ref(&buf_vk->rc); // dropped again at the end of invalidate_buf
+}
+
+bool vk_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+    // Opportunistically check if we can re-use this buffer without flush
+    vk_poll_commands(vk, 0);
+    if (pl_rc_count(&buf_vk->rc) == 1)
+        return false; // no command holds a reference any more: not in use
+
+    // Otherwise, we're forced to submit any queued command so that the
+    // user is guaranteed to see progress eventually, even if they call
+    // this in a tight loop
+    CMD_SUBMIT(NULL);
+    vk_poll_commands(vk, timeout);
+
+    return pl_rc_count(&buf_vk->rc) > 1; // true while commands still hold refs
+}
+
+void vk_buf_write(pl_gpu gpu, pl_buf buf, size_t offset,
+                  const void *data, size_t size)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+    // Writes `size` bytes from `data` into `buf`, starting at `offset`.
+    // For host-mapped buffers, we can just directly memcpy the buffer contents.
+    // Otherwise, we can update the buffer from the GPU using a command buffer.
+    if (buf_vk->mem.data) {
+        // ensure no queued operations
+        while (vk_buf_poll(gpu, buf, UINT64_MAX))
+            ; // do nothing
+
+        uintptr_t addr = (uintptr_t) buf_vk->mem.data + offset;
+        memcpy((void *) addr, data, size);
+        buf_vk->needs_flush = true; // picked up by the next vk_buf_barrier
+    } else {
+        struct vk_cmd *cmd = CMD_BEGIN(buf_vk->update_queue);
+        if (!cmd) {
+            PL_ERR(gpu, "Failed updating buffer!");
+            return;
+        }
+
+        vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+                       VK_ACCESS_2_TRANSFER_WRITE_BIT, offset, size, false);
+
+        // Vulkan requires `size` to be a multiple of 4, so we need to make
+        // sure to handle the end separately if the original data is not
+        const size_t max_transfer = 64 * 1024;
+        size_t size_rem = size % 4;
+        size_t size_base = size - size_rem;
+        VkDeviceSize buf_offset = buf_vk->mem.offset + offset;
+
+        if (size_base > max_transfer) {
+            PL_TRACE(gpu, "Using multiple vkCmdUpdateBuffer calls to upload "
+                     "large buffer. Consider using buffer-buffer transfers "
+                     "instead!");
+        }
+
+        for (size_t xfer = 0; xfer < size_base; xfer += max_transfer) {
+            vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf,
+                                buf_offset + xfer, // last chunk may be shorter:
+                                PL_MIN(size_base - xfer, max_transfer),
+                                (const uint8_t *) data + xfer);
+        }
+
+        if (size_rem) {
+            uint8_t tail[4] = {0}; // zero-padded; buffer size is rounded up to 4
+            memcpy(tail, (const uint8_t *) data + size_base, size_rem);
+            vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, buf_offset + size_base,
+                                sizeof(tail), tail);
+        }
+
+        pl_assert(!buf->params.host_readable); // no flush needed due to this
+        CMD_FINISH(&cmd);
+    }
+}
+
+bool vk_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, size_t size)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+    pl_assert(buf_vk->mem.data); // reads always go through the host mapping
+
+    if (vk_buf_poll(gpu, buf, 0) && buf_vk->sem.write.sync.sem) {
+        // ensure no more queued writes
+        VK(vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) {
+            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+            .semaphoreCount = 1,
+            .pSemaphores = &buf_vk->sem.write.sync.sem,
+            .pValues = &buf_vk->sem.write.sync.value,
+        }, UINT64_MAX));
+
+        // process callbacks
+        vk_poll_commands(vk, 0);
+    }
+
+    uintptr_t addr = (uintptr_t) buf_vk->mem.data + (size_t) offset;
+    memcpy(dest, (void *) addr, size);
+    return true;
+
+error: // reached through VK() when waiting on the semaphore fails
+    return false;
+}
+
+void vk_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
+                 pl_buf src, size_t src_offset, size_t size)
+{
+    struct pl_buf_vk *dst_vk = PL_PRIV(dst);
+    struct pl_buf_vk *src_vk = PL_PRIV(src);
+    struct pl_vk *priv = PL_PRIV(gpu);
+    struct vk_ctx *vk = priv->vk;
+
+    // The copy is recorded on the queue type chosen for the destination buffer
+    struct vk_cmd *cmd = CMD_BEGIN(dst_vk->update_queue);
+    if (!cmd) {
+        PL_ERR(gpu, "Failed copying buffer!");
+        return;
+    }
+
+    // Order the copy against earlier accesses to both affected ranges
+    vk_buf_barrier(gpu, cmd, dst, VK_PIPELINE_STAGE_2_COPY_BIT,
+                   VK_ACCESS_2_TRANSFER_WRITE_BIT, dst_offset, size, false);
+    vk_buf_barrier(gpu, cmd, src, VK_PIPELINE_STAGE_2_COPY_BIT,
+                   VK_ACCESS_2_TRANSFER_READ_BIT, src_offset, size, false);
+
+    vk->CmdCopyBuffer(cmd->buf, src_vk->mem.buf, dst_vk->mem.buf,
+                      1, &(VkBufferCopy) {
+        .srcOffset = src_vk->mem.offset + src_offset,
+        .dstOffset = dst_vk->mem.offset + dst_offset,
+        .size = size,
+    });
+
+    vk_buf_flush(gpu, cmd, dst, dst_offset, size);
+    CMD_FINISH(&cmd);
+}
+
+bool vk_buf_export(pl_gpu gpu, pl_buf buf)
+{
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+    if (buf_vk->exported)
+        return true; // already released to the external queue family
+
+    struct vk_cmd *cmd = CMD_BEGIN(ANY);
+    if (!cmd) {
+        PL_ERR(gpu, "Failed exporting buffer!");
+        return false;
+    }
+
+    // For the queue family ownership transfer, we can ignore all pipeline
+    // stages since the synchronization via fences/semaphores is required
+    vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_NONE, 0, 0,
+                   buf->params.size, true);
+
+    // CMD_SUBMIT ends the command with submit = true, unlike CMD_FINISH
+    return CMD_SUBMIT(&cmd);
+}
diff --git a/src/vulkan/gpu_pass.c b/src/vulkan/gpu_pass.c
new file mode 100644
index 0000000..5ffe77d
--- /dev/null
+++ b/src/vulkan/gpu_pass.c
@@ -0,0 +1,964 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "cache.h"
+#include "glsl/spirv.h"
+
+// For pl_pass.priv
+struct pl_pass_vk {
+    // Pipeline / render pass
+    VkPipeline base; // NOTE(review): presumably the derivable parent of `pipe` - confirm
+    VkPipeline pipe;
+    VkPipelineLayout pipeLayout;
+    VkRenderPass renderPass;
+    // Descriptor set (bindings)
+    bool use_pushd;
+    VkDescriptorSetLayout dsLayout;
+    VkDescriptorPool dsPool;
+    // To keep track of which descriptor sets are and aren't available, we
+    // allocate a fixed number and use a bitmask of all available sets.
+    VkDescriptorSet dss[16];
+    uint16_t dmask; // one bit per entry of `dss`
+
+    // For recompilation
+    VkVertexInputAttributeDescription *attrs;
+    VkPipelineCache cache;
+    VkShaderModule vert; // vertex stage module (raster passes)
+    VkShaderModule shader; // fragment stage module in raster pipelines
+
+    // For updating
+    VkWriteDescriptorSet *dswrite;
+    VkDescriptorImageInfo *dsiinfo;
+    VkDescriptorBufferInfo *dsbinfo;
+    VkSpecializationInfo specInfo;
+    size_t spec_size;
+};
+
+int vk_desc_namespace(pl_gpu gpu, enum pl_desc_type type)
+{
+    return 0; // every descriptor type maps to the same namespace
+}
+
+static void pass_destroy_cb(pl_gpu gpu, pl_pass pass)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+
+    vk->DestroyPipeline(vk->dev, pass_vk->pipe, PL_VK_ALLOC);
+    vk->DestroyPipeline(vk->dev, pass_vk->base, PL_VK_ALLOC);
+    vk->DestroyRenderPass(vk->dev, pass_vk->renderPass, PL_VK_ALLOC);
+    vk->DestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, PL_VK_ALLOC);
+    vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC);
+    vk->DestroyDescriptorPool(vk->dev, pass_vk->dsPool, PL_VK_ALLOC);
+    vk->DestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, PL_VK_ALLOC);
+    vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC);
+    vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC);
+
+    pl_free((void *) pass); // finally release the pass object itself
+}
+
+void vk_pass_destroy(pl_gpu gpu, pl_pass pass)
+{
+    vk_gpu_idle_callback(gpu, (vk_cb) pass_destroy_cb, gpu, pass); // deferred
+}
+
+static const VkDescriptorType dsType[] = { // indexed by enum pl_desc_type
+    [PL_DESC_SAMPLED_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+    [PL_DESC_STORAGE_IMG] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+    [PL_DESC_BUF_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+    [PL_DESC_BUF_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+    [PL_DESC_BUF_TEXEL_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+    [PL_DESC_BUF_TEXEL_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER,
+};
+
+static VkResult vk_compile_glsl(pl_gpu gpu, void *alloc,
+                                enum glsl_shader_stage stage,
+                                const char *shader,
+                                pl_cache_obj *out_spirv)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    pl_cache cache = pl_gpu_cache(gpu);
+    uint64_t key = CACHE_KEY_SPIRV;
+    if (cache) { // skip computing the key when there is no cache to consult
+        pl_hash_merge(&key, p->spirv->signature);
+        pl_hash_merge(&key, pl_str0_hash(shader));
+        out_spirv->key = key;
+        if (pl_cache_get(cache, out_spirv)) {
+            PL_DEBUG(gpu, "Re-using cached SPIR-V object 0x%"PRIx64, key);
+            return VK_SUCCESS;
+        }
+    }
+
+    pl_clock_t start = pl_clock_now(); // cache miss (or no cache): compile now
+    pl_str spirv = pl_spirv_compile_glsl(p->spirv, alloc, gpu->glsl, stage, shader);
+    pl_log_cpu_time(gpu->log, start, pl_clock_now(), "translating SPIR-V");
+    out_spirv->data = spirv.buf;
+    out_spirv->size = spirv.len;
+    out_spirv->free = pl_free;
+    return spirv.len ? VK_SUCCESS : VK_ERROR_INITIALIZATION_FAILED; // empty = failed
+}
+
+static const VkShaderStageFlags stageFlags[] = { // indexed by pass type
+    [PL_PASS_RASTER]  = VK_SHADER_STAGE_FRAGMENT_BIT |
+                        VK_SHADER_STAGE_VERTEX_BIT,
+    [PL_PASS_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT,
+};
+
+static void destroy_pipeline(struct vk_ctx *vk, void *pipeline)
+{
+    vk->DestroyPipeline(vk->dev, vk_unwrap_handle(pipeline), PL_VK_ALLOC); // `pipeline` is a wrapped handle
+}
+
+static VkResult vk_recreate_pipelines(struct vk_ctx *vk, pl_pass pass,
+                                      bool derivable, VkPipeline base,
+                                      VkPipeline *out_pipe)
+{
+    struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+    const struct pl_pass_params *params = &pass->params;
+
+    // The old pipeline might still be in use, so we have to destroy it
+    // asynchronously with a device idle callback
+    if (*out_pipe) {
+        // We don't need to use `vk_gpu_idle_callback` because the only command
+        // that can access a VkPipeline, `vk_pass_run`, always flushes `p->cmd`.
+        vk_dev_callback(vk, (vk_cb) destroy_pipeline, vk, vk_wrap_handle(*out_pipe));
+        *out_pipe = VK_NULL_HANDLE;
+    }
+
+    VkPipelineCreateFlags flags = 0;
+    if (derivable)
+        flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT;
+    if (base)
+        flags |= VK_PIPELINE_CREATE_DERIVATIVE_BIT;
+
+    const VkSpecializationInfo *specInfo = &pass_vk->specInfo;
+    if (!specInfo->dataSize)
+        specInfo = NULL;
+
+    switch (params->type) {
+    case PL_PASS_RASTER: {
+        static const VkBlendFactor blendFactors[] = {
+            [PL_BLEND_ZERO]                = VK_BLEND_FACTOR_ZERO,
+            [PL_BLEND_ONE]                 = VK_BLEND_FACTOR_ONE,
+            [PL_BLEND_SRC_ALPHA]           = VK_BLEND_FACTOR_SRC_ALPHA,
+            [PL_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA,
+        };
+
+        VkPipelineColorBlendAttachmentState blendState = {
+            .colorBlendOp = VK_BLEND_OP_ADD,
+            .alphaBlendOp = VK_BLEND_OP_ADD,
+            .colorWriteMask = VK_COLOR_COMPONENT_R_BIT |
+                              VK_COLOR_COMPONENT_G_BIT |
+                              VK_COLOR_COMPONENT_B_BIT |
+                              VK_COLOR_COMPONENT_A_BIT,
+        };
+
+        const struct pl_blend_params *blend = params->blend_params;
+        if (blend) {
+            blendState.blendEnable = true;
+            blendState.srcColorBlendFactor = blendFactors[blend->src_rgb];
+            blendState.dstColorBlendFactor = blendFactors[blend->dst_rgb];
+            blendState.srcAlphaBlendFactor = blendFactors[blend->src_alpha];
+            blendState.dstAlphaBlendFactor = blendFactors[blend->dst_alpha];
+        }
+
+        static const VkPrimitiveTopology topologies[PL_PRIM_TYPE_COUNT] = {
+            [PL_PRIM_TRIANGLE_LIST]  = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
+            [PL_PRIM_TRIANGLE_STRIP] = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
+        };
+
+        VkGraphicsPipelineCreateInfo cinfo = {
+            .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+            .flags = flags,
+            .stageCount = 2,
+            .pStages = (VkPipelineShaderStageCreateInfo[]) {
+                {
+                    .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+                    .stage = VK_SHADER_STAGE_VERTEX_BIT,
+                    .module = pass_vk->vert,
+                    .pName = "main",
+                }, {
+                    .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+                    .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
+                    .module = pass_vk->shader,
+                    .pName = "main",
+                    .pSpecializationInfo = specInfo,
+                }
+            },
+            .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) {
+                .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+                .vertexBindingDescriptionCount = 1,
+                .pVertexBindingDescriptions = &(VkVertexInputBindingDescription) {
+                    .binding = 0,
+                    .stride = params->vertex_stride,
+                    .inputRate = VK_VERTEX_INPUT_RATE_VERTEX,
+                },
+                .vertexAttributeDescriptionCount = params->num_vertex_attribs,
+                .pVertexAttributeDescriptions = pass_vk->attrs,
+            },
+            .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
+                .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+                .topology = topologies[params->vertex_type],
+            },
+            .pViewportState = &(VkPipelineViewportStateCreateInfo) {
+                .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+                .viewportCount = 1,
+                .scissorCount = 1,
+            },
+            .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
+                .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+                .polygonMode = VK_POLYGON_MODE_FILL,
+                .cullMode = VK_CULL_MODE_NONE,
+                .lineWidth = 1.0f,
+            },
+            .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
+                .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+                .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
+            },
+            .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) {
+                .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+                .attachmentCount = 1,
+                .pAttachments = &blendState,
+            },
+            .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
+                .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+                .dynamicStateCount = 2,
+                .pDynamicStates = (VkDynamicState[]){
+                    VK_DYNAMIC_STATE_VIEWPORT,
+                    VK_DYNAMIC_STATE_SCISSOR,
+                },
+            },
+            .layout = pass_vk->pipeLayout,
+            .renderPass = pass_vk->renderPass,
+            .basePipelineHandle = base,
+            .basePipelineIndex = -1,
+        };
+
+        return vk->CreateGraphicsPipelines(vk->dev, pass_vk->cache, 1, &cinfo,
+                                           PL_VK_ALLOC, out_pipe);
+    }
+
+    case PL_PASS_COMPUTE: {
+        VkComputePipelineCreateInfo cinfo = {
+            .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+            .flags = flags,
+            .stage = {
+                .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+                .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+                .module = pass_vk->shader,
+                .pName = "main",
+                .pSpecializationInfo = specInfo,
+            },
+            .layout = pass_vk->pipeLayout,
+            .basePipelineHandle = base,
+            .basePipelineIndex = -1,
+        };
+
+        return vk->CreateComputePipelines(vk->dev, pass_vk->cache, 1, &cinfo,
+                                          PL_VK_ALLOC, out_pipe);
+    }
+
+    case PL_PASS_INVALID:
+    case PL_PASS_TYPE_COUNT:
+        break;
+    }
+
+    pl_unreachable();
+}
+// Create a pass from `params` (layouts, shader modules, pipeline); returns NULL on failure
+pl_pass vk_pass_create(pl_gpu gpu, const struct pl_pass_params *params)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    bool success = false;
+
+    struct pl_pass_t *pass = pl_zalloc_obj(NULL, pass, struct pl_pass_vk);
+    pass->params = pl_pass_params_copy(pass, params);
+
+    struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+    pass_vk->dmask = -1; // all descriptors available
+
+    // temporary allocations (released by the pl_free(tmp) on the way out)
+    void *tmp = pl_tmp(NULL);
+
+    int num_desc = params->num_descriptors;
+    if (!num_desc)
+        goto no_descriptors; // nothing to lay out, skip descriptor set setup
+    if (num_desc > vk->props.limits.maxPerStageResources) {
+        PL_ERR(gpu, "Pass with %d descriptors exceeds the maximum number of "
+               "per-stage resources %" PRIu32"!",
+               num_desc, vk->props.limits.maxPerStageResources);
+        goto error;
+    }
+    // One scratch slot per descriptor, rewritten on each run by vk_update_descriptor
+    pass_vk->dswrite = pl_calloc(pass, num_desc, sizeof(VkWriteDescriptorSet));
+    pass_vk->dsiinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorImageInfo));
+    pass_vk->dsbinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorBufferInfo));
+
+#define NUM_DS (PL_ARRAY_SIZE(pass_vk->dss))
+
+    int dsSize[PL_DESC_TYPE_COUNT] = {0}; // descriptor count per type
+    VkDescriptorSetLayoutBinding *bindings = pl_calloc_ptr(tmp, num_desc, bindings);
+    // Remaining per-stage budget for each kind of descriptor, consumed in the loop below
+    uint32_t max_tex = vk->props.limits.maxPerStageDescriptorSampledImages,
+             max_img = vk->props.limits.maxPerStageDescriptorStorageImages,
+             max_ubo = vk->props.limits.maxPerStageDescriptorUniformBuffers,
+             max_ssbo = vk->props.limits.maxPerStageDescriptorStorageBuffers;
+    // Texel buffers share the sampled image / storage image budgets respectively
+    uint32_t *dsLimits[PL_DESC_TYPE_COUNT] = {
+        [PL_DESC_SAMPLED_TEX] = &max_tex,
+        [PL_DESC_STORAGE_IMG] = &max_img,
+        [PL_DESC_BUF_UNIFORM] = &max_ubo,
+        [PL_DESC_BUF_STORAGE] = &max_ssbo,
+        [PL_DESC_BUF_TEXEL_UNIFORM] = &max_tex,
+        [PL_DESC_BUF_TEXEL_STORAGE] = &max_img,
+    };
+
+    for (int i = 0; i < num_desc; i++) {
+        struct pl_desc *desc = &params->descriptors[i];
+        if (!(*dsLimits[desc->type])--) { // budget was already used up
+            PL_ERR(gpu, "Pass exceeds the maximum number of per-stage "
+                   "descriptors of type %u!", (unsigned) desc->type);
+            goto error;
+        }
+
+        dsSize[desc->type]++;
+        bindings[i] = (VkDescriptorSetLayoutBinding) {
+            .binding = desc->binding,
+            .descriptorType = dsType[desc->type],
+            .descriptorCount = 1,
+            .stageFlags = stageFlags[params->type],
+        };
+    }
+
+    VkDescriptorSetLayoutCreateInfo dinfo = {
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+        .pBindings = bindings,
+        .bindingCount = num_desc,
+    };
+    // Use push descriptors if max_push_descriptors is nonzero and the pass fits within it
+    if (p->max_push_descriptors && num_desc <= p->max_push_descriptors) {
+        dinfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR;
+        pass_vk->use_pushd = true;
+    } else if (p->max_push_descriptors) {
+        PL_INFO(gpu, "Pass with %d descriptors exceeds the maximum push "
+                "descriptor count (%d). Falling back to descriptor sets!",
+                num_desc, p->max_push_descriptors);
+    }
+
+    VK(vk->CreateDescriptorSetLayout(vk->dev, &dinfo, PL_VK_ALLOC,
+                                     &pass_vk->dsLayout));
+    // Without push descriptors, allocate NUM_DS real descriptor sets from a pool
+    if (!pass_vk->use_pushd) {
+        PL_ARRAY(VkDescriptorPoolSize) dsPoolSizes = {0};
+
+        for (enum pl_desc_type t = 0; t < PL_DESC_TYPE_COUNT; t++) {
+            if (dsSize[t] > 0) {
+                PL_ARRAY_APPEND(tmp, dsPoolSizes, (VkDescriptorPoolSize) {
+                    .type = dsType[t],
+                    .descriptorCount = dsSize[t] * NUM_DS,
+                });
+            }
+        }
+
+        if (dsPoolSizes.num) {
+            VkDescriptorPoolCreateInfo pinfo = {
+                .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+                .maxSets = NUM_DS,
+                .pPoolSizes = dsPoolSizes.elem,
+                .poolSizeCount = dsPoolSizes.num,
+            };
+
+            VK(vk->CreateDescriptorPool(vk->dev, &pinfo, PL_VK_ALLOC, &pass_vk->dsPool));
+            // Every set allocated from the pool uses the same layout
+            VkDescriptorSetLayout layouts[NUM_DS];
+            for (int i = 0; i < NUM_DS; i++)
+                layouts[i] = pass_vk->dsLayout;
+
+            VkDescriptorSetAllocateInfo ainfo = {
+                .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+                .descriptorPool = pass_vk->dsPool,
+                .descriptorSetCount = NUM_DS,
+                .pSetLayouts = layouts,
+            };
+
+            VK(vk->AllocateDescriptorSets(vk->dev, &ainfo, pass_vk->dss));
+        }
+    }
+
+no_descriptors: ;
+    // Specialization constants: build the map entries and size the data blob
+    bool has_spec = params->num_constants;
+    if (has_spec) {
+        PL_ARRAY(VkSpecializationMapEntry) entries = {0};
+        PL_ARRAY_RESIZE(pass, entries, params->num_constants);
+        size_t spec_size = 0;
+
+        for (int i = 0; i < params->num_constants; i++) {
+            const struct pl_constant *con = &params->constants[i];
+            size_t con_size = pl_var_type_size(con->type);
+            entries.elem[i] = (VkSpecializationMapEntry) {
+                .constantID = con->id,
+                .offset = con->offset,
+                .size = con_size,
+            };
+
+            size_t req_size = con->offset + con_size; // end of this constant in the blob
+            spec_size = PL_MAX(spec_size, req_size);
+        }
+
+        pass_vk->spec_size = spec_size;
+        pass_vk->specInfo = (VkSpecializationInfo) {
+            .mapEntryCount = params->num_constants,
+            .pMapEntries = entries.elem,
+        };
+        // Initial values are optional here; need_respec() can supply them at run time
+        if (params->constant_data) {
+            pass_vk->specInfo.pData = pl_memdup(pass, params->constant_data, spec_size);
+            pass_vk->specInfo.dataSize = spec_size;
+        }
+    }
+    // Pipeline layout: at most one descriptor set and one push constant range
+    VkPipelineLayoutCreateInfo linfo = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+        .setLayoutCount = num_desc ? 1 : 0,
+        .pSetLayouts = &pass_vk->dsLayout,
+        .pushConstantRangeCount = params->push_constants_size ? 1 : 0,
+        .pPushConstantRanges = &(VkPushConstantRange){
+            .stageFlags = stageFlags[params->type],
+            .offset = 0,
+            .size = params->push_constants_size,
+        },
+    };
+
+    VK(vk->CreatePipelineLayout(vk->dev, &linfo, PL_VK_ALLOC,
+                                &pass_vk->pipeLayout));
+    // Compile the shader stage(s) required by this pass type
+    pl_cache_obj vert = {0}, frag = {0}, comp = {0};
+    switch (params->type) {
+    case PL_PASS_RASTER: ;
+        VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_VERTEX, params->vertex_shader, &vert));
+        VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_FRAGMENT, params->glsl_shader, &frag));
+        break;
+    case PL_PASS_COMPUTE:
+        VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_COMPUTE, params->glsl_shader, &comp));
+        break;
+    case PL_PASS_INVALID:
+    case PL_PASS_TYPE_COUNT:
+        pl_unreachable();
+    }
+
+    // Use hash of generated SPIR-V as key for pipeline cache
+    const pl_cache cache = pl_gpu_cache(gpu);
+    pl_cache_obj pipecache = {0};
+    if (cache) {
+        pipecache.key = CACHE_KEY_VK_PIPE;
+        pl_hash_merge(&pipecache.key, pl_var_hash(vk->props.pipelineCacheUUID));
+        pl_hash_merge(&pipecache.key, pl_mem_hash(vert.data, vert.size));
+        pl_hash_merge(&pipecache.key, pl_mem_hash(frag.data, frag.size));
+        pl_hash_merge(&pipecache.key, pl_mem_hash(comp.data, comp.size));
+        pl_cache_get(cache, &pipecache);
+    }
+
+    if (cache || has_spec) {
+        // Don't create pipeline cache unless we either plan on caching the
+        // result of this shader to a pl_cache, or if we will possibly re-use
+        // it due to the presence of specialization constants
+        VkPipelineCacheCreateInfo pcinfo = {
+            .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,
+            .pInitialData = pipecache.data,
+            .initialDataSize = pipecache.size,
+        };
+
+        VK(vk->CreatePipelineCache(vk->dev, &pcinfo, PL_VK_ALLOC, &pass_vk->cache));
+    }
+
+    VkShaderModuleCreateInfo sinfo = {
+        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+    };
+
+    pl_clock_t start = pl_clock_now();
+    switch (params->type) {
+    case PL_PASS_RASTER: {
+        sinfo.pCode = (uint32_t *) vert.data;
+        sinfo.codeSize = vert.size;
+        VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->vert));
+        PL_VK_NAME(SHADER_MODULE, pass_vk->vert, "vertex");
+
+        sinfo.pCode = (uint32_t *) frag.data;
+        sinfo.codeSize = frag.size;
+        VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader));
+        PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "fragment");
+        // Attribute descriptions are kept on `pass_vk` for use at pipeline creation
+        pass_vk->attrs = pl_calloc_ptr(pass, params->num_vertex_attribs, pass_vk->attrs);
+        for (int i = 0; i < params->num_vertex_attribs; i++) {
+            struct pl_vertex_attrib *va = &params->vertex_attribs[i];
+            const struct vk_format **pfmt_vk = PL_PRIV(va->fmt);
+
+            pass_vk->attrs[i] = (VkVertexInputAttributeDescription) {
+                .binding  = 0,
+                .location = va->location,
+                .offset   = va->offset,
+                .format   = PL_DEF((*pfmt_vk)->bfmt, (*pfmt_vk)->tfmt),
+            };
+        }
+        // Single color attachment that enters and leaves as COLOR_ATTACHMENT_OPTIMAL
+        VkRenderPassCreateInfo rinfo = {
+            .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+            .attachmentCount = 1,
+            .pAttachments = &(VkAttachmentDescription) {
+                .format = (VkFormat) params->target_format->signature,
+                .samples = VK_SAMPLE_COUNT_1_BIT,
+                .loadOp = pass->params.load_target
+                            ? VK_ATTACHMENT_LOAD_OP_LOAD
+                            : VK_ATTACHMENT_LOAD_OP_DONT_CARE,
+                .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+                .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+                .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+            },
+            .subpassCount = 1,
+            .pSubpasses = &(VkSubpassDescription) {
+                .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
+                .colorAttachmentCount = 1,
+                .pColorAttachments = &(VkAttachmentReference) {
+                    .attachment = 0,
+                    .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+                },
+            },
+        };
+
+        VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &pass_vk->renderPass));
+        break;
+    }
+    case PL_PASS_COMPUTE: {
+        sinfo.pCode = (uint32_t *) comp.data;
+        sinfo.codeSize = comp.size;
+        VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader));
+        PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "compute");
+        break;
+    }
+    case PL_PASS_INVALID:
+    case PL_PASS_TYPE_COUNT:
+        pl_unreachable();
+    }
+
+    pl_clock_t after_compilation = pl_clock_now();
+    pl_log_cpu_time(gpu->log, start, after_compilation, "compiling shader");
+
+    // Update cache entries on successful compilation
+    pl_cache_steal(cache, &vert);
+    pl_cache_steal(cache, &frag);
+    pl_cache_steal(cache, &comp);
+
+    // Create the graphics/compute pipeline (kept in `base` if the pass has constants)
+    VkPipeline *pipe = has_spec ? &pass_vk->base : &pass_vk->pipe;
+    VK(vk_recreate_pipelines(vk, pass, has_spec, VK_NULL_HANDLE, pipe));
+    pl_log_cpu_time(gpu->log, after_compilation, pl_clock_now(), "creating pipeline");
+
+    // Update pipeline cache (first call queries the size, second fetches the data)
+    if (cache) {
+        size_t size = 0;
+        VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, NULL));
+        pl_cache_obj_resize(tmp, &pipecache, size);
+        VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, pipecache.data));
+        pl_cache_steal(cache, &pipecache);
+    }
+
+    if (!has_spec) {
+        // We can free these if we no longer need them for specialization
+        pl_free_ptr(&pass_vk->attrs);
+        vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC);
+        vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC);
+        vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC);
+        pass_vk->vert = VK_NULL_HANDLE;
+        pass_vk->shader = VK_NULL_HANDLE;
+        pass_vk->cache = VK_NULL_HANDLE;
+    }
+
+    PL_DEBUG(vk, "Pass statistics: size %zu, SPIR-V: vert %zu frag %zu comp %zu",
+             pipecache.size, vert.size, frag.size, comp.size);
+
+    success = true;
+
+error:
+    if (!success) {
+        pass_destroy_cb(gpu, pass);
+        pass = NULL;
+    }
+
+#undef NUM_DS
+
+    pl_free(tmp);
+    return pass;
+}
+// Shader stage passed to descriptor barriers, indexed by pass type
+static const VkPipelineStageFlags2 shaderStages[] = {
+    [PL_PASS_RASTER]  = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
+    [PL_PASS_COMPUTE] = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+};
+// Fills pass_vk->dswrite[idx] for descriptor `idx` from binding `db` (target set `ds`)
+// and records the barrier that the bound resource needs on `cmd`.
+static void vk_update_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass,
+                                 struct pl_desc_binding db,
+                                 VkDescriptorSet ds, int idx)
+{
+    // Access mask for storage resources, indexed by the descriptor's access mode
+    static const VkAccessFlags2 storageAccess[PL_DESC_ACCESS_COUNT] = {
+        [PL_DESC_ACCESS_READONLY]   = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
+        [PL_DESC_ACCESS_WRITEONLY]  = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+        [PL_DESC_ACCESS_READWRITE]  = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                                      VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+    };
+
+    struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+    const struct pl_desc *desc = &pass->params.descriptors[idx];
+    const VkPipelineStageFlags2 stage = shaderStages[pass->params.type];
+
+    // Start from a clean write; the type-specific payload pointer is set below
+    VkWriteDescriptorSet *write = &pass_vk->dswrite[idx];
+    *write = (VkWriteDescriptorSet) {
+        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+        .dstSet = ds,
+        .dstBinding = desc->binding,
+        .descriptorCount = 1,
+        .descriptorType = dsType[desc->type],
+    };
+
+    switch (desc->type) {
+    case PL_DESC_SAMPLED_TEX: {
+        struct pl_vk *p = PL_PRIV(gpu);
+        pl_tex tex = db.object;
+        struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+        VkDescriptorImageInfo *image = &pass_vk->dsiinfo[idx];
+
+        vk_tex_barrier(gpu, cmd, tex, stage,
+                       VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
+                       VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                       VK_QUEUE_FAMILY_IGNORED);
+
+        // tex_vk->layout is read only after the barrier call above
+        *image = (VkDescriptorImageInfo) {
+            .sampler = p->samplers[db.sample_mode][db.address_mode],
+            .imageView = tex_vk->view,
+            .imageLayout = tex_vk->layout,
+        };
+
+        write->pImageInfo = image;
+        return;
+    }
+    case PL_DESC_STORAGE_IMG: {
+        pl_tex tex = db.object;
+        struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+        VkDescriptorImageInfo *image = &pass_vk->dsiinfo[idx];
+
+        vk_tex_barrier(gpu, cmd, tex, stage, storageAccess[desc->access],
+                       VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_IGNORED);
+
+        // Storage images carry no sampler; only view and layout are filled in
+        *image = (VkDescriptorImageInfo) {
+            .imageView = tex_vk->view,
+            .imageLayout = tex_vk->layout,
+        };
+
+        write->pImageInfo = image;
+        return;
+    }
+    case PL_DESC_BUF_UNIFORM:
+    case PL_DESC_BUF_STORAGE: {
+        pl_buf buf = db.object;
+        struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+        VkDescriptorBufferInfo *range = &pass_vk->dsbinfo[idx];
+        const VkAccessFlags2 access = desc->type == PL_DESC_BUF_STORAGE
+                                        ? storageAccess[desc->access]
+                                        : VK_ACCESS_2_UNIFORM_READ_BIT;
+
+        // The whole buffer is both synchronized and exposed to the shader
+        vk_buf_barrier(gpu, cmd, buf, stage, access, 0, buf->params.size, false);
+        *range = (VkDescriptorBufferInfo) {
+            .buffer = buf_vk->mem.buf,
+            .offset = buf_vk->mem.offset,
+            .range = buf->params.size,
+        };
+
+        write->pBufferInfo = range;
+        return;
+    }
+    case PL_DESC_BUF_TEXEL_UNIFORM:
+    case PL_DESC_BUF_TEXEL_STORAGE: {
+        pl_buf buf = db.object;
+        struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+        const VkAccessFlags2 access = desc->type == PL_DESC_BUF_TEXEL_STORAGE
+                                        ? storageAccess[desc->access]
+                                        : VK_ACCESS_2_SHADER_SAMPLED_READ_BIT;
+
+        vk_buf_barrier(gpu, cmd, buf, stage, access, 0, buf->params.size, false);
+        write->pTexelBufferView = &buf_vk->view;
+        return;
+    }
+    case PL_DESC_INVALID:
+    case PL_DESC_TYPE_COUNT:
+        break;
+    }
+
+    pl_unreachable();
+}
+// Post-run hook for descriptor `idx`: calls vk_buf_flush on non-read-only buffers
+static void vk_release_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass,
+                                  struct pl_desc_binding db, int idx)
+{
+    const struct pl_desc *desc = &pass->params.descriptors[idx];
+    switch (desc->type) {
+    case PL_DESC_SAMPLED_TEX:
+    case PL_DESC_STORAGE_IMG:
+        return;
+    case PL_DESC_BUF_UNIFORM:
+    case PL_DESC_BUF_STORAGE:
+    case PL_DESC_BUF_TEXEL_UNIFORM:
+    case PL_DESC_BUF_TEXEL_STORAGE: {
+        pl_buf buf = db.object;
+        if (desc->access == PL_DESC_ACCESS_READONLY)
+            return; // read-only access: skip the flush
+        vk_buf_flush(gpu, cmd, buf, 0, buf->params.size);
+        return;
+    }
+    case PL_DESC_INVALID:
+    case PL_DESC_TYPE_COUNT:
+        break;
+    }
+
+    pl_unreachable();
+}
+// Command callback: returns the descriptor set bit(s) in `dsbit` to the free mask
+static void set_ds(struct pl_pass_vk *pass_vk, void *dsbit)
+{
+    pass_vk->dmask = pass_vk->dmask | (uintptr_t) dsbit;
+}
+// Syncs the stored specialization data with params->constant_data; true if it changed
+static bool need_respec(pl_pass pass, const struct pl_pass_run_params *params)
+{
+    struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+    const void *wanted = params->constant_data;
+    const size_t len = pass_vk->spec_size;
+    if (len == 0 || wanted == NULL)
+        return false; // no constants declared, or no new values supplied
+
+    VkSpecializationInfo *spec = &pass_vk->specInfo;
+    if (spec->pData == NULL) {
+        // First specialization: duplicate the values with `pass` as the parent
+        spec->pData = pl_memdup((void *) pass, wanted, len);
+        spec->dataSize = len;
+        return true;
+    }
+
+    // Already specialized: only report a change if the bytes actually differ
+    bool changed = memcmp(spec->pData, wanted, len) != 0;
+    if (changed)
+        memcpy((void *) spec->pData, wanted, len);
+
+    return changed;
+}
+// Records and submits one run of params->pass (a raster draw or a compute dispatch)
+void vk_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    pl_pass pass = params->pass;
+    struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+    // Raw vertex/index data is handed to pl_pass_run_vbo() instead of being drawn here
+    if (params->vertex_data || params->index_data) {
+        pl_pass_run_vbo(gpu, params);
+        return;
+    }
+
+    // Check if we need to re-specialize this pipeline
+    if (need_respec(pass, params)) {
+        pl_clock_t start = pl_clock_now();
+        VK(vk_recreate_pipelines(vk, pass, false, pass_vk->base, &pass_vk->pipe));
+        pl_log_cpu_time(gpu->log, start, pl_clock_now(), "re-specializing shader");
+    }
+
+    if (!pass_vk->use_pushd) {
+        // Wait for a free descriptor set
+        while (!pass_vk->dmask) {
+            PL_TRACE(gpu, "No free descriptor sets! ...blocking (slow path)");
+            vk_poll_commands(vk, 10000000); // 10 ms
+        }
+    }
+
+    static const enum queue_type types[] = {
+        [PL_PASS_RASTER]  = GRAPHICS,
+        [PL_PASS_COMPUTE] = COMPUTE,
+    };
+
+    struct vk_cmd *cmd = CMD_BEGIN_TIMED(types[pass->params.type], params->timer);
+    if (!cmd)
+        goto error;
+
+    // Find a descriptor set to use
+    VkDescriptorSet ds = VK_NULL_HANDLE;
+    if (!pass_vk->use_pushd) {
+        for (int i = 0; i < PL_ARRAY_SIZE(pass_vk->dss); i++) {
+            uint16_t dsbit = 1u << i;
+            if (pass_vk->dmask & dsbit) {
+                ds = pass_vk->dss[i];
+                pass_vk->dmask &= ~dsbit; // unset
+                vk_cmd_callback(cmd, (vk_cb) set_ds, pass_vk,
+                                (void *)(uintptr_t) dsbit); // set_ds() sets the bit again
+                break;
+            }
+        }
+    }
+
+    // Update the dswrite structure with all of the new values
+    for (int i = 0; i < pass->params.num_descriptors; i++)
+        vk_update_descriptor(gpu, cmd, pass, params->desc_bindings[i], ds, i);
+
+    if (!pass_vk->use_pushd) {
+        vk->UpdateDescriptorSets(vk->dev, pass->params.num_descriptors,
+                                 pass_vk->dswrite, 0, NULL);
+    }
+
+    // Bind the pipeline, descriptor set, etc.
+    static const VkPipelineBindPoint bindPoint[] = {
+        [PL_PASS_RASTER]  = VK_PIPELINE_BIND_POINT_GRAPHICS,
+        [PL_PASS_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE,
+    };
+
+    vk->CmdBindPipeline(cmd->buf, bindPoint[pass->params.type],
+                        PL_DEF(pass_vk->pipe, pass_vk->base));
+
+    if (ds) {
+        vk->CmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type],
+                                  pass_vk->pipeLayout, 0, 1, &ds, 0, NULL);
+    }
+    if (pass_vk->use_pushd) {
+        vk->CmdPushDescriptorSetKHR(cmd->buf, bindPoint[pass->params.type],
+                                    pass_vk->pipeLayout, 0,
+                                    pass->params.num_descriptors,
+                                    pass_vk->dswrite);
+    }
+
+    if (pass->params.push_constants_size) {
+        vk->CmdPushConstants(cmd->buf, pass_vk->pipeLayout,
+                             stageFlags[pass->params.type], 0,
+                             pass->params.push_constants_size,
+                             params->push_constants);
+    }
+
+    switch (pass->params.type) {
+    case PL_PASS_RASTER: {
+        pl_tex tex = params->target;
+        struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+        pl_buf vert = params->vertex_buf;
+        struct pl_buf_vk *vert_vk = PL_PRIV(vert);
+        pl_buf index = params->index_buf;
+        struct pl_buf_vk *index_vk = index ? PL_PRIV(index) : NULL;
+        pl_assert(vert);
+
+        // In the edge case that vert = index buffer, we need to synchronize
+        // for both flags simultaneously
+        VkPipelineStageFlags2 vbo_stage = VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT;
+        VkAccessFlags2 vbo_flags = VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT;
+        if (index == vert) {
+            vbo_stage |= VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT;
+            vbo_flags |= VK_ACCESS_2_INDEX_READ_BIT;
+        }
+
+        vk_buf_barrier(gpu, cmd, vert, vbo_stage, vbo_flags, 0, vert->params.size, false);
+
+        VkDeviceSize offset = vert_vk->mem.offset + params->buf_offset;
+        vk->CmdBindVertexBuffers(cmd->buf, 0, 1, &vert_vk->mem.buf, &offset);
+
+        if (index) {
+            if (index != vert) {
+                vk_buf_barrier(gpu, cmd, index, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT,
+                               VK_ACCESS_2_INDEX_READ_BIT, 0, index->params.size,
+                               false);
+            }
+
+            static const VkIndexType index_fmts[PL_INDEX_FORMAT_COUNT] = {
+                [PL_INDEX_UINT16] = VK_INDEX_TYPE_UINT16,
+                [PL_INDEX_UINT32] = VK_INDEX_TYPE_UINT32,
+            };
+
+            vk->CmdBindIndexBuffer(cmd->buf, index_vk->mem.buf,
+                                   index_vk->mem.offset + params->index_offset,
+                                   index_fmts[params->index_fmt]);
+        }
+
+        VkAccessFlags2 fbo_access = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT;
+        if (pass->params.load_target)
+            fbo_access |= VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT;
+
+        vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+                       fbo_access, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+                       VK_QUEUE_FAMILY_IGNORED);
+
+        VkViewport viewport = {
+            .x = params->viewport.x0,
+            .y = params->viewport.y0,
+            .width  = pl_rect_w(params->viewport),
+            .height = pl_rect_h(params->viewport),
+        };
+
+        VkRect2D scissor = {
+            .offset = {params->scissors.x0, params->scissors.y0},
+            .extent = {pl_rect_w(params->scissors), pl_rect_h(params->scissors)},
+        };
+
+        vk->CmdSetViewport(cmd->buf, 0, 1, &viewport);
+        vk->CmdSetScissor(cmd->buf, 0, 1, &scissor);
+
+        VkRenderPassBeginInfo binfo = {
+            .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+            .renderPass = pass_vk->renderPass,
+            .framebuffer = tex_vk->framebuffer,
+            .renderArea.extent = {tex->params.w, tex->params.h},
+        };
+
+        vk->CmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE);
+
+        if (index) {
+            vk->CmdDrawIndexed(cmd->buf, params->vertex_count, 1, 0, 0, 0);
+        } else {
+            vk->CmdDraw(cmd->buf, params->vertex_count, 1, 0, 0);
+        }
+
+        vk->CmdEndRenderPass(cmd->buf);
+        break;
+    }
+    case PL_PASS_COMPUTE:
+        vk->CmdDispatch(cmd->buf, params->compute_groups[0],
+                        params->compute_groups[1],
+                        params->compute_groups[2]);
+        break;
+    case PL_PASS_INVALID:
+    case PL_PASS_TYPE_COUNT:
+        pl_unreachable();
+    }
+
+    for (int i = 0; i < pass->params.num_descriptors; i++)
+        vk_release_descriptor(gpu, cmd, pass, params->desc_bindings[i], i);
+
+    // submit this command buffer for better intra-frame granularity
+    CMD_SUBMIT(&cmd);
+
+error:
+    return;
+}
diff --git a/src/vulkan/gpu_tex.c b/src/vulkan/gpu_tex.c
new file mode 100644
index 0000000..7ab83b7
--- /dev/null
+++ b/src/vulkan/gpu_tex.c
@@ -0,0 +1,1453 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+
+// Prepares `tex` for use by `cmd`: records an image memory barrier (when one
+// is needed) that makes the texture available to `stage`/`access` in image
+// layout `layout` on queue family `qf`, and updates the layout and queue
+// family tracked on the texture to match.
+//
+// A reference is taken on the texture, and vk_tex_deref is registered as a
+// callback on `cmd` to drop it again. Must not be called on held textures or
+// on textures that have planes (both are asserted).
+void vk_tex_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_tex tex,
+                    VkPipelineStageFlags2 stage, VkAccessFlags2 access,
+                    VkImageLayout layout, uint32_t qf)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+    pl_rc_ref(&tex_vk->rc); // paired with the vk_tex_deref callback below
+    pl_assert(!tex_vk->held);
+    pl_assert(!tex_vk->num_planes);
+
+    // CONCURRENT images require transitioning to/from IGNORED, EXCLUSIVE
+    // images require transitioning to/from the concrete QF index
+    //
+    // With a single command pool, substitute that pool's queue family for
+    // IGNORED on both the tracked and the requested side.
+    if (vk->pools.num == 1) {
+        if (tex_vk->qf == VK_QUEUE_FAMILY_IGNORED)
+            tex_vk->qf = cmd->pool->qf;
+        if (qf == VK_QUEUE_FAMILY_IGNORED)
+            qf = cmd->pool->qf;
+    }
+
+    // NOTE(review): vk_sem_barrier presumably returns the scope (stage and
+    // access) of the previous use of this image, which becomes the source
+    // half of the barrier below - confirm against its definition
+    struct vk_sync_scope last;
+    bool is_trans = layout != tex_vk->layout, is_xfer = qf != tex_vk->qf;
+    last = vk_sem_barrier(cmd, &tex_vk->sem, stage, access, is_trans || is_xfer);
+
+    VkImageMemoryBarrier2 barr = {
+        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
+        .srcStageMask = last.stage,
+        .srcAccessMask = last.access,
+        .dstStageMask = stage,
+        .dstAccessMask = access,
+        .oldLayout = tex_vk->layout,
+        .newLayout = layout,
+        .srcQueueFamilyIndex = tex_vk->qf,
+        .dstQueueFamilyIndex = qf,
+        .image = tex_vk->img,
+        .subresourceRange = {
+            .aspectMask = tex_vk->aspect,
+            .levelCount = 1,
+            .layerCount = 1,
+        },
+    };
+
+    // The texture was flagged by vk_tex_invalidate: consume the flag and
+    // transition from UNDEFINED instead of the tracked layout
+    if (tex_vk->may_invalidate) {
+        tex_vk->may_invalidate = false;
+        barr.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+    }
+
+    // Skip the barrier entirely when there is no previous access to order
+    // against and neither the layout nor the queue family changes
+    if (last.access || is_trans || is_xfer) {
+        vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+            .imageMemoryBarrierCount = 1,
+            .pImageMemoryBarriers = &barr,
+        });
+    }
+
+    // Record the new state and schedule the release of our reference
+    tex_vk->qf = qf;
+    tex_vk->layout = layout;
+    vk_cmd_callback(cmd, (vk_cb) vk_tex_deref, gpu, tex);
+
+    // Hand all pending external dependencies over to this command, then
+    // clear the list so they are only attached once
+    for (int i = 0; i < tex_vk->ext_deps.num; i++)
+        vk_cmd_dep(cmd, stage, tex_vk->ext_deps.elem[i]);
+    tex_vk->ext_deps.num = 0;
+
+    // Likewise pass the external sync object on to the command, which
+    // dereferences it through a callback; the texture forgets about it
+    if (tex_vk->ext_sync) {
+        vk_cmd_callback(cmd, (vk_cb) vk_sync_deref, gpu, tex_vk->ext_sync);
+        tex_vk->ext_sync = NULL;
+    }
+}
+
+// Tears down a texture and the Vulkan objects attached to it, without
+// consulting its reference count, and then frees the texture itself.
+// Passing NULL is a no-op.
+static void vk_tex_destroy(pl_gpu gpu, struct pl_tex_t *tex)
+{
+    if (tex) {
+        struct pl_tex_vk *priv = PL_PRIV(tex);
+        struct pl_vk *p = PL_PRIV(gpu);
+        struct vk_ctx *vk = p->vk;
+
+        vk_sync_deref(gpu, priv->ext_sync);
+
+        // Objects created on top of the image go first
+        vk->DestroyFramebuffer(vk->dev, priv->framebuffer, PL_VK_ALLOC);
+        vk->DestroyImageView(vk->dev, priv->view, PL_VK_ALLOC);
+
+        // Drop our reference on every plane texture
+        int plane = 0;
+        while (plane < priv->num_planes)
+            vk_tex_deref(gpu, tex->planes[plane++]);
+
+        // The image and its memory are left alone for textures flagged as
+        // external_img
+        if (!priv->external_img) {
+            vk->DestroyImage(vk->dev, priv->img, PL_VK_ALLOC);
+            vk_malloc_free(vk->ma, &priv->mem);
+        }
+
+        pl_free(tex);
+    }
+}
+
+// Drops one reference on `tex` and destroys the texture once pl_rc_deref
+// reports that it should go away. Passing NULL is a no-op.
+void vk_tex_deref(pl_gpu gpu, pl_tex tex)
+{
+    if (tex) {
+        struct pl_tex_vk *priv = PL_PRIV(tex);
+        const bool release = pl_rc_deref(&priv->rc);
+        if (release)
+            vk_tex_destroy(gpu, (struct pl_tex_t *) tex);
+    }
+}
+
+
+// Initializes non-VkImage values like the image view, framebuffers, etc.
+//
+// Requires `tex_vk->img` to already be set (asserted). Also names the image
+// using `debug_tag`, initializes the texture's reference count, and resets
+// its tracked layout, queue family and transfer queue. For textures with
+// planes, only the naming and reference count initialization happen; no view
+// or framebuffer is created for them.
+//
+// Returns true on success, false on failure.
+static bool vk_init_image(pl_gpu gpu, pl_tex tex, pl_debug_tag debug_tag)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+
+    const struct pl_tex_params *params = &tex->params;
+    struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+    pl_assert(tex_vk->img);
+    PL_VK_NAME(IMAGE, tex_vk->img, debug_tag);
+    pl_rc_init(&tex_vk->rc);
+    // Textures with planes need nothing beyond the reference count
+    if (tex_vk->num_planes)
+        return true;
+    tex_vk->layout = VK_IMAGE_LAYOUT_UNDEFINED;
+    tex_vk->transfer_queue = GRAPHICS;
+    tex_vk->qf = VK_QUEUE_FAMILY_IGNORED; // will be set on first use, if needed
+
+    // Always use the transfer pool if available, for efficiency
+    if ((params->host_writable || params->host_readable) && vk->pool_transfer)
+        tex_vk->transfer_queue = TRANSFER;
+
+    // For emulated formats: force usage of the compute queue, because we
+    // can't properly track cross-queue dependencies for buffers (yet?)
+    if (params->format->emulated)
+        tex_vk->transfer_queue = COMPUTE;
+
+    bool ret = false;
+    VkRenderPass dummyPass = VK_NULL_HANDLE;
+
+    // An image view is created for every texture that is sampled, rendered
+    // to, or used as a storage image
+    if (params->sampleable || params->renderable || params->storable) {
+        // Maps the image type to the view type of the same dimensionality
+        static const VkImageViewType viewType[] = {
+            [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D,
+            [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D,
+            [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D,
+        };
+
+        const VkImageViewCreateInfo vinfo = {
+            .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+            .image = tex_vk->img,
+            .viewType = viewType[tex_vk->type],
+            .format = tex_vk->img_fmt,
+            .subresourceRange = {
+                .aspectMask = tex_vk->aspect,
+                .levelCount = 1,
+                .layerCount = 1,
+            },
+        };
+
+        // NOTE(review): VK() presumably jumps to `error` when the call
+        // fails - confirm against the macro definition
+        VK(vk->CreateImageView(vk->dev, &vinfo, PL_VK_ALLOC, &tex_vk->view));
+        PL_VK_NAME(IMAGE_VIEW, tex_vk->view, debug_tag);
+    }
+
+    if (params->renderable) {
+        // Framebuffers need to be created against a specific render pass
+        // layout, so we need to temporarily create a skeleton/dummy render
+        // pass for vulkan to figure out the compatibility
+        VkRenderPassCreateInfo rinfo = {
+            .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+            .attachmentCount = 1,
+            .pAttachments = &(VkAttachmentDescription) {
+                .format = tex_vk->img_fmt,
+                .samples = VK_SAMPLE_COUNT_1_BIT,
+                .loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE,
+                .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+                .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+                .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+            },
+            .subpassCount = 1,
+            .pSubpasses = &(VkSubpassDescription) {
+                .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
+                .colorAttachmentCount = 1,
+                .pColorAttachments = &(VkAttachmentReference) {
+                    .attachment = 0,
+                    .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+                },
+            },
+        };
+
+        VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &dummyPass));
+
+        VkFramebufferCreateInfo finfo = {
+            .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
+            .renderPass = dummyPass,
+            .attachmentCount = 1,
+            .pAttachments = &tex_vk->view,
+            .width = tex->params.w,
+            .height = tex->params.h,
+            .layers = 1,
+        };
+
+        // Reject sizes beyond the device's framebuffer limits up front
+        if (finfo.width > vk->props.limits.maxFramebufferWidth ||
+            finfo.height > vk->props.limits.maxFramebufferHeight)
+        {
+            PL_ERR(gpu, "Framebuffer of size %dx%d exceeds the maximum allowed "
+                   "dimensions: %dx%d", finfo.width, finfo.height,
+                   vk->props.limits.maxFramebufferWidth,
+                   vk->props.limits.maxFramebufferHeight);
+            goto error;
+        }
+
+        VK(vk->CreateFramebuffer(vk->dev, &finfo, PL_VK_ALLOC,
+                                 &tex_vk->framebuffer));
+        PL_VK_NAME(FRAMEBUFFER, tex_vk->framebuffer, debug_tag);
+    }
+
+    ret = true;
+
+    // Reached on success as well: the dummy render pass is only needed while
+    // creating the framebuffer, so it is destroyed on every path (it is still
+    // VK_NULL_HANDLE if the renderable branch never created it)
+error:
+    vk->DestroyRenderPass(vk->dev, dummyPass, PL_VK_ALLOC);
+    return ret;
+}
+
+// Creates a new texture from `params`: derives the image usage and creation
+// parameters, checks them against the physical device's image format
+// properties, creates the VkImage, allocates/imports and binds its memory
+// (where applicable), wraps the individual planes of planar formats,
+// initializes the view/framebuffer state, fills in the exported shared memory
+// description, and finally uploads `params->initial_data` if provided.
+//
+// Returns the new texture, or NULL on failure, in which case everything
+// created so far is released through vk_tex_destroy.
+pl_tex vk_tex_create(pl_gpu gpu, const struct pl_tex_params *params)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+
+    // NOTE(review): presumably at most one of export/import is set, so the
+    // OR yields the one handle type in use - confirm against the API contract
+    enum pl_handle_type handle_type = params->export_handle |
+                                      params->import_handle;
+    VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type = vk_mem_handle_type(handle_type);
+
+    struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk);
+    pl_fmt fmt = params->format;
+    tex->params = *params;
+    tex->params.initial_data = NULL; // not retained on the created texture
+    tex->sampler_type = PL_SAMPLER_NORMAL;
+
+    struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+    struct pl_fmt_vk *fmtp = PL_PRIV(fmt);
+    tex_vk->img_fmt = fmtp->vk_fmt->tfmt;
+    tex_vk->num_planes = fmt->num_planes;
+    // Planar formats cover one aspect bit per plane; formats without planes
+    // fall back to the color aspect
+    for (int i = 0; i < tex_vk->num_planes; i++)
+        tex_vk->aspect |= VK_IMAGE_ASPECT_PLANE_0_BIT << i;
+    tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT);
+
+    // Image type follows the dimensionality of the requested texture
+    switch (pl_tex_params_dimension(*params)) {
+    case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break;
+    case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break;
+    case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break;
+    }
+
+    if (fmt->emulated) {
+        tex_vk->texel_fmt = pl_find_fmt(gpu, fmt->type, 1, 0,
+                                        fmt->host_bits[0],
+                                        PL_FMT_CAP_TEXEL_UNIFORM);
+        if (!tex_vk->texel_fmt) {
+            PL_ERR(gpu, "Failed picking texel format for emulated texture!");
+            goto error;
+        }
+
+        // Our format emulation requires storage image support. In order to
+        // make a bunch of checks happy, just mark it off as storable (and also
+        // enable VK_IMAGE_USAGE_STORAGE_BIT, which we do below)
+        tex->params.storable = true;
+    }
+
+    if (fmtp->blit_emulated) {
+        // Enable what's required for sampling
+        tex->params.sampleable = fmt->caps & PL_FMT_CAP_SAMPLEABLE;
+        tex->params.storable = true;
+    }
+
+    // Blit emulation on planar textures requires storage
+    if ((params->blit_src || params->blit_dst) && tex_vk->num_planes)
+        tex->params.storable = true;
+
+    // Translate the (possibly adjusted) texture capabilities into Vulkan
+    // image usage flags
+    VkImageUsageFlags usage = 0;
+    VkImageCreateFlags flags = 0;
+    if (tex->params.sampleable)
+        usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+    if (tex->params.renderable)
+        usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+    if (tex->params.storable)
+        usage |= VK_IMAGE_USAGE_STORAGE_BIT;
+    if (tex->params.host_readable || tex->params.blit_src)
+        usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+    if (tex->params.host_writable || tex->params.blit_dst || params->initial_data)
+        usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+
+    if (!usage) {
+        // Vulkan requires images have at least *some* image usage set, but our
+        // API is perfectly happy with a (useless) image. So just set
+        // VK_IMAGE_USAGE_TRANSFER_DST_BIT, since this is harmless.
+        usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+    }
+
+    // Planar images are additionally created with mutable format and extended
+    // usage; the per-plane textures wrapped further below use their own
+    // formats
+    if (tex_vk->num_planes) {
+        flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT |
+                 VK_IMAGE_CREATE_EXTENDED_USAGE_BIT;
+    }
+
+    // FIXME: Since we can't keep track of queue family ownership properly,
+    // and we don't know in advance what types of queue families this image
+    // will belong to, we're forced to share all of our images between all
+    // command pools.
+    uint32_t qfs[3] = {0};
+    pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
+    for (int i = 0; i < vk->pools.num; i++)
+        qfs[i] = vk->pools.elem[i]->qf;
+
+    // Optional extension structs; each one is only linked into `iinfo` below
+    // when the corresponding import/export handle type is requested
+    VkImageDrmFormatModifierExplicitCreateInfoEXT drm_explicit = {
+        .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT,
+        .drmFormatModifier = params->shared_mem.drm_format_mod,
+        .drmFormatModifierPlaneCount = 1,
+        .pPlaneLayouts = &(VkSubresourceLayout) {
+            .rowPitch = PL_DEF(params->shared_mem.stride_w, params->w),
+            .depthPitch = params->d ? PL_DEF(params->shared_mem.stride_h, params->h) : 0,
+            .offset = params->shared_mem.offset,
+        },
+    };
+
+#ifdef VK_EXT_metal_objects
+    VkImportMetalTextureInfoEXT import_metal_tex = {
+        .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_TEXTURE_INFO_EXT,
+        .plane = VK_IMAGE_ASPECT_PLANE_0_BIT << params->shared_mem.plane,
+    };
+
+    VkImportMetalIOSurfaceInfoEXT import_iosurface = {
+        .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_IO_SURFACE_INFO_EXT,
+    };
+#endif
+
+    VkImageDrmFormatModifierListCreateInfoEXT drm_list = {
+        .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT,
+        .drmFormatModifierCount = fmt->num_modifiers,
+        .pDrmFormatModifiers = fmt->modifiers,
+    };
+
+    VkExternalMemoryImageCreateInfoKHR ext_info = {
+        .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR,
+        .handleTypes = vk_handle_type,
+    };
+
+    VkImageCreateInfo iinfo = {
+        .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+        .pNext = vk_handle_type ? &ext_info : NULL,
+        .imageType = tex_vk->type,
+        .format = tex_vk->img_fmt,
+        .extent = (VkExtent3D) {
+            .width  = params->w,
+            .height = PL_MAX(1, params->h),
+            .depth  = PL_MAX(1, params->d)
+        },
+        .mipLevels = 1,
+        .arrayLayers = 1,
+        .samples = VK_SAMPLE_COUNT_1_BIT,
+        .tiling = VK_IMAGE_TILING_OPTIMAL,
+        .usage = usage,
+        .flags = flags,
+        .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+        .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
+                                         : VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = vk->pools.num,
+        .pQueueFamilyIndices = qfs,
+    };
+
+    // Parameters for the memory allocation/import performed further below
+    struct vk_malloc_params mparams = {
+        .optimal = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+        .export_handle = params->export_handle,
+        .import_handle = params->import_handle,
+        .shared_mem = params->shared_mem,
+        .debug_tag = params->debug_tag,
+    };
+
+    // DMA-BUF import: the caller dictates the modifier and plane layout
+    if (params->import_handle == PL_HANDLE_DMA_BUF) {
+        vk_link_struct(&iinfo, &drm_explicit);
+        iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+        mparams.shared_mem.offset = 0x0; // handled via plane offsets
+    }
+
+#ifdef VK_EXT_metal_objects
+    if (params->import_handle == PL_HANDLE_MTL_TEX) {
+        vk_link_struct(&iinfo, &import_metal_tex);
+        import_metal_tex.mtlTexture = params->shared_mem.handle.handle;
+    }
+
+    if (params->import_handle == PL_HANDLE_IOSURFACE) {
+        vk_link_struct(&iinfo, &import_iosurface);
+        import_iosurface.ioSurface = params->shared_mem.handle.handle;
+    }
+#endif
+
+    // DMA-BUF export: offer the format's modifier list for the
+    // implementation to choose from
+    if (params->export_handle == PL_HANDLE_DMA_BUF) {
+        pl_assert(drm_list.drmFormatModifierCount > 0);
+        vk_link_struct(&iinfo, &drm_list);
+        iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+    }
+
+    // Double-check physical image format limits and fail if invalid
+    VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
+        .sharingMode = iinfo.sharingMode,
+        .queueFamilyIndexCount = iinfo.queueFamilyIndexCount,
+        .pQueueFamilyIndices = iinfo.pQueueFamilyIndices,
+    };
+
+    VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR,
+        .handleType = ext_info.handleTypes,
+    };
+
+    if (handle_type == PL_HANDLE_DMA_BUF) {
+        if (params->import_handle) {
+            // On import, we know exactly which format modifier to test
+            drm_pinfo.drmFormatModifier = drm_explicit.drmFormatModifier;
+        } else {
+            // On export, the choice of format modifier is ambiguous, because
+            // we offer the implementation a whole list to choose from. In
+            // principle, we must check *all* supported drm format modifiers,
+            // but in practice it should hopefully suffice to just check one
+            drm_pinfo.drmFormatModifier = drm_list.pDrmFormatModifiers[0];
+        }
+        vk_link_struct(&ext_pinfo, &drm_pinfo);
+    }
+
+    // The query mirrors the creation parameters chosen above
+    VkPhysicalDeviceImageFormatInfo2KHR pinfo = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR,
+        .pNext = vk_handle_type ? &ext_pinfo : NULL,
+        .format = iinfo.format,
+        .type = iinfo.imageType,
+        .tiling = iinfo.tiling,
+        .usage = iinfo.usage,
+        .flags = iinfo.flags,
+    };
+
+    VkExternalImageFormatPropertiesKHR ext_props = {
+        .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
+    };
+
+    VkImageFormatProperties2KHR props = {
+        .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR,
+        .pNext = vk_handle_type ? &ext_props : NULL,
+    };
+
+    // An unsupported combination is only logged at debug level; any other
+    // failure goes through PL_VK_ASSERT
+    VkResult res;
+    res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props);
+    if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) {
+        PL_DEBUG(gpu, "Texture creation failed: not supported");
+        goto error;
+    } else {
+        PL_VK_ASSERT(res, "Querying image format properties");
+    }
+
+    VkExtent3D max = props.imageFormatProperties.maxExtent;
+    if (params->w > max.width || params->h > max.height || params->d > max.depth)
+    {
+        PL_ERR(gpu, "Requested image size %dx%dx%d exceeds the maximum allowed "
+               "dimensions %dx%dx%d for vulkan image format %x",
+               params->w, params->h, params->d, max.width, max.height, max.depth,
+               (unsigned) iinfo.format);
+        goto error;
+    }
+
+    // Ensure the handle type is supported
+    if (vk_handle_type) {
+        bool ok = vk_external_mem_check(vk, &ext_props.externalMemoryProperties,
+                                        handle_type, params->import_handle);
+        if (!ok) {
+            PL_ERR(gpu, "Requested handle type is not compatible with the "
+                   "specified combination of image parameters. Possibly the "
+                   "handle type is unsupported altogether?");
+            goto error;
+        }
+    }
+
+    VK(vk->CreateImage(vk->dev, &iinfo, PL_VK_ALLOC, &tex_vk->img));
+    tex_vk->usage_flags = iinfo.usage;
+
+    // Query the memory requirements, including whether the implementation
+    // prefers a dedicated allocation for this image
+    VkMemoryDedicatedRequirements ded_reqs = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR,
+    };
+
+    VkMemoryRequirements2 reqs = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR,
+        .pNext = &ded_reqs,
+    };
+
+    VkImageMemoryRequirementsInfo2 req_info = {
+        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR,
+        .image = tex_vk->img,
+    };
+
+    vk->GetImageMemoryRequirements2(vk->dev, &req_info, &reqs);
+    mparams.reqs = reqs.memoryRequirements;
+    if (ded_reqs.prefersDedicatedAllocation) {
+        mparams.ded_image = tex_vk->img;
+        // For imports with a Vulkan memory handle type, use the size reported
+        // by the requirements query as the shared memory size
+        if (vk_mem_handle_type(params->import_handle))
+            mparams.shared_mem.size = reqs.memoryRequirements.size;
+    }
+
+    const char *debug_tag = params->debug_tag ? params->debug_tag :
+                            params->import_handle ? "imported" : "created";
+
+    // Memory is allocated/imported and bound unless this is an import whose
+    // handle type has no Vulkan memory handle equivalent.
+    // NOTE(review): presumably that covers the Metal imports, which are
+    // passed through the image create info instead - confirm
+    if (!params->import_handle || vk_mem_handle_type(params->import_handle)) {
+        struct vk_memslice *mem = &tex_vk->mem;
+        if (!vk_malloc_slice(vk->ma, mem, &mparams))
+            goto error;
+
+        VK(vk->BindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset));
+    }
+
+    static const char * const plane_names[4] = {
+        "plane 0", "plane 1", "plane 2", "plane 3",
+    };
+
+    if (tex_vk->num_planes) {
+        // Wrap each plane of the image as its own texture, sized according
+        // to the plane's subsampling shifts
+        for (int i = 0; i < tex_vk->num_planes; i++) {
+            struct pl_tex_t *plane;
+
+            pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D);
+            plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params(
+                .image      = tex_vk->img,
+                .aspect     = VK_IMAGE_ASPECT_PLANE_0_BIT << i,
+                .width      = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x),
+                .height     = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y),
+                .format     = fmtp->vk_fmt->pfmt[i].fmt,
+                .usage      = usage,
+                .user_data  = params->user_data,
+                .debug_tag  = PL_DEF(params->debug_tag, plane_names[i]),
+            ));
+            if (!plane)
+                goto error;
+            plane->parent = tex;
+            tex->planes[i] = plane;
+            tex_vk->planes[i] = PL_PRIV(plane);
+            // Planes start out not held and in the parent's tracked layout
+            tex_vk->planes[i]->held = false;
+            tex_vk->planes[i]->layout = tex_vk->layout;
+        }
+
+        // Explicitly mask out all usage flags from planar parent images
+        pl_assert(!fmt->caps);
+        tex->params.sampleable      = false;
+        tex->params.renderable      = false;
+        tex->params.storable        = false;
+        tex->params.blit_src        = false;
+        tex->params.blit_dst        = false;
+        tex->params.host_writable   = false;
+        tex->params.host_readable   = false;
+    }
+
+    if (!vk_init_image(gpu, tex, debug_tag))
+        goto error;
+
+    // Publish the shared memory description of exported textures
+    if (params->export_handle)
+        tex->shared_mem = tex_vk->mem.shared_mem;
+
+    if (params->export_handle == PL_HANDLE_DMA_BUF) {
+        if (vk->GetImageDrmFormatModifierPropertiesEXT) {
+
+            // Query the DRM format modifier and plane layout from the driver
+            VkImageDrmFormatModifierPropertiesEXT mod_props = {
+                .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT,
+            };
+
+            VK(vk->GetImageDrmFormatModifierPropertiesEXT(vk->dev, tex_vk->img, &mod_props));
+            tex->shared_mem.drm_format_mod = mod_props.drmFormatModifier;
+
+            VkSubresourceLayout layout = {0};
+            VkImageSubresource plane = {
+                .aspectMask = VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT,
+            };
+
+            vk->GetImageSubresourceLayout(vk->dev, tex_vk->img, &plane, &layout);
+            if (layout.offset != 0) {
+                PL_ERR(gpu, "Exported DRM plane 0 has nonzero offset %zu, "
+                       "this should never happen! Erroring for safety...",
+                       (size_t) layout.offset);
+                goto error;
+            }
+            tex->shared_mem.stride_w = layout.rowPitch;
+            tex->shared_mem.stride_h = layout.depthPitch;
+
+        } else {
+
+            // Fallback for no modifiers, just do something stupid.
+            tex->shared_mem.drm_format_mod = DRM_FORMAT_MOD_INVALID;
+            tex->shared_mem.stride_w = params->w;
+            tex->shared_mem.stride_h = params->h;
+
+        }
+    }
+
+    if (params->initial_data) {
+        // Upload the initial contents over the full extent of the texture
+        struct pl_tex_transfer_params ul_params = {
+            .tex = tex,
+            .ptr = (void *) params->initial_data,
+            .rc = { 0, 0, 0, params->w, params->h, params->d },
+        };
+
+        // Since we re-use GPU helpers which require writable images, just fake it
+        bool writable = tex->params.host_writable;
+        tex->params.host_writable = true;
+        if (!pl_tex_upload(gpu, &ul_params))
+            goto error;
+        tex->params.host_writable = writable;
+    }
+
+    return tex;
+
+error:
+    vk_tex_destroy(gpu, tex);
+    return NULL;
+}
+
+// Flags `tex` and each of its planes as `may_invalidate`. vk_tex_barrier
+// consumes this flag by transitioning from VK_IMAGE_LAYOUT_UNDEFINED on the
+// next barrier.
+void vk_tex_invalidate(pl_gpu gpu, pl_tex tex)
+{
+    struct pl_tex_vk *priv = PL_PRIV(tex);
+
+    for (int plane = 0; plane < priv->num_planes; plane++) {
+        priv->planes[plane]->may_invalidate = true;
+    }
+
+    priv->may_invalidate = true;
+}
+
+// Clears `tex` to `color` indirectly: a temporary 1x1 texture of the same
+// format is cleared and then blitted onto `tex` with nearest sampling.
+// Requires `tex` to be storable (asserted).
+//
+// Returns false if the temporary texture could not be created, true
+// otherwise.
+static bool tex_clear_fallback(pl_gpu gpu, pl_tex tex,
+                               const union pl_clear_color color)
+{
+    pl_tex fill = pl_tex_create(gpu, pl_tex_params(
+        .format   = tex->params.format,
+        .w        = 1,
+        .h        = 1,
+        .blit_dst = true,
+        .blit_src = true,
+        .storable = true,
+    ));
+
+    if (fill) {
+        // Clear the single texel, then blit it over the target texture
+        pl_tex_clear_ex(gpu, fill, color);
+
+        pl_assert(tex->params.storable);
+        pl_tex_blit(gpu, pl_tex_blit_params(
+            .dst = tex,
+            .src = fill,
+            .sample_mode = PL_TEX_SAMPLE_NEAREST,
+        ));
+
+        pl_tex_destroy(gpu, &fill);
+        return true;
+    }
+
+    return false;
+}
+
+// Clears the entire texture to `color`.
+//
+// Textures whose aspect is the plain color aspect are cleared with
+// vkCmdClearColorImage on a GRAPHICS command; any other aspect goes through
+// tex_clear_fallback instead. Failures are logged (fallback path) or cause a
+// silent return (no command buffer available); nothing is reported to the
+// caller.
+void vk_tex_clear_ex(pl_gpu gpu, pl_tex tex, const union pl_clear_color color)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+    // Non-color aspects can't use the direct clear below
+    if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) {
+        if (!tex_clear_fallback(gpu, tex,  color)) {
+            PL_ERR(gpu, "Failed clearing imported planar image: color aspect "
+                   "clears disallowed by spec and no shader fallback "
+                   "available");
+        }
+        return;
+    }
+
+    struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS);
+    if (!cmd)
+        return;
+
+    // Make the image writable by the clear, in TRANSFER_DST_OPTIMAL layout
+    vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_CLEAR_BIT,
+                   VK_ACCESS_2_TRANSFER_WRITE_BIT,
+                   VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                   VK_QUEUE_FAMILY_IGNORED);
+
+    // The clear color is passed through by reinterpreting the union, so the
+    // two types must at least agree in size
+    pl_static_assert(sizeof(VkClearColorValue) == sizeof(union pl_clear_color));
+    const VkClearColorValue *clearColor = (const VkClearColorValue *) &color;
+
+    pl_assert(tex_vk->aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+    static const VkImageSubresourceRange range = {
+        .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+        .levelCount = 1,
+        .layerCount = 1,
+    };
+
+    // tex_vk->layout was updated by the barrier above
+    vk->CmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->layout,
+                           clearColor, 1, &range);
+
+    CMD_FINISH(&cmd);
+}
+
+// Blits from params->src to params->dst.
+//
+// When the source and destination rects are identical, the copy is recorded
+// with vkCmdCopyImage; otherwise vkCmdBlitImage is used with the filter
+// matching params->sample_mode. The operation is delegated to
+// pl_tex_blit_compute instead when either texture has a non-color aspect, or
+// when scaling is required and either format is flagged `blit_emulated`.
+// Failures are logged or cause a silent return; nothing is reported to the
+// caller.
+void vk_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_tex_vk *src_vk = PL_PRIV(params->src);
+    struct pl_tex_vk *dst_vk = PL_PRIV(params->dst);
+    struct pl_fmt_vk *src_fmtp = PL_PRIV(params->src->params.format);
+    struct pl_fmt_vk *dst_fmtp = PL_PRIV(params->dst->params.format);
+    bool blit_emulated = src_fmtp->blit_emulated || dst_fmtp->blit_emulated;
+    bool planar_fallback = src_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT ||
+                           dst_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT;
+
+    pl_rect3d src_rc = params->src_rc, dst_rc = params->dst_rc;
+    bool requires_scaling = !pl_rect3d_eq(src_rc, dst_rc);
+    // Cases the transfer commands below are not used for
+    if ((requires_scaling && blit_emulated) || planar_fallback) {
+        if (!pl_tex_blit_compute(gpu, params))
+            PL_ERR(gpu, "Failed emulating texture blit, incompatible textures?");
+        return;
+    }
+
+    struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS);
+    if (!cmd)
+        return;
+
+    // When the blit operation doesn't require scaling, we can use the more
+    // efficient vkCmdCopyImage instead of vkCmdBlitImage
+    if (!requires_scaling) {
+        vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_COPY_BIT,
+                       VK_ACCESS_2_TRANSFER_READ_BIT,
+                       VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+                       VK_QUEUE_FAMILY_IGNORED);
+
+        vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_COPY_BIT,
+                       VK_ACCESS_2_TRANSFER_WRITE_BIT,
+                       VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                       VK_QUEUE_FAMILY_IGNORED);
+
+        pl_rect3d_normalize(&src_rc);
+
+        // src_rc and dst_rc are equal on this path, so the (normalized)
+        // source rect supplies both the source and destination offsets
+        VkImageCopy region = {
+            .srcSubresource = {
+                .aspectMask = src_vk->aspect,
+                .layerCount = 1,
+            },
+            .dstSubresource = {
+                .aspectMask = dst_vk->aspect,
+                .layerCount = 1,
+            },
+            .srcOffset = {src_rc.x0, src_rc.y0, src_rc.z0},
+            .dstOffset = {src_rc.x0, src_rc.y0, src_rc.z0},
+            .extent = {
+                pl_rect_w(src_rc),
+                pl_rect_h(src_rc),
+                pl_rect_d(src_rc),
+            },
+        };
+
+        // The tracked layouts were updated by the barriers above
+        vk->CmdCopyImage(cmd->buf, src_vk->img, src_vk->layout,
+                         dst_vk->img, dst_vk->layout, 1, &region);
+    } else {
+        vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_BLIT_BIT,
+                       VK_ACCESS_2_TRANSFER_READ_BIT,
+                       VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+                       VK_QUEUE_FAMILY_IGNORED);
+
+        vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_BLIT_BIT,
+                       VK_ACCESS_2_TRANSFER_WRITE_BIT,
+                       VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                       VK_QUEUE_FAMILY_IGNORED);
+
+        // The rects are passed through as given (not normalized) here
+        VkImageBlit region = {
+            .srcSubresource = {
+                .aspectMask = src_vk->aspect,
+                .layerCount = 1,
+            },
+            .dstSubresource = {
+                .aspectMask = dst_vk->aspect,
+                .layerCount = 1,
+            },
+            .srcOffsets = {{src_rc.x0, src_rc.y0, src_rc.z0},
+                           {src_rc.x1, src_rc.y1, src_rc.z1}},
+            .dstOffsets = {{dst_rc.x0, dst_rc.y0, dst_rc.z0},
+                           {dst_rc.x1, dst_rc.y1, dst_rc.z1}},
+        };
+
+        // Maps the requested sample mode to the corresponding Vulkan filter
+        static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = {
+            [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST,
+            [PL_TEX_SAMPLE_LINEAR]  = VK_FILTER_LINEAR,
+        };
+
+        vk->CmdBlitImage(cmd->buf, src_vk->img, src_vk->layout,
+                         dst_vk->img, dst_vk->layout, 1, &region,
+                         filters[params->sample_mode]);
+    }
+
+    CMD_FINISH(&cmd);
+}
+
+// Picks the queue type on which a buffer<->image copy of `region` should be
+// recorded. Textures not assigned to the TRANSFER queue simply keep their
+// assigned queue. Otherwise the transfer queue is only chosen when `region`
+// satisfies that pool's minImageTransferGranularity; if it does not, the
+// copy is redirected to a graphics or compute queue.
+static enum queue_type vk_img_copy_queue(pl_gpu gpu, pl_tex tex,
+                                         const struct VkBufferImageCopy *region)
+{
+    const struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+    if (tex_vk->transfer_queue != TRANSFER)
+        return tex_vk->transfer_queue;
+
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    const VkExtent3D gran = vk->pool_transfer->props.minImageTransferGranularity;
+    const VkOffset3D off = region->imageOffset;
+    const VkExtent3D ext = region->imageExtent;
+
+    // Where to go when the transfer queue can't take this region; prefer
+    // the async compute queue over graphics
+    const enum queue_type fallback =
+        gpu->limits.compute_queues > gpu->limits.fragment_queues ? COMPUTE
+                                                                 : GRAPHICS;
+
+    // Missing texture dimensions default to 1
+    const int tex_w = PL_DEF(tex->params.w, 1);
+    const int tex_h = PL_DEF(tex->params.h, 1);
+    const int tex_d = PL_DEF(tex->params.d, 1);
+
+    // Does the region end exactly at the image edge along each axis?
+    const bool full_w = off.x + ext.width  == tex_w;
+    const bool full_h = off.y + ext.height == tex_h;
+    const bool full_d = off.z + ext.depth  == tex_d;
+
+    bool unaligned;
+    if (!gran.width) {
+        // a granularity of {0} means the copy must span the entire image
+        unaligned = (off.x || !full_w)
+                  | (off.y || !full_h)
+                  | (off.z || !full_d);
+    } else {
+        // Offsets must be multiples of the granularity; so must extents,
+        // except where the region ends at the image edge on that axis
+        unaligned = (off.x % gran.width  != 0)
+                  | (off.y % gran.height != 0)
+                  | (off.z % gran.depth  != 0)
+                  | ((ext.width  % gran.width)  && !full_w)
+                  | ((ext.height % gran.height) && !full_h)
+                  | ((ext.depth  % gran.depth)  && !full_d);
+    }
+
+    return unaligned ? fallback : TRANSFER;
+}
+
+// Command callback trampoline: `ctx` carries the user's transfer callback
+// (a `void (*)(void *)`), and `arg` is the pointer it gets invoked with.
+static void tex_xfer_cb(void *ctx, void *arg)
+{
+    void (*user_cb)(void *) = ctx;
+    user_cb(arg);
+}
+
+// Upload texel data from `params->buf` into `params->tex`.
+//
+// Without a source buffer, this defers to `pl_tex_upload_pbo`. Emulated
+// formats, and transfers whose absolute buffer offset is not a multiple of
+// the texel size, take a slow path: the source data is first copied into
+// temporary device buffers (one per slice), each of which is then uploaded
+// via `pl_tex_upload_texel` (emulated formats) or `pl_tex_upload`. All other
+// transfers are recorded as a single `CmdCopyBufferToImage`.
+//
+// `params->callback`, if set, is attached to the command that reads from the
+// user's buffer. Returns false if any buffer creation or command fails.
+bool vk_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    pl_tex tex = params->tex;
+    pl_fmt fmt = tex->params.format;
+    struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+    struct pl_tex_transfer_params *slices = NULL;
+    int num_slices = 0;
+
+    if (!params->buf)
+        return pl_tex_upload_pbo(gpu, params);
+
+    pl_buf buf = params->buf;
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+    pl_rect3d rc = params->rc;
+    const size_t size = pl_tex_transfer_size(params);
+    const size_t buf_offset = buf_vk->mem.offset + params->buf_offset;
+    bool unaligned = buf_offset % fmt->texel_size;
+    if (unaligned)
+        PL_TRACE(gpu, "vk_tex_upload: unaligned transfer (slow path)");
+
+    if (fmt->emulated || unaligned) {
+
+        // Create all slice buffers first, to early-fail if OOM, and to avoid
+        // blocking unnecessarily on waiting for these buffers to get read from
+        num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices);
+        for (int i = 0; i < num_slices; i++) {
+            slices[i].buf = pl_buf_create(gpu, pl_buf_params(
+                .memory_type = PL_BUF_MEM_DEVICE,
+                .format      = tex_vk->texel_fmt,
+                .size        = pl_tex_transfer_size(&slices[i]),
+                .storable    = fmt->emulated,
+            ));
+
+            if (!slices[i].buf) {
+                PL_ERR(gpu, "Failed creating buffer for tex upload fallback!");
+                num_slices = i; // only clean up buffers up to here
+                goto error;
+            }
+        }
+
+        // All temporary buffers successfully created, begin copying source data
+        struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue,
+                                             params->timer);
+        if (!cmd)
+            goto error;
+
+        vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+                       VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size,
+                       false);
+
+        for (int i = 0; i < num_slices; i++) {
+            pl_buf slice = slices[i].buf;
+            struct pl_buf_vk *slice_vk = PL_PRIV(slice);
+            vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT,
+                           VK_ACCESS_2_TRANSFER_WRITE_BIT, 0, slice->params.size,
+                           false);
+
+            vk->CmdCopyBuffer(cmd->buf, buf_vk->mem.buf, slice_vk->mem.buf, 1, &(VkBufferCopy) {
+                .srcOffset = buf_vk->mem.offset + slices[i].buf_offset,
+                .dstOffset = slice_vk->mem.offset,
+                .size      = slice->params.size,
+            });
+        }
+
+        if (params->callback)
+            vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+        bool ok = CMD_FINISH(&cmd);
+
+        // Finally, dispatch the (texel) upload asynchronously. We can fire
+        // the callback already at the completion of previous command because
+        // these temporary buffers already hold persistent copies of the data
+        for (int i = 0; i < num_slices; i++) {
+            if (ok) {
+                // The data now lives at the start of the temporary buffer
+                slices[i].buf_offset = 0;
+                ok = fmt->emulated ? pl_tex_upload_texel(gpu, &slices[i])
+                                   : pl_tex_upload(gpu, &slices[i]);
+            }
+            // Temporary buffers are destroyed regardless of success
+            pl_buf_destroy(gpu, &slices[i].buf);
+        }
+
+        pl_free(slices);
+        return ok;
+
+    } else {
+
+        pl_assert(fmt->texel_align == fmt->texel_size);
+
+        // `rc` holds start/end coordinates, whereas VkBufferImageCopy wants
+        // an offset plus a size, so the extent is the difference of the two.
+        // This also keeps `offset + extent` meaningful for the edge checks
+        // performed by `vk_img_copy_queue`.
+        const VkBufferImageCopy region = {
+            .bufferOffset = buf_offset,
+            .bufferRowLength = params->row_pitch / fmt->texel_size,
+            .bufferImageHeight = params->depth_pitch / params->row_pitch,
+            .imageOffset = { rc.x0, rc.y0, rc.z0 },
+            .imageExtent = { rc.x1 - rc.x0, rc.y1 - rc.y0, rc.z1 - rc.z0 },
+            .imageSubresource = {
+                .aspectMask = tex_vk->aspect,
+                .layerCount = 1,
+            },
+        };
+
+        enum queue_type queue = vk_img_copy_queue(gpu, tex, &region);
+        struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer);
+        if (!cmd)
+            goto error;
+
+        vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+                       VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size,
+                       false);
+        vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT,
+                       VK_ACCESS_2_TRANSFER_WRITE_BIT,
+                       VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                       VK_QUEUE_FAMILY_IGNORED);
+        vk->CmdCopyBufferToImage(cmd->buf, buf_vk->mem.buf, tex_vk->img,
+                                 tex_vk->layout, 1, &region);
+
+        if (params->callback)
+            vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+        return CMD_FINISH(&cmd);
+    }
+
+    pl_unreachable();
+
+error:
+    // `num_slices` only counts the slices whose buffer was actually created
+    for (int i = 0; i < num_slices; i++)
+        pl_buf_destroy(gpu, &slices[i].buf);
+    pl_free(slices);
+    return false;
+}
+
+// Download texel data from `params->tex` into `params->buf`.
+//
+// Without a destination buffer, this defers to `pl_tex_download_pbo`.
+// Emulated formats, and transfers whose absolute buffer offset is not a
+// multiple of the texel size, take a slow path: each slice is first
+// downloaded into a temporary device buffer via `pl_tex_download_texel`
+// (emulated formats) or `pl_tex_download`, and then copied into the user's
+// buffer. All other transfers are recorded as a single `CmdCopyImageToBuffer`.
+//
+// `params->callback`, if set, is attached to the command that writes to the
+// user's buffer. Returns false if any buffer creation or command fails.
+bool vk_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    pl_tex tex = params->tex;
+    pl_fmt fmt = tex->params.format;
+    struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+    struct pl_tex_transfer_params *slices = NULL;
+    int num_slices = 0;
+
+    if (!params->buf)
+        return pl_tex_download_pbo(gpu, params);
+
+    pl_buf buf = params->buf;
+    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+    pl_rect3d rc = params->rc;
+    const size_t size = pl_tex_transfer_size(params);
+    const size_t buf_offset = buf_vk->mem.offset + params->buf_offset;
+    bool unaligned = buf_offset % fmt->texel_size;
+    if (unaligned)
+        PL_TRACE(gpu, "vk_tex_download: unaligned transfer (slow path)");
+
+    if (fmt->emulated || unaligned) {
+
+        // Create all temporary slice buffers up front, failing early on error
+        num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices);
+        for (int i = 0; i < num_slices; i++) {
+            slices[i].buf = pl_buf_create(gpu, pl_buf_params(
+                .memory_type = PL_BUF_MEM_DEVICE,
+                .format      = tex_vk->texel_fmt,
+                .size        = pl_tex_transfer_size(&slices[i]),
+                .storable    = fmt->emulated,
+            ));
+
+            if (!slices[i].buf) {
+                PL_ERR(gpu, "Failed creating buffer for tex download fallback!");
+                num_slices = i; // only clean up buffers up to here
+                goto error;
+            }
+        }
+
+        for (int i = 0; i < num_slices; i++) {
+            // Restore buffer offset after downloading into temporary buffer,
+            // because we still need to copy the data from the temporary buffer
+            // into this offset in the original buffer
+            const size_t tmp_offset = slices[i].buf_offset;
+            slices[i].buf_offset = 0;
+            bool ok = fmt->emulated ? pl_tex_download_texel(gpu, &slices[i])
+                                    : pl_tex_download(gpu, &slices[i]);
+            slices[i].buf_offset = tmp_offset;
+            if (!ok)
+                goto error;
+        }
+
+        // Finally, download into the user buffer
+        struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue, params->timer);
+        if (!cmd)
+            goto error;
+
+        vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+                       VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size,
+                       false);
+
+        for (int i = 0; i < num_slices; i++) {
+            pl_buf slice = slices[i].buf;
+            struct pl_buf_vk *slice_vk = PL_PRIV(slice);
+            vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT,
+                           VK_ACCESS_2_TRANSFER_READ_BIT, 0, slice->params.size,
+                           false);
+
+            vk->CmdCopyBuffer(cmd->buf, slice_vk->mem.buf, buf_vk->mem.buf, 1, &(VkBufferCopy) {
+                .srcOffset = slice_vk->mem.offset,
+                .dstOffset = buf_vk->mem.offset + slices[i].buf_offset,
+                .size      = slice->params.size,
+            });
+
+            // The copy has been recorded, so drop our handle on the slice
+            pl_buf_destroy(gpu, &slices[i].buf);
+        }
+
+        vk_buf_flush(gpu, cmd, buf, params->buf_offset, size);
+
+        if (params->callback)
+            vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+        pl_free(slices);
+        return CMD_FINISH(&cmd);
+
+    } else {
+
+        pl_assert(params->row_pitch % fmt->texel_size == 0);
+        pl_assert(params->depth_pitch % params->row_pitch == 0);
+
+        // `rc` holds start/end coordinates, whereas VkBufferImageCopy wants
+        // an offset plus a size, so the extent is the difference of the two.
+        // This also keeps `offset + extent` meaningful for the edge checks
+        // performed by `vk_img_copy_queue`.
+        const VkBufferImageCopy region = {
+            .bufferOffset = buf_offset,
+            .bufferRowLength = params->row_pitch / fmt->texel_size,
+            .bufferImageHeight = params->depth_pitch / params->row_pitch,
+            .imageOffset = { rc.x0, rc.y0, rc.z0 },
+            .imageExtent = { rc.x1 - rc.x0, rc.y1 - rc.y0, rc.z1 - rc.z0 },
+            .imageSubresource = {
+                .aspectMask = tex_vk->aspect,
+                .layerCount = 1,
+            },
+        };
+
+        enum queue_type queue = vk_img_copy_queue(gpu, tex, &region);
+
+        struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer);
+        if (!cmd)
+            goto error;
+
+        vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+                       VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size,
+                       false);
+        vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT,
+                       VK_ACCESS_2_TRANSFER_READ_BIT,
+                       VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+                       VK_QUEUE_FAMILY_IGNORED);
+        vk->CmdCopyImageToBuffer(cmd->buf, tex_vk->img, tex_vk->layout,
+                                 buf_vk->mem.buf, 1, &region);
+        vk_buf_flush(gpu, cmd, buf, params->buf_offset, size);
+
+        if (params->callback)
+            vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+        return CMD_FINISH(&cmd);
+    }
+
+    pl_unreachable();
+
+error:
+    // `num_slices` only counts the slices whose buffer was actually created
+    for (int i = 0; i < num_slices; i++)
+        pl_buf_destroy(gpu, &slices[i].buf);
+    pl_free(slices);
+    return false;
+}
+
+// Poll whether `tex` is still referenced by something other than its owner.
+//
+// Returns true if the texture's reference count is still above one after
+// submitting queued commands and polling for up to `timeout`, or if the same
+// holds for any of its planes; returns false otherwise.
+bool vk_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout)
+{
+    struct pl_vk *p = PL_PRIV(gpu);
+    struct vk_ctx *vk = p->vk;
+    struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+    // Opportunistically check if we can re-use this texture without flush
+    vk_poll_commands(vk, 0);
+
+    if (pl_rc_count(&tex_vk->rc) != 1) {
+        // Otherwise, we're forced to submit any queued command so that the
+        // user is guaranteed to see progress eventually, even if they call
+        // this in a loop
+        CMD_SUBMIT(NULL);
+        vk_poll_commands(vk, timeout);
+        if (pl_rc_count(&tex_vk->rc) > 1)
+            return true;
+    }
+
+    // The texture itself is idle; the result now depends on its planes
+    for (int plane = 0; plane < tex_vk->num_planes; plane++) {
+        if (vk_tex_poll(gpu, tex->planes[plane], timeout))
+            return true;
+    }
+
+    return false;
+}
+
+// Export `tex` for use outside of this device, synchronized through `sync`.
+//
+// Records a barrier transitioning the texture to VK_IMAGE_LAYOUT_GENERAL and
+// to VK_QUEUE_FAMILY_EXTERNAL, signals `sync`'s wait semaphore from that
+// command and submits it. On success, `sync`'s signal semaphore is added to
+// the texture's external dependencies and a reference to `sync` is retained.
+//
+// Planar textures are rejected. Returns false (after logging) on failure.
+bool vk_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync)
+{
+    struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+    struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+
+    if (tex_vk->num_planes) {
+        // Note the trailing space: the two literals are concatenated
+        PL_ERR(gpu, "`pl_tex_export` cannot be called on planar textures. "
+               "Please see `pl_vulkan_hold_ex` for a replacement.");
+        return false;
+    }
+
+    struct vk_cmd *cmd = CMD_BEGIN(ANY);
+    if (!cmd)
+        goto error;
+
+    vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_NONE,
+                   0, VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_EXTERNAL);
+
+    // Make the next barrier appear as though coming from a different queue
+    tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL;
+
+    vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, (pl_vulkan_sem){ sync_vk->wait });
+    if (!CMD_SUBMIT(&cmd))
+        goto error;
+
+    // Remember the other dependency and hold on to the sync object
+    PL_ARRAY_APPEND(tex, tex_vk->ext_deps, (pl_vulkan_sem){ sync_vk->signal });
+    pl_rc_ref(&sync_vk->rc);
+    tex_vk->ext_sync = sync;
+    tex_vk->qf = VK_QUEUE_FAMILY_EXTERNAL;
+    return true;
+
+error:
+    PL_ERR(gpu, "Failed exporting shared texture!");
+    return false;
+}
+
+// Wrap an externally created VkImage (`params->image`) into a `pl_tex`.
+//
+// The texture's capabilities are derived from `params->usage`, and then
+// reduced to what the matching `pl_fmt` supports. For planar formats, the
+// base texture gets no capabilities of its own; instead, each plane is
+// wrapped recursively as a separate texture referring to the same image.
+//
+// Returns the new texture, or NULL if no matching format exists or if
+// wrapping a plane / initializing the image fails.
+pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params)
+{
+    // Find the pl_fmt whose vulkan format equals the requested one
+    pl_fmt fmt = NULL;
+    for (int i = 0; i < gpu->num_formats; i++) {
+        const struct vk_format **vkfmt = PL_PRIV(gpu->formats[i]);
+        if ((*vkfmt)->tfmt == params->format) {
+            fmt = gpu->formats[i];
+            break;
+        }
+    }
+
+    if (!fmt) {
+        PL_ERR(gpu, "Could not find pl_fmt suitable for wrapped image "
+               "with format %s", vk_fmt_name(params->format));
+        return NULL;
+    }
+
+    VkImageUsageFlags usage = params->usage;
+    if (fmt->num_planes)
+        usage = 0; // mask capabilities from the base texture
+
+    struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk);
+    tex->params = (struct pl_tex_params) {
+        .format         = fmt,
+        .w              = params->width,
+        .h              = params->height,
+        .d              = params->depth,
+        .sampleable     = !!(usage & VK_IMAGE_USAGE_SAMPLED_BIT),
+        .renderable     = !!(usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT),
+        .storable       = !!(usage & VK_IMAGE_USAGE_STORAGE_BIT),
+        .blit_src       = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
+        .blit_dst       = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT),
+        .host_writable  = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT),
+        .host_readable  = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
+        .user_data      = params->user_data,
+        .debug_tag      = params->debug_tag,
+    };
+
+    // Mask out capabilities not permitted by the `pl_fmt`
+#define MASK(field, cap)                                                        \
+    do {                                                                        \
+        if (tex->params.field && !(fmt->caps & cap)) {                          \
+            PL_WARN(gpu, "Masking `" #field "` from wrapped texture because "   \
+                    "the corresponding format '%s' does not support " #cap,     \
+                    fmt->name);                                                 \
+            tex->params.field = false;                                          \
+        }                                                                       \
+    } while (0)
+
+    MASK(sampleable,    PL_FMT_CAP_SAMPLEABLE);
+    MASK(renderable,    PL_FMT_CAP_RENDERABLE);
+    MASK(storable,      PL_FMT_CAP_STORABLE);
+    MASK(blit_src,      PL_FMT_CAP_BLITTABLE);
+    MASK(blit_dst,      PL_FMT_CAP_BLITTABLE);
+    MASK(host_readable, PL_FMT_CAP_HOST_READABLE);
+#undef MASK
+
+    // For simplicity, explicitly mask out blit emulation for wrapped textures
+    struct pl_fmt_vk *fmtp = PL_PRIV(fmt);
+    if (fmtp->blit_emulated) {
+        tex->params.blit_src = false;
+        tex->params.blit_dst = false;
+    }
+
+    struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+    switch (pl_tex_params_dimension(tex->params)) {
+    case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break;
+    case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break;
+    case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break;
+    }
+    tex_vk->external_img = true;
+    // Non-planar wrapped textures start out in the "held" state; for planar
+    // textures, this state is tracked on the individual planes instead
+    tex_vk->held = !fmt->num_planes;
+    tex_vk->img = params->image;
+    tex_vk->img_fmt = params->format;
+    tex_vk->num_planes = fmt->num_planes;
+    tex_vk->usage_flags = usage;
+    tex_vk->aspect = params->aspect;
+
+    // No aspect given: use all plane aspects for planar formats, or the
+    // color aspect otherwise
+    if (!tex_vk->aspect) {
+        for (int i = 0; i < tex_vk->num_planes; i++)
+            tex_vk->aspect |= VK_IMAGE_ASPECT_PLANE_0_BIT << i;
+        tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT);
+    }
+
+    // Blitting to planar images requires fallback via compute shaders
+    if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) {
+        tex->params.blit_src &= tex->params.storable;
+        tex->params.blit_dst &= tex->params.storable;
+    }
+
+    // Default debug tags for planes, indexed by plane number
+    // NOTE(review): assumes `num_planes` never exceeds 4 - confirm
+    static const char * const wrapped_plane_names[4] = {
+        "wrapped plane 0", "wrapped plane 1", "wrapped plane 2", "wrapped plane 3",
+    };
+
+    for (int i = 0; i < tex_vk->num_planes; i++) {
+        struct pl_tex_t *plane;
+        VkImageAspectFlags aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << i;
+        if (!(aspect & tex_vk->aspect)) {
+            // Skipped planes leave `tex->planes[i]` / `tex_vk->planes[i]`
+            // at their zero-initialized value
+            PL_INFO(gpu, "Not wrapping plane %d due to aspect bit 0x%x not "
+                    "being contained in supplied params->aspect 0x%x!",
+                    i, (unsigned) aspect, (unsigned) tex_vk->aspect);
+            continue;
+        }
+
+        pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D);
+        // Each plane wraps the same image with a single plane aspect, the
+        // plane's own format, and dimensions shifted down by the subsampling
+        plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params(
+            .image      = tex_vk->img,
+            .aspect     = aspect,
+            .width      = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x),
+            .height     = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y),
+            .format     = fmtp->vk_fmt->pfmt[i].fmt,
+            .usage      = params->usage,
+            .user_data  = params->user_data,
+            .debug_tag  = PL_DEF(params->debug_tag, wrapped_plane_names[i]),
+        ));
+        if (!plane)
+            goto error;
+        plane->parent = tex;
+        tex->planes[i] = plane;
+        tex_vk->planes[i] = PL_PRIV(plane);
+    }
+
+    if (!vk_init_image(gpu, tex, PL_DEF(params->debug_tag, "wrapped")))
+        goto error;
+
+    return tex;
+
+error:
+    vk_tex_destroy(gpu, tex);
+    return NULL;
+}
+
+// Return the VkImage backing `tex`. If non-NULL, `out_format` receives the
+// image's VkFormat and `out_flags` its usage flags.
+VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex, VkFormat *out_format,
+                         VkImageUsageFlags *out_flags)
+{
+    struct pl_tex_vk *priv = PL_PRIV(tex);
+
+    // Both outputs are optional
+    if (out_flags)
+        *out_flags = priv->usage_flags;
+
+    if (out_format)
+        *out_format = priv->img_fmt;
+
+    return priv->img;
+}
+
+// Put `params->tex` into the "held" state.
+//
+// Records a barrier transitioning the texture (or each of its planes) to the
+// requested layout and queue family, signals `params->semaphore` from that
+// command and submits it. If `params->out_layout` is set, the texture's
+// current layout is used instead of `params->layout`, and reported back
+// through that pointer on success.
+//
+// Returns false if the texture (or any plane) is already held, or if the
+// command could not be created or submitted.
+bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params)
+{
+    struct pl_tex_vk *tex_vk = PL_PRIV(params->tex);
+    pl_assert(params->semaphore.sem);
+
+    // A planar texture counts as held if any of its planes is
+    bool held = tex_vk->held;
+    for (int i = 0; i < tex_vk->num_planes; i++)
+        held |= tex_vk->planes[i]->held;
+
+    if (held) {
+        PL_ERR(gpu, "Attempting to hold an already held image!");
+        return false;
+    }
+
+    struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS);
+    if (!cmd) {
+        PL_ERR(gpu, "Failed holding external image!");
+        return false;
+    }
+
+    VkImageLayout layout = params->layout;
+    if (params->out_layout) {
+        // For planar images, arbitrarily pick the current image layout of the
+        // first plane. This should be fine in practice, since all planes will
+        // share the same usage capabilities.
+        if (tex_vk->num_planes) {
+            layout = tex_vk->planes[0]->layout;
+        } else {
+            layout = tex_vk->layout;
+        }
+    }
+
+    // Stays true only if every transitioned texture/plane has
+    // `may_invalidate` set
+    bool may_invalidate = true;
+    if (!tex_vk->num_planes) {
+        may_invalidate &= tex_vk->may_invalidate;
+        vk_tex_barrier(gpu, cmd, params->tex, VK_PIPELINE_STAGE_2_NONE,
+                       0, layout, params->qf);
+    }
+
+    for (int i = 0; i < tex_vk->num_planes; i++) {
+        may_invalidate &= tex_vk->planes[i]->may_invalidate;
+        vk_tex_barrier(gpu, cmd, params->tex->planes[i],
+                       VK_PIPELINE_STAGE_2_NONE, 0, layout, params->qf);
+    }
+
+    vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, params->semaphore);
+    bool ok = CMD_SUBMIT(&cmd);
+
+    // The held flag follows the submission result; the remembered queues are
+    // cleared either way
+    if (!tex_vk->num_planes) {
+        tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL;
+        tex_vk->held = ok;
+    }
+
+    for (int i = 0; i < tex_vk->num_planes; i++) {
+        struct pl_tex_vk *plane_vk = tex_vk->planes[i];
+        plane_vk->sem.write.queue = plane_vk->sem.read.queue = NULL;
+        plane_vk->held = ok;
+    }
+
+    // Report VK_IMAGE_LAYOUT_UNDEFINED if the contents may be invalidated
+    if (ok && params->out_layout)
+        *params->out_layout = may_invalidate ? VK_IMAGE_LAYOUT_UNDEFINED : layout;
+
+    return ok;
+}
+
+// Release a previously held texture, recording the layout and queue family
+// it was left in. If `params->semaphore` is set, it is added to the
+// texture's external dependencies. Planar textures are handled by releasing
+// each plane with the same parameters.
+void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params)
+{
+    pl_tex tex = params->tex;
+    struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+    if (tex_vk->num_planes) {
+        // Recurse into every plane, overriding only the texture
+        for (int i = 0; i < tex_vk->num_planes; i++) {
+            struct pl_vulkan_release_params plane_pars = *params;
+            plane_pars.tex = tex->planes[i];
+            pl_vulkan_release_ex(gpu, &plane_pars);
+        }
+        return;
+    }
+
+    if (tex_vk->held) {
+        if (params->semaphore.sem) {
+            PL_ARRAY_APPEND(tex, tex_vk->ext_deps, params->semaphore);
+        }
+
+        tex_vk->held = false;
+        tex_vk->layout = params->layout;
+        tex_vk->qf = params->qf;
+    } else {
+        PL_ERR(gpu, "Attempting to release an unheld image?");
+    }
+}
+
+// Convenience wrapper around `pl_vulkan_hold_ex`: hold `tex` in the given
+// `layout`, signalling `sem_out`, without a queue family transfer
+// (VK_QUEUE_FAMILY_IGNORED).
+bool pl_vulkan_hold(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+                    pl_vulkan_sem sem_out)
+{
+    const struct pl_vulkan_hold_params *hold = pl_vulkan_hold_params(
+        .qf         = VK_QUEUE_FAMILY_IGNORED,
+        .semaphore  = sem_out,
+        .layout     = layout,
+        .tex        = tex,
+    );
+
+    return pl_vulkan_hold_ex(gpu, hold);
+}
+
+// Convenience wrapper around `pl_vulkan_hold_ex`: hold `tex` in whatever
+// layout it currently has, reporting it via `out_layout` and signalling
+// `sem_out`, without a queue family transfer (VK_QUEUE_FAMILY_IGNORED).
+bool pl_vulkan_hold_raw(pl_gpu gpu, pl_tex tex,
+                        VkImageLayout *out_layout,
+                        pl_vulkan_sem sem_out)
+{
+    const struct pl_vulkan_hold_params *hold = pl_vulkan_hold_params(
+        .qf         = VK_QUEUE_FAMILY_IGNORED,
+        .semaphore  = sem_out,
+        .out_layout = out_layout,
+        .tex        = tex,
+    );
+
+    return pl_vulkan_hold_ex(gpu, hold);
+}
+
+// Convenience wrapper around `pl_vulkan_release_ex`: release `tex`, left in
+// `layout` and guarded by `sem_in`, without a queue family transfer
+// (VK_QUEUE_FAMILY_IGNORED).
+void pl_vulkan_release(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+                       pl_vulkan_sem sem_in)
+{
+    const struct pl_vulkan_release_params *release = pl_vulkan_release_params(
+        .qf         = VK_QUEUE_FAMILY_IGNORED,
+        .semaphore  = sem_in,
+        .layout     = layout,
+        .tex        = tex,
+    );
+
+    pl_vulkan_release_ex(gpu, release);
+}
diff --git a/src/vulkan/malloc.c b/src/vulkan/malloc.c
new file mode 100644
index 0000000..c35183b
--- /dev/null
+++ b/src/vulkan/malloc.c
@@ -0,0 +1,1058 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "malloc.h"
+#include "command.h"
+#include "utils.h"
+#include "pl_thread.h"
+
+#ifdef PL_HAVE_UNIX
+#include <errno.h>
+#include <unistd.h>
+#endif
+
+// Controls the page size alignment, to help coalesce allocations into the same
+// slab. Pages are rounded up to multiples of this value. (Default: 4 KiB)
+#define PAGE_SIZE_ALIGN (1LLU << 12)
+
+// Controls the minimum/maximum number of pages for new slabs. As slabs are
+// exhausted of memory, the number of pages per new slab grows exponentially,
+// starting with the minimum until the maximum is reached.
+//
+// Note: The maximum must never exceed the size (in bits) of
+// `vk_slab.spacemap`, which is a `uint64_t` bitset with one bit per page.
+#define MINIMUM_PAGE_COUNT 4
+#define MAXIMUM_PAGE_COUNT (sizeof(uint64_t) * 8)
+
+// Controls the maximum page size. Any allocations above this threshold
+// (absolute size or fraction of VRAM, whichever is higher) will be served by
+// dedicated allocations. (Default: 64 MiB or 1/16 of VRAM)
+#define MAXIMUM_PAGE_SIZE_ABSOLUTE (1LLU << 26)
+#define MAXIMUM_PAGE_SIZE_RELATIVE 16
+
+// Controls the minimum slab size, to avoid excessive re-allocation of very
+// small slabs. (Default: 256 KiB)
+#define MINIMUM_SLAB_SIZE (1LLU << 18)
+
+// How long to wait before garbage collecting empty slabs. Slabs older than
+// this many invocations of `vk_malloc_garbage_collect` will be released.
+#define MAXIMUM_SLAB_AGE 32
+
+// A single slab represents a contiguous region of allocated memory. Actual
+// allocations are served as pages of this. Slabs are organized into pools,
+// each of which contains a list of slabs of differing page sizes.
+struct vk_slab {
+    pl_mutex lock;          // held while accessing the accounting fields below
+    pl_debug_tag debug_tag; // debug tag of the triggering allocation
+    VkDeviceMemory mem;     // underlying device allocation
+    VkDeviceSize size;      // total allocated size of `mem`
+    VkMemoryType mtype;     // underlying memory type
+    bool dedicated;         // slab is allocated specifically for one object
+    bool imported;          // slab represents an imported memory allocation
+
+    // free space accounting (only for non-dedicated slabs)
+    uint64_t spacemap;      // bitset of available pages (set bit = free page)
+    size_t pagesize;        // size in bytes per page
+    size_t used;            // number of bytes actually in use
+    uint64_t age;           // timestamp of last use
+
+    // optional, depends on the memory type:
+    VkBuffer buffer;        // buffer spanning the entire slab
+    void *data;             // mapped memory corresponding to `mem`
+    bool coherent;          // mapped memory is coherent
+    union pl_handle handle; // handle associated with this device memory
+    enum pl_handle_type handle_type; // selects the active member of `handle`
+};
+
+// Represents a single memory pool. We keep track of a vk_pool for each
+// combination of malloc parameters. This shouldn't actually be that many in
+// practice, because some combinations simply never occur, and others will
+// generally be the same for the same objects.
+//
+// Note: `vk_pool` addresses are not immutable (pools are stored by value in
+// the `vk_malloc.pools` array), so we mustn't expose any dangling references
+// to a `vk_pool` from e.g. `vk_memslice.priv = vk_slab`.
+struct vk_pool {
+    struct vk_malloc_params params;   // allocation params (with some fields nulled)
+    PL_ARRAY(struct vk_slab *) slabs; // array of slabs, unsorted
+    int index;                        // running index in `vk_malloc.pools`
+};
+
+// The overall state of the allocator, which keeps track of a vk_pool for each
+// memory type.
+struct vk_malloc {
+    struct vk_ctx *vk;                      // owning vulkan context
+    pl_mutex lock;                          // held while accessing `pools`
+    VkPhysicalDeviceMemoryProperties props; // memory heaps/types of the device
+    size_t maximum_page_size;               // presumably derived from MAXIMUM_PAGE_SIZE_* - confirm
+    PL_ARRAY(struct vk_pool) pools;         // pools, stored by value
+    uint64_t age;                           // presumably the current garbage collection "time" - confirm
+};
+
+// Ratio of `used` to `total`, expressed as a percentage. An empty total
+// counts as fully efficient (100%).
+static inline float efficiency(size_t used, size_t total)
+{
+    return total ? 100.0f * used / total : 100.0f;
+}
+
+// Format `size` into `buf` as a short human-readable string: the plain
+// number (padded to 5 characters) for values up to 9999, otherwise the value
+// divided by 1024 as often as needed (at most three times), padded to 4
+// characters and followed by a 'K', 'M' or 'G' suffix.
+//
+// Returns `buf`, or a static error string if formatting failed.
+static const char *print_size(char buf[8], size_t size)
+{
+    static const char units[] = { '\0', 'K', 'M', 'G' };
+    size_t unit = 0;
+
+    // Scale down until the value fits, or until we run out of units
+    while (unit + 1 < sizeof(units) && size > 9999) {
+        size >>= 10;
+        unit++;
+    }
+
+    int ret;
+    if (units[unit]) {
+        ret = snprintf(buf, 8, "%4zu%c", size, units[unit]);
+    } else {
+        ret = snprintf(buf, 8, "%5zu", size);
+    }
+
+    if (ret < 0)
+        return "(error)";
+
+    return buf;
+}
+
+// Format a size into a temporary buffer living in the enclosing block
+#define PRINT_SIZE(x) (print_size((char[8]){0}, (size_t) (x)))
+
+// Log a summary of the allocator state: the device's memory heaps (at level
+// `lev`) and memory types (at debug level), followed by per-slab, per-pool
+// and overall usage statistics (at level `lev`).
+//
+// Terminology used in the output:
+//   used  - bytes actually in use (`vk_slab.used`)
+//   res   - bytes reserved, i.e. slab size minus the size of all free pages
+//   alloc - total size of the underlying allocations
+void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level lev)
+{
+    struct vk_ctx *vk = ma->vk;
+    size_t total_size = 0;
+    size_t total_used = 0;
+    size_t total_res = 0;
+
+    PL_MSG(vk, lev, "Memory heaps supported by device:");
+    for (int i = 0; i < ma->props.memoryHeapCount; i++) {
+        VkMemoryHeap heap = ma->props.memoryHeaps[i];
+        PL_MSG(vk, lev, "    %d: flags 0x%x size %s",
+                i, (unsigned) heap.flags, PRINT_SIZE(heap.size));
+    }
+
+    // Note: memory types are always logged at debug level, regardless of `lev`
+    PL_DEBUG(vk, "Memory types supported by device:");
+    for (int i = 0; i < ma->props.memoryTypeCount; i++) {
+        VkMemoryType type = ma->props.memoryTypes[i];
+        PL_DEBUG(vk, "    %d: flags 0x%x heap %d",
+                 i, (unsigned) type.propertyFlags, (int) type.heapIndex);
+    }
+
+    // The allocator lock is held while walking the pools; each slab's own
+    // lock is additionally taken while reading its accounting fields
+    pl_mutex_lock(&ma->lock);
+    for (int i = 0; i < ma->pools.num; i++) {
+        struct vk_pool *pool = &ma->pools.elem[i];
+        const struct vk_malloc_params *par = &pool->params;
+
+        PL_MSG(vk, lev, "Memory pool %d:", i);
+        PL_MSG(vk, lev, "    Compatible types: 0x%"PRIx32, par->reqs.memoryTypeBits);
+        if (par->required)
+            PL_MSG(vk, lev, "    Required flags: 0x%"PRIx32, par->required);
+        if (par->optimal)
+            PL_MSG(vk, lev, "    Optimal flags: 0x%"PRIx32, par->optimal);
+        if (par->buf_usage)
+            PL_MSG(vk, lev, "    Buffer flags: 0x%"PRIx32, par->buf_usage);
+        if (par->export_handle)
+            PL_MSG(vk, lev, "    Export handle: 0x%x", par->export_handle);
+
+        size_t pool_size = 0;
+        size_t pool_used = 0;
+        size_t pool_res = 0;
+
+        for (int j = 0; j < pool->slabs.num; j++) {
+            struct vk_slab *slab = pool->slabs.elem[j];
+            pl_mutex_lock(&slab->lock);
+
+            // Each set bit in `spacemap` is one free page; everything that is
+            // not free counts as reserved
+            size_t avail = __builtin_popcountll(slab->spacemap) * slab->pagesize;
+            size_t slab_res = slab->size - avail;
+
+            PL_MSG(vk, lev, "    Slab %2d: %8"PRIx64" x %s: "
+                   "%s used %s res %s alloc from heap %d, efficiency %.2f%%  [%s]",
+                   j, slab->spacemap, PRINT_SIZE(slab->pagesize),
+                   PRINT_SIZE(slab->used), PRINT_SIZE(slab_res),
+                   PRINT_SIZE(slab->size), (int) slab->mtype.heapIndex,
+                   efficiency(slab->used, slab_res),
+                   PL_DEF(slab->debug_tag, "unknown"));
+
+            pool_size += slab->size;
+            pool_used += slab->used;
+            pool_res += slab_res;
+            pl_mutex_unlock(&slab->lock);
+        }
+
+        // efficiency = used / reserved, utilization = reserved / allocated
+        PL_MSG(vk, lev, "    Pool summary: %s used %s res %s alloc, "
+               "efficiency %.2f%%, utilization %.2f%%",
+               PRINT_SIZE(pool_used), PRINT_SIZE(pool_res),
+               PRINT_SIZE(pool_size), efficiency(pool_used, pool_res),
+               efficiency(pool_res, pool_size));
+
+        total_size += pool_size;
+        total_used += pool_used;
+        total_res += pool_res;
+    }
+    pl_mutex_unlock(&ma->lock);
+
+    PL_MSG(vk, lev, "Memory summary: %s used %s res %s alloc, "
+           "efficiency %.2f%%, utilization %.2f%%, max page: %s",
+           PRINT_SIZE(total_used), PRINT_SIZE(total_res),
+           PRINT_SIZE(total_size), efficiency(total_used, total_res),
+           efficiency(total_res, total_size),
+           PRINT_SIZE(ma->maximum_page_size));
+}
+
+// Destroy a slab: close any handle owned by it, destroy its buffer, free the
+// underlying device memory and release the slab structure itself. Passing a
+// NULL slab is a no-op.
+//
+// In debug builds, freeing a non-dedicated slab that still has bytes in use
+// is reported as a leak and triggers `pl_debug_abort`.
+static void slab_free(struct vk_ctx *vk, struct vk_slab *slab)
+{
+    if (!slab)
+        return;
+
+#ifndef NDEBUG
+    if (!slab->dedicated && slab->used > 0) {
+        PL_WARN(vk, "Leaked %zu bytes of vulkan memory!", slab->used);
+        PL_WARN(vk, "slab total size: %zu bytes, heap: %d, flags: 0x%"PRIX64,
+                (size_t) slab->size, (int) slab->mtype.heapIndex,
+                (uint64_t) slab->mtype.propertyFlags);
+        if (slab->debug_tag)
+            PL_WARN(vk, "last used for: %s", slab->debug_tag);
+        pl_log_stack_trace(vk->log, PL_LOG_WARN);
+        pl_debug_abort();
+    }
+#endif
+
+    if (slab->imported) {
+        // Imported handles are only logged here, never closed by this function
+        switch (slab->handle_type) {
+        case PL_HANDLE_FD:
+        case PL_HANDLE_DMA_BUF:
+            PL_TRACE(vk, "Unimporting slab of size %s from fd: %d",
+                     PRINT_SIZE(slab->size), slab->handle.fd);
+            break;
+        case PL_HANDLE_WIN32:
+        case PL_HANDLE_WIN32_KMT:
+#ifdef PL_HAVE_WIN32
+            PL_TRACE(vk, "Unimporting slab of size %s from handle: %p",
+                     PRINT_SIZE(slab->size), (void *) slab->handle.handle);
+#endif
+            break;
+        case PL_HANDLE_HOST_PTR:
+            PL_TRACE(vk, "Unimporting slab of size %s from ptr: %p",
+                     PRINT_SIZE(slab->size), (void *) slab->handle.ptr);
+            break;
+        case PL_HANDLE_IOSURFACE:
+        case PL_HANDLE_MTL_TEX:
+            pl_unreachable();
+        }
+    } else {
+        // Non-imported slabs own their handle, so close it where applicable
+        switch (slab->handle_type) {
+        case PL_HANDLE_FD:
+        case PL_HANDLE_DMA_BUF:
+#ifdef PL_HAVE_UNIX
+            if (slab->handle.fd > -1)
+                close(slab->handle.fd);
+#endif
+            break;
+        case PL_HANDLE_WIN32:
+#ifdef PL_HAVE_WIN32
+            if (slab->handle.handle != NULL)
+                CloseHandle(slab->handle.handle);
+#endif
+            break;
+        case PL_HANDLE_WIN32_KMT:
+            // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
+            break;
+        case PL_HANDLE_HOST_PTR:
+            // Implicitly unmapped
+            break;
+        case PL_HANDLE_IOSURFACE:
+        case PL_HANDLE_MTL_TEX:
+            pl_unreachable();
+        }
+
+        PL_DEBUG(vk, "Freeing slab of size %s", PRINT_SIZE(slab->size));
+    }
+
+    // The buffer is destroyed before the memory it spans is freed
+    vk->DestroyBuffer(vk->dev, slab->buffer, PL_VK_ALLOC);
+    // also implicitly unmaps the memory if needed
+    vk->FreeMemory(vk->dev, slab->mem, PL_VK_ALLOC);
+
+    pl_mutex_destroy(&slab->lock);
+    pl_free(slab);
+}
+
+// Picks the memory type to allocate from and stores its index in `*out_index`.
+// A candidate must carry all `params->required` flags, live in a heap that is
+// at least `params->reqs.size` large, and be permitted by both `type_mask` and
+// `params->reqs.memoryTypeBits`. Returns false (after logging an error) if no
+// memory type qualifies.
+//
+// type_mask: optional
+// thread-safety: safe
+static bool find_best_memtype(const struct vk_malloc *ma, uint32_t type_mask,
+                              const struct vk_malloc_params *params,
+                              uint32_t *out_index)
+{
+    struct vk_ctx *vk = ma->vk;
+    const VkMemoryPropertyFlags wanted = params->required;
+    int top_score = -1;
+
+    // The vulkan spec requires memory types to be sorted in the "optimal"
+    // order, so on equal score the earliest match wins. Among matches, we
+    // prefer whichever type carries the most of the optional flags.
+
+    type_mask &= params->reqs.memoryTypeBits;
+    for (int idx = 0; idx < ma->props.memoryTypeCount; idx++) {
+        const VkMemoryType *cand = &ma->props.memoryTypes[idx];
+        const VkMemoryHeap *heap = &ma->props.memoryHeaps[cand->heapIndex];
+
+        const bool has_flags = (cand->propertyFlags & wanted) == wanted;
+        const bool heap_fits = params->reqs.size <= heap->size;
+        const bool permitted = (type_mask & (1LU << idx)) != 0;
+        if (!has_flags || !heap_fits || !permitted)
+            continue;
+
+        // Score = number of optimal property flags this type provides
+        const int score = __builtin_popcountl(cand->propertyFlags & params->optimal);
+        if (score <= top_score)
+            continue;
+
+        top_score = score;
+        *out_index = idx;
+    }
+
+    if (top_score >= 0)
+        return true;
+
+    PL_ERR(vk, "Found no memory type matching property flags 0x%x and type "
+           "bits 0x%x!",
+           (unsigned) params->required, (unsigned) type_mask);
+    return false;
+}
+
+// Asks the physical device about external memory support for buffers of the
+// given `usage` and `handle_type`, and passes the answer (together with
+// `import`) on to vk_external_mem_check. Succeeds trivially when no handle
+// type is requested; fails when the handle type has no vulkan equivalent.
+static bool buf_external_check(struct vk_ctx *vk, VkBufferUsageFlags usage,
+                               enum pl_handle_type handle_type, bool import)
+{
+    if (handle_type == 0)
+        return true;
+
+    const VkExternalMemoryHandleTypeFlagBitsKHR vk_type =
+        vk_mem_handle_type(handle_type);
+    if (!vk_type)
+        return false;
+
+    VkPhysicalDeviceExternalBufferInfo query = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR,
+        .usage = usage,
+        .handleType = vk_type,
+    };
+
+    VkExternalBufferProperties result = {
+        .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR,
+    };
+
+    vk->GetPhysicalDeviceExternalBufferProperties(vk->physd, &query, &result);
+    return vk_external_mem_check(vk, &result.externalMemoryProperties,
+                                 handle_type, import);
+}
+
+// Allocates a brand new slab of (at least) `params->reqs.size` bytes of device
+// memory. If `params->buf_usage` is set, a VkBuffer spanning the whole slab is
+// created and bound at offset 0. Host-visible memory is mapped persistently,
+// and if `params->export_handle` is set, the corresponding fd / win32 handle
+// is retrieved. Returns NULL on failure, after cleaning up via slab_free.
+//
+// thread-safety: safe
+static struct vk_slab *slab_alloc(struct vk_malloc *ma,
+                                  const struct vk_malloc_params *params)
+{
+    struct vk_ctx *vk = ma->vk;
+    struct vk_slab *slab = pl_alloc_ptr(NULL, slab);
+    *slab = (struct vk_slab) {
+        .age = ma->age,
+        .size = params->reqs.size,
+        .handle_type = params->export_handle,
+        .debug_tag = params->debug_tag,
+    };
+    pl_mutex_init(&slab->lock);
+
+    // Start out with an "empty" handle value, so that slab_free (which is also
+    // used on the error path) never closes a handle we did not obtain
+    switch (slab->handle_type) {
+    case PL_HANDLE_FD:
+    case PL_HANDLE_DMA_BUF:
+        slab->handle.fd = -1;
+        break;
+    case PL_HANDLE_WIN32:
+    case PL_HANDLE_WIN32_KMT:
+    case PL_HANDLE_MTL_TEX:
+    case PL_HANDLE_IOSURFACE:
+        slab->handle.handle = NULL;
+        break;
+    case PL_HANDLE_HOST_PTR:
+        slab->handle.ptr = NULL;
+        break;
+    }
+
+    // Only linked into the allocation info below if exporting was requested
+    VkExportMemoryAllocateInfoKHR ext_info = {
+        .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR,
+        .handleTypes = vk_mem_handle_type(slab->handle_type),
+    };
+
+    // No restriction on the memory type, unless a buffer narrows it down below
+    uint32_t type_mask = UINT32_MAX;
+    if (params->buf_usage) {
+        // Queue family sharing modes don't matter for buffers, so we just
+        // set them as concurrent and stop worrying about it.
+        uint32_t qfs[3] = {0};
+        pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
+        for (int i = 0; i < vk->pools.num; i++)
+            qfs[i] = vk->pools.elem[i]->qf;
+
+        VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
+            .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
+            .handleTypes = ext_info.handleTypes,
+        };
+
+        VkBufferCreateInfo binfo = {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+            .pNext = slab->handle_type ? &ext_buf_info : NULL,
+            .size  = slab->size,
+            .usage = params->buf_usage,
+            .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
+                                             : VK_SHARING_MODE_EXCLUSIVE,
+            .queueFamilyIndexCount = vk->pools.num,
+            .pQueueFamilyIndices = qfs,
+        };
+
+        // Verify up front that this usage / handle type combination can be
+        // exported (`import` = false); trivially true without a handle type
+        if (!buf_external_check(vk, binfo.usage, slab->handle_type, false)) {
+            PL_ERR(vk, "Failed allocating shared memory buffer: possibly "
+                   "the handle type is unsupported?");
+            goto error;
+        }
+
+        VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &slab->buffer));
+        PL_VK_NAME(BUFFER, slab->buffer, "slab");
+
+        VkMemoryRequirements reqs = {0};
+        vk->GetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs);
+        slab->size = reqs.size; // this can be larger than `slab->size`
+        type_mask = reqs.memoryTypeBits;
+
+        // Note: we can ignore `reqs.align` because we always bind the buffer
+        // memory to offset 0
+    }
+
+    VkMemoryAllocateInfo minfo = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .allocationSize = slab->size,
+    };
+
+    // NOTE(review): vk_link_struct presumably appends the second struct to the
+    // first one's pNext chain - confirm against its definition
+    if (params->export_handle)
+        vk_link_struct(&minfo, &ext_info);
+
+    VkMemoryDedicatedAllocateInfoKHR dinfo = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
+        .image = params->ded_image,
+    };
+
+    if (params->ded_image)
+        vk_link_struct(&minfo, &dinfo);
+
+    // Logs its own error message if no memory type matches
+    if (!find_best_memtype(ma, type_mask, params, &minfo.memoryTypeIndex))
+        goto error;
+
+    const VkMemoryType *mtype = &ma->props.memoryTypes[minfo.memoryTypeIndex];
+    PL_DEBUG(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d: %s",
+             (size_t) slab->size, (unsigned) mtype->propertyFlags,
+             (int) minfo.memoryTypeIndex, (int) mtype->heapIndex,
+             PL_DEF(params->debug_tag, "unknown"));
+
+    // Timed so that the cost of the allocation can be reported below
+    pl_clock_t start = pl_clock_now();
+
+    VkResult res = vk->AllocateMemory(vk->dev, &minfo, PL_VK_ALLOC, &slab->mem);
+    switch (res) {
+    // Out-of-memory conditions additionally dump the allocator statistics
+    case VK_ERROR_OUT_OF_DEVICE_MEMORY:
+    case VK_ERROR_OUT_OF_HOST_MEMORY:
+        PL_ERR(vk, "Allocation of size %s failed: %s!",
+               PRINT_SIZE(slab->size), vk_res_str(res));
+        vk_malloc_print_stats(ma, PL_LOG_ERR);
+        pl_log_stack_trace(vk->log, PL_LOG_ERR);
+        pl_debug_abort();
+        goto error;
+
+    default:
+        PL_VK_ASSERT(res, "vkAllocateMemory");
+    }
+
+    // Remember the memory type, and keep host-visible memory mapped for the
+    // lifetime of the slab (slab->data)
+    slab->mtype = *mtype;
+    if (mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+        VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
+        slab->coherent = mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+    }
+
+    if (slab->buffer)
+        VK(vk->BindBufferMemory(vk->dev, slab->buffer, slab->mem, 0));
+
+    // Retrieve the exported handle, if one was requested; it gets stored in
+    // the slab and is closed again by slab_free where applicable
+#ifdef PL_HAVE_UNIX
+    if (slab->handle_type == PL_HANDLE_FD ||
+        slab->handle_type == PL_HANDLE_DMA_BUF)
+    {
+        VkMemoryGetFdInfoKHR fd_info = {
+            .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR,
+            .memory = slab->mem,
+            .handleType = ext_info.handleTypes,
+        };
+
+        VK(vk->GetMemoryFdKHR(vk->dev, &fd_info, &slab->handle.fd));
+    }
+#endif
+
+#ifdef PL_HAVE_WIN32
+    if (slab->handle_type == PL_HANDLE_WIN32 ||
+        slab->handle_type == PL_HANDLE_WIN32_KMT)
+    {
+        VkMemoryGetWin32HandleInfoKHR handle_info = {
+            .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR,
+            .memory = slab->mem,
+            .handleType = ext_info.handleTypes,
+        };
+
+        VK(vk->GetMemoryWin32HandleKHR(vk->dev, &handle_info,
+                                       &slab->handle.handle));
+    }
+#endif
+
+    pl_log_cpu_time(vk->log, start, pl_clock_now(), "allocating slab");
+
+    // free space accounting is done by the caller
+    return slab;
+
+error:
+    // slab_free copes with a partially constructed slab: the handle fields
+    // were initialized to "empty" values at the top of this function
+    if (params->debug_tag)
+        PL_ERR(vk, "  for malloc: %s", params->debug_tag);
+    slab_free(vk, slab);
+    return NULL;
+}
+
+// Frees every slab held by `pool` as well as the slab list itself, and leaves
+// the pool zero-initialized.
+static void pool_uninit(struct vk_ctx *vk, struct vk_pool *pool)
+{
+    int next = 0;
+    while (next < pool->slabs.num)
+        slab_free(vk, pool->slabs.elem[next++]);
+
+    pl_free(pool->slabs.elem);
+
+    static const struct vk_pool blank = {0};
+    *pool = blank;
+}
+
+// Creates a new allocator for `vk`. Queries the physical device's memory
+// properties, derives the maximum page size from the device-local heaps and
+// prints an initial statistics summary.
+struct vk_malloc *vk_malloc_create(struct vk_ctx *vk)
+{
+    struct vk_malloc *ma = pl_zalloc_ptr(NULL, ma);
+    ma->vk = vk;
+    pl_mutex_init(&ma->lock);
+    vk->GetPhysicalDeviceMemoryProperties(vk->physd, &ma->props);
+
+    // The maximum page size is the absolute limit, raised to a fraction of
+    // the largest device-local heap where that is bigger
+    ma->maximum_page_size = MAXIMUM_PAGE_SIZE_ABSOLUTE;
+    for (int h = 0; h < ma->props.memoryHeapCount; h++) {
+        const VkMemoryHeap *heap = &ma->props.memoryHeaps[h];
+        if (!(heap->flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT))
+            continue;
+
+        size_t size_max = heap->size / MAXIMUM_PAGE_SIZE_RELATIVE;
+        ma->maximum_page_size = PL_MAX(ma->maximum_page_size, size_max);
+    }
+
+    vk_malloc_print_stats(ma, PL_LOG_INFO);
+    return ma;
+}
+
+// Destroys the allocator pointed to by `*ma_ptr`: prints a final statistics
+// summary, uninitializes all pools and frees the allocator through
+// pl_free_ptr. Does nothing if `*ma_ptr` is NULL.
+void vk_malloc_destroy(struct vk_malloc **ma_ptr)
+{
+    struct vk_malloc *ma = *ma_ptr;
+    if (ma) {
+        vk_malloc_print_stats(ma, PL_LOG_DEBUG);
+
+        int p = 0;
+        while (p < ma->pools.num) {
+            pool_uninit(ma->vk, &ma->pools.elem[p]);
+            p++;
+        }
+
+        pl_mutex_destroy(&ma->lock);
+        pl_free_ptr(ma_ptr);
+    }
+}
+
+// Advances the allocator's age counter by one, then frees every slab that is
+// completely unused and was last touched more than MAXIMUM_SLAB_AGE ticks ago.
+void vk_malloc_garbage_collect(struct vk_malloc *ma)
+{
+    struct vk_ctx *vk = ma->vk;
+
+    pl_mutex_lock(&ma->lock);
+    ma->age += 1;
+
+    for (int p = 0; p < ma->pools.num; p++) {
+        struct vk_pool *pool = &ma->pools.elem[p];
+
+        for (int n = 0; n < pool->slabs.num; n++) {
+            struct vk_slab *slab = pool->slabs.elem[n];
+
+            pl_mutex_lock(&slab->lock);
+            const bool collect = !slab->used &&
+                                 (ma->age - slab->age) > MAXIMUM_SLAB_AGE;
+            if (collect) {
+                PL_DEBUG(vk, "Garbage collected slab of size %s from pool %d",
+                         PRINT_SIZE(slab->size), pool->index);
+            }
+            pl_mutex_unlock(&slab->lock);
+
+            if (!collect)
+                continue;
+
+            // Step the index back so the slab that moves into this slot is
+            // visited by the next iteration
+            slab_free(vk, slab);
+            PL_ARRAY_REMOVE_AT(pool->slabs, n--);
+        }
+    }
+
+    pl_mutex_unlock(&ma->lock);
+}
+
+// Builds the set of handle types from vk_mem_handle_list that pass
+// buf_external_check for a plain transfer-destination buffer.
+pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import)
+{
+    // We probe with a "basic" buffer that has no fancy usage flags. More
+    // specific checks happen later on, at VkBuffer creation time; this just
+    // gives a rough idea of what the driver supports.
+    const VkBufferUsageFlags probe_usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+    struct vk_ctx *vk = ma->vk;
+    pl_handle_caps supported = 0;
+
+    for (int i = 0; vk_mem_handle_list[i]; i++) {
+        const enum pl_handle_type candidate = vk_mem_handle_list[i];
+        if (!buf_external_check(vk, probe_usage, candidate, import))
+            continue;
+        supported |= candidate;
+    }
+
+    return supported;
+}
+
+// Releases the memory referenced by `slice` and resets `*slice` to all-zero.
+//
+// Slices without a slab, or backed by a dedicated slab, free the slab (and its
+// memory) right away. Slices taken from a pooled slab merely hand their page
+// back to the slab; the slab itself lives on until it is garbage collected.
+void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice)
+{
+    struct vk_ctx *vk = ma->vk;
+    struct vk_slab *slab = slice->priv;
+    if (!slab || slab->dedicated) {
+        slab_free(vk, slab); // no-op for a NULL slab
+        goto done;
+    }
+
+    pl_mutex_lock(&slab->lock);
+
+    // Mark the page containing this slice as free again
+    int page_idx = slice->offset / slab->pagesize;
+    slab->spacemap |= 0x1LLU << page_idx;
+
+    // Check *before* subtracting: testing the result against zero afterwards
+    // can never catch an underflow if the counter is unsigned
+    pl_assert(slab->used >= slice->size);
+    slab->used -= slice->size;
+    slab->age = ma->age; // restart the garbage collection countdown
+
+    pl_mutex_unlock(&slab->lock);
+
+done:
+    *slice = (struct vk_memslice) {0};
+}
+
+// Returns whether two parameter sets agree on every field that is compared
+// when matching a request against a pool: the memory requirements, the
+// required / optimal property flags, the buffer usage and the export handle
+// type.
+static inline bool pool_params_eq(const struct vk_malloc_params *a,
+                                  const struct vk_malloc_params *b)
+{
+    if (a->reqs.size != b->reqs.size)
+        return false;
+    if (a->reqs.alignment != b->reqs.alignment)
+        return false;
+    if (a->reqs.memoryTypeBits != b->reqs.memoryTypeBits)
+        return false;
+    if (a->required != b->required || a->optimal != b->optimal)
+        return false;
+    if (a->buf_usage != b->buf_usage)
+        return false;
+
+    return a->export_handle == b->export_handle;
+}
+
+// Looks up the pool serving allocations with these parameters, creating (and
+// appending) a new one if none exists yet. Size, alignment and shared memory
+// info are cleared before matching, so they play no role in the lookup.
+// Imports and dedicated image allocations are not allowed here.
+static struct vk_pool *find_pool(struct vk_malloc *ma,
+                                 const struct vk_malloc_params *params)
+{
+    pl_assert(!params->import_handle);
+    pl_assert(!params->ded_image);
+
+    struct vk_malloc_params key = *params;
+    key.shared_mem = (struct pl_shared_mem) {0};
+    key.reqs.size = 0;
+    key.reqs.alignment = 0;
+
+    for (int i = 0; i < ma->pools.num; i++) {
+        struct vk_pool *candidate = &ma->pools.elem[i];
+        if (pool_params_eq(&candidate->params, &key))
+            return candidate;
+    }
+
+    // No existing pool matches, so append a fresh one
+    PL_ARRAY_GROW(ma, ma->pools);
+    const size_t slot = ma->pools.num++;
+    struct vk_pool *created = &ma->pools.elem[slot];
+    *created = (struct vk_pool) {
+        .params = key,
+        .index = slot,
+    };
+    return created;
+}
+
+// Returns a suitable memory page from the pool. A new slab will be allocated
+// under the hood, if necessary. The byte offset of the page within the slab is
+// written to `*offset`. Returns NULL if allocating a new slab failed.
+//
+// Must be called with `ma->lock` held. The lock is dropped temporarily while a
+// new slab is being allocated, during which other threads may append to (and
+// thereby relocate) `ma->pools`. Callers must therefore not keep using `pool`
+// after this function returns.
+//
+// Note: This locks the slab it returns
+static struct vk_slab *pool_get_page(struct vk_malloc *ma, struct vk_pool *pool,
+                                     size_t size, size_t align,
+                                     VkDeviceSize *offset)
+{
+    struct vk_slab *slab = NULL;
+    int slab_pages = MINIMUM_PAGE_COUNT;
+    size = PL_ALIGN2(size, PAGE_SIZE_ALIGN);
+    const size_t pagesize = PL_ALIGN(size, align);
+
+    // First, try to find a free page in one of the existing slabs
+    for (int i = 0; i < pool->slabs.num; i++) {
+        slab = pool->slabs.elem[i];
+        if (slab->pagesize < size)
+            continue;
+        if (slab->pagesize > pagesize * MINIMUM_PAGE_COUNT) // rough heuristic
+            continue;
+        if (slab->pagesize % align)
+            continue;
+
+        pl_mutex_lock(&slab->lock);
+        // ffsll is 1-based and returns 0 when no bit (= free page) is set
+        int page_idx = __builtin_ffsll(slab->spacemap);
+        if (!page_idx--) {
+            pl_mutex_unlock(&slab->lock);
+            // Increase the number of slabs to allocate for new slabs the
+            // more existing full slabs exist for this size range
+            slab_pages = PL_MIN(slab_pages << 1, MAXIMUM_PAGE_COUNT);
+            continue;
+        }
+
+        slab->spacemap ^= 0x1LLU << page_idx;
+        *offset = page_idx * slab->pagesize;
+        return slab;
+    }
+
+    // Otherwise, allocate a new vk_slab and append it to the list.
+    VkDeviceSize slab_size = slab_pages * pagesize;
+    pl_static_assert(MINIMUM_SLAB_SIZE <= PAGE_SIZE_ALIGN * MAXIMUM_PAGE_COUNT);
+    const VkDeviceSize max_slab_size = ma->maximum_page_size * MINIMUM_PAGE_COUNT;
+    pl_assert(pagesize <= ma->maximum_page_size);
+    slab_size = PL_CLAMP(slab_size, MINIMUM_SLAB_SIZE, max_slab_size);
+    slab_pages = slab_size / pagesize;
+    slab_size = slab_pages * pagesize; // max_slab_size may be npot2, trim excess
+
+    struct vk_malloc_params params = pool->params;
+    params.reqs.size = slab_size;
+
+    // Remember where the pool lives inside `ma->pools`. Pools are only ever
+    // appended, so the index stays valid even if the array itself is
+    // reallocated while we are not holding the lock.
+    const int pool_idx = pool->index;
+
+    // Don't hold the lock while allocating the slab, because it can be a
+    // potentially very costly operation.
+    pl_mutex_unlock(&ma->lock);
+    slab = slab_alloc(ma, &params);
+    pl_mutex_lock(&ma->lock);
+    if (!slab)
+        return NULL;
+
+    // `pool` may be dangling by now if another thread grew `ma->pools` in the
+    // meantime, so look it up again before touching it
+    pool = &ma->pools.elem[pool_idx];
+    pl_mutex_lock(&slab->lock);
+
+    // One bit per page, all initially free (set). Shifting by the full width
+    // of the type would be undefined, hence the special case.
+    slab->spacemap = (slab_pages == sizeof(uint64_t) * 8) ? ~0LLU : ~(~0LLU << slab_pages);
+    slab->pagesize = pagesize;
+    PL_ARRAY_APPEND(NULL, pool->slabs, slab);
+
+    // Return the first page in this newly allocated slab
+    slab->spacemap ^= 0x1;
+    *offset = 0;
+    return slab;
+}
+
+// Imports externally provided memory (`params->shared_mem`, of handle type
+// `params->import_handle`) as a dedicated, imported slab and describes it in
+// `*out`. If `params->buf_usage` is set, a buffer spanning the memory is
+// created and bound as well. Only PL_HANDLE_DMA_BUF (with UNIX support) and
+// PL_HANDLE_HOST_PTR can be imported.
+//
+// Returns false on failure, in which case every resource acquired along the
+// way is released again and `*out` is zeroed.
+static bool vk_malloc_import(struct vk_malloc *ma, struct vk_memslice *out,
+                             const struct vk_malloc_params *params)
+{
+    struct vk_ctx *vk = ma->vk;
+    VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type;
+    vk_handle_type = vk_mem_handle_type(params->import_handle);
+
+    // Everything the error path needs to release is declared (and given a
+    // "nothing to release" value) up front, before the first `goto error`
+    struct vk_slab *slab = NULL;
+    VkDeviceMemory vkmem = VK_NULL_HANDLE;
+    const struct pl_shared_mem *shmem = &params->shared_mem;
+
+    VkMemoryDedicatedAllocateInfoKHR dinfo = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
+        .image = params->ded_image,
+    };
+
+    VkImportMemoryFdInfoKHR fdinfo = {
+        .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR,
+        .handleType = vk_handle_type,
+        .fd = -1,
+    };
+
+    VkImportMemoryHostPointerInfoEXT ptrinfo = {
+        .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT,
+        .handleType = vk_handle_type,
+    };
+
+    VkMemoryAllocateInfo ainfo = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+        .allocationSize = shmem->size,
+    };
+
+    if (params->ded_image)
+        vk_link_struct(&ainfo, &dinfo);
+
+    VkBuffer buffer = VK_NULL_HANDLE;
+    VkMemoryRequirements reqs = params->reqs;
+
+    if (params->buf_usage) {
+        uint32_t qfs[3] = {0};
+        pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
+        for (int i = 0; i < vk->pools.num; i++)
+            qfs[i] = vk->pools.elem[i]->qf;
+
+        VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
+            .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
+            .handleTypes = vk_handle_type,
+        };
+
+        VkBufferCreateInfo binfo = {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+            .pNext = &ext_buf_info,
+            .size = shmem->size,
+            .usage = params->buf_usage,
+            .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
+                                             : VK_SHARING_MODE_EXCLUSIVE,
+            .queueFamilyIndexCount = vk->pools.num,
+            .pQueueFamilyIndices = qfs,
+        };
+
+        VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &buffer));
+        PL_VK_NAME(BUFFER, buffer, "imported");
+
+        // The buffer's own requirements replace the ones from `params`
+        vk->GetBufferMemoryRequirements(vk->dev, buffer, &reqs);
+    }
+
+    if (reqs.size > shmem->size) {
+        PL_ERR(vk, "Imported object requires %zu bytes, larger than the "
+               "provided size %zu!",
+               (size_t) reqs.size, shmem->size);
+        goto error;
+    }
+
+    if (shmem->offset % reqs.alignment || shmem->offset % params->reqs.alignment) {
+        PL_ERR(vk, "Imported object offset %zu conflicts with alignment %zu!",
+               shmem->offset, pl_lcm(reqs.alignment, params->reqs.alignment));
+        goto error;
+    }
+
+    switch (params->import_handle) {
+#ifdef PL_HAVE_UNIX
+    case PL_HANDLE_DMA_BUF: {
+        if (!vk->GetMemoryFdPropertiesKHR) {
+            PL_ERR(vk, "Importing PL_HANDLE_DMA_BUF requires %s.",
+                   VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME);
+            goto error;
+        }
+
+        VkMemoryFdPropertiesKHR fdprops = {
+            .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR,
+        };
+
+        VK(vk->GetMemoryFdPropertiesKHR(vk->dev,
+                                        vk_handle_type,
+                                        shmem->handle.fd,
+                                        &fdprops));
+
+        // We dup() the fd to make it safe to import the same original fd
+        // multiple times.
+        fdinfo.fd = dup(shmem->handle.fd);
+        if (fdinfo.fd == -1) {
+            // Report the fd we tried to duplicate, not the failed result
+            PL_ERR(vk, "Failed to dup() fd (%d) when importing memory: %s",
+                   shmem->handle.fd, strerror(errno));
+            goto error;
+        }
+
+        reqs.memoryTypeBits &= fdprops.memoryTypeBits;
+        vk_link_struct(&ainfo, &fdinfo);
+        break;
+    }
+#else // !PL_HAVE_UNIX
+    case PL_HANDLE_DMA_BUF:
+        PL_ERR(vk, "PL_HANDLE_DMA_BUF requires building with UNIX support!");
+        goto error;
+#endif
+
+    case PL_HANDLE_HOST_PTR: {
+        VkMemoryHostPointerPropertiesEXT ptrprops = {
+            .sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT,
+        };
+
+        VK(vk->GetMemoryHostPointerPropertiesEXT(vk->dev, vk_handle_type,
+                                                 shmem->handle.ptr,
+                                                 &ptrprops));
+
+        ptrinfo.pHostPointer = (void *) shmem->handle.ptr;
+        reqs.memoryTypeBits &= ptrprops.memoryTypeBits;
+        vk_link_struct(&ainfo, &ptrinfo);
+        break;
+    }
+
+    case PL_HANDLE_FD:
+    case PL_HANDLE_WIN32:
+    case PL_HANDLE_WIN32_KMT:
+    case PL_HANDLE_IOSURFACE:
+    case PL_HANDLE_MTL_TEX:
+        PL_ERR(vk, "vk_malloc_import: unsupported handle type %d",
+               params->import_handle);
+        goto error;
+    }
+
+    if (!find_best_memtype(ma, reqs.memoryTypeBits, params, &ainfo.memoryTypeIndex)) {
+        PL_ERR(vk, "No compatible memory types offered for imported memory!");
+        goto error;
+    }
+
+    VK(vk->AllocateMemory(vk->dev, &ainfo, PL_VK_ALLOC, &vkmem));
+
+    slab = pl_alloc_ptr(NULL, slab);
+    *slab = (struct vk_slab) {
+        .mem = vkmem,
+        .dedicated = true,
+        .imported = true,
+        .buffer = buffer,
+        .size = shmem->size,
+        .handle_type = params->import_handle,
+    };
+    pl_mutex_init(&slab->lock);
+
+    *out = (struct vk_memslice) {
+        .vkmem = vkmem,
+        .buf = buffer,
+        .size = shmem->size - shmem->offset,
+        .offset = shmem->offset,
+        .shared_mem = *shmem,
+        .priv = slab,
+    };
+
+    switch (params->import_handle) {
+    case PL_HANDLE_DMA_BUF:
+    case PL_HANDLE_FD:
+        PL_TRACE(vk, "Imported %s bytes from fd: %d%s",
+                 PRINT_SIZE(slab->size), shmem->handle.fd,
+                 params->ded_image ? " (dedicated)" : "");
+        // fd ownership is transferred at this point.
+        slab->handle.fd = fdinfo.fd;
+        fdinfo.fd = -1;
+        break;
+    case PL_HANDLE_HOST_PTR:
+        PL_TRACE(vk, "Imported %s bytes from ptr: %p%s",
+                 PRINT_SIZE(slab->size), shmem->handle.ptr,
+                 params->ded_image ? " (dedicated)" : "");
+        slab->handle.ptr = ptrinfo.pHostPointer;
+        break;
+    case PL_HANDLE_WIN32:
+    case PL_HANDLE_WIN32_KMT:
+    case PL_HANDLE_IOSURFACE:
+    case PL_HANDLE_MTL_TEX:
+        break;
+    }
+
+    VkMemoryPropertyFlags flags = ma->props.memoryTypes[ainfo.memoryTypeIndex].propertyFlags;
+    if (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+        VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
+        slab->coherent = flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+        out->data = (uint8_t *) slab->data + out->offset;
+        out->coherent = slab->coherent;
+        if (!slab->coherent) {
+            // Use entire buffer range, since this is a dedicated memory
+            // allocation. This avoids issues with noncoherent atomicity
+            out->map_offset = 0;
+            out->map_size = VK_WHOLE_SIZE;
+
+            // Mapping does not implicitly invalidate mapped memory
+            VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
+                .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+                .memory = slab->mem,
+                .offset = out->map_offset,
+                .size = out->map_size,
+            }));
+        }
+    }
+
+    if (buffer)
+        VK(vk->BindBufferMemory(vk->dev, buffer, vkmem, 0));
+
+    return true;
+
+error:
+    if (params->debug_tag)
+        PL_ERR(vk, "  for malloc: %s", params->debug_tag);
+    vk->DestroyBuffer(vk->dev, buffer, PL_VK_ALLOC);
+    // Still VK_NULL_HANDLE (a valid no-op) unless we failed after the memory
+    // had already been allocated, e.g. while mapping or binding it
+    vk->FreeMemory(vk->dev, vkmem, PL_VK_ALLOC);
+#ifdef PL_HAVE_UNIX
+    // Only still set if the duplicated fd was never handed over
+    if (fdinfo.fd > -1)
+        close(fdinfo.fd);
+#endif
+    if (slab)
+        pl_mutex_destroy(&slab->lock);
+    pl_free(slab);
+    *out = (struct vk_memslice) {0};
+    return false;
+}
+
+// Returns the size of the largest memory heap that backs a memory type having
+// all of the given property `flags`, or 0 if no memory type qualifies.
+size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags)
+{
+    size_t largest = 0;
+
+    for (int t = 0; t < ma->props.memoryTypeCount; t++) {
+        const VkMemoryType *mtype = &ma->props.memoryTypes[t];
+        const bool matches = (mtype->propertyFlags & flags) == flags;
+        if (matches)
+            largest = PL_MAX(largest, ma->props.memoryHeaps[mtype->heapIndex].size);
+    }
+
+    return largest;
+}
+
+// Allocates a slice of memory matching `params` and describes it in `*out`.
+// Returns false on failure.
+//
+// Requests with an `import_handle` are forwarded to vk_malloc_import. Requests
+// for a dedicated image, or larger than the maximum page size, get a dedicated
+// slab of their own. Everything else is served as a page from a pooled slab.
+bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out,
+                     const struct vk_malloc_params *params)
+{
+    struct vk_ctx *vk = ma->vk;
+    pl_assert(!params->import_handle || !params->export_handle);
+    if (params->import_handle)
+        return vk_malloc_import(ma, out, params);
+
+    pl_assert(params->reqs.size);
+    size_t size = params->reqs.size;
+    size_t align = params->reqs.alignment;
+    align = pl_lcm(align, vk->props.limits.bufferImageGranularity);
+    align = pl_lcm(align, vk->props.limits.nonCoherentAtomSize);
+
+    struct vk_slab *slab;
+    VkDeviceSize offset;
+
+    if (params->ded_image || size > ma->maximum_page_size) {
+        slab = slab_alloc(ma, params);
+        if (!slab)
+            return false;
+        slab->dedicated = true;
+        offset = 0;
+    } else {
+        pl_mutex_lock(&ma->lock);
+        struct vk_pool *pool = find_pool(ma, params);
+        // Grab the index while `pool` is known to be valid: pool_get_page may
+        // temporarily drop `ma->lock`, and the pool array can be reallocated
+        // by other threads whenever the lock is not held
+        const int pool_index = pool->index;
+        slab = pool_get_page(ma, pool, size, align, &offset);
+        pl_mutex_unlock(&ma->lock);
+        if (!slab) {
+            PL_ERR(ma->vk, "No slab to serve request for %s bytes (with "
+                   "alignment 0x%zx) in pool %d!",
+                   PRINT_SIZE(size), align, pool_index);
+            return false;
+        }
+
+        // For accounting, just treat the alignment as part of the used size.
+        // Doing it this way makes sure that the sizes reported to vk_memslice
+        // consumers are always aligned properly.
+        size = PL_ALIGN(size, align);
+        slab->used += size;
+        slab->age = ma->age;
+        if (params->debug_tag)
+            slab->debug_tag = params->debug_tag;
+        // pool_get_page returned the slab locked
+        pl_mutex_unlock(&slab->lock);
+    }
+
+    pl_assert(offset % align == 0);
+    *out = (struct vk_memslice) {
+        .vkmem = slab->mem,
+        .offset = offset,
+        .size = size,
+        .buf = slab->buffer,
+        .data = slab->data ? (uint8_t *) slab->data + offset : NULL,
+        .coherent = slab->coherent,
+        .map_offset = slab->data ? offset : 0,
+        .map_size = slab->data ? size : 0,
+        .priv = slab,
+        .shared_mem = {
+            .handle = slab->handle,
+            .offset = offset,
+            .size = slab->size,
+        },
+    };
+    return true;
+}
diff --git a/src/vulkan/malloc.h b/src/vulkan/malloc.h
new file mode 100644
index 0000000..115352e
--- /dev/null
+++ b/src/vulkan/malloc.h
@@ -0,0 +1,72 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// All memory allocated from a vk_malloc MUST be explicitly released by
+// the caller before vk_malloc_destroy is called.
+struct vk_malloc *vk_malloc_create(struct vk_ctx *vk);
+void vk_malloc_destroy(struct vk_malloc **ma);
+
+// Get the supported handle types for this malloc instance
+pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import);
+
+// Represents a single "slice" of generic (non-buffer) memory, plus some
+// metadata for accounting. This struct is essentially read-only.
+struct vk_memslice {
+    VkDeviceMemory vkmem;   // device memory object containing this slice
+    VkDeviceSize offset;    // byte offset of the slice within `vkmem`
+    VkDeviceSize size;      // size of the slice, in bytes
+    void *priv;             // allocator-internal, consumed by vk_malloc_free
+    // depending on the type/flags:
+    struct pl_shared_mem shared_mem; // handle, offset and size for sharing
+    VkBuffer buf;   // associated buffer (when `buf_usage` is nonzero)
+    void *data;     // pointer to slice (for persistently mapped slices)
+    bool coherent;  // whether `data` is coherent
+    VkDeviceSize map_offset; // can be larger than offset/size
+    VkDeviceSize map_size;   // size of the range starting at `map_offset`
+};
+
+struct vk_malloc_params {
+    VkMemoryRequirements reqs;       // size, alignment and allowed memory types
+    VkMemoryPropertyFlags required;  // flags the chosen memory type must have
+    VkMemoryPropertyFlags optimal;   // flags that are preferred, but optional
+    VkBufferUsageFlags buf_usage;    // if nonzero, also create a VkBuffer
+    VkImage ded_image; // for dedicated image allocations
+    enum pl_handle_type export_handle; // excludes `import_handle`
+    enum pl_handle_type import_handle; // import `shared_mem`, don't allocate
+    struct pl_shared_mem shared_mem; // for `import_handle`
+    pl_debug_tag debug_tag;          // optional, shows up in log messages
+};
+
+// Returns the size of the largest heap offering a memory type with all of the
+// given property flags (total heap size, not free space or a combined total).
+size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags);
+
+bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out,
+                     const struct vk_malloc_params *params);
+
+void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice);
+
+// Clean up unused slabs. Call this roughly once per frame to reduce
+// memory pressure / memory leaks.
+void vk_malloc_garbage_collect(struct vk_malloc *ma);
+
+// For debugging purposes. Doesn't include dedicated slab allocations!
+void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level);
diff --git a/src/vulkan/meson.build b/src/vulkan/meson.build
new file mode 100644
index 0000000..64c5572
--- /dev/null
+++ b/src/vulkan/meson.build
@@ -0,0 +1,59 @@
+vulkan_build = get_option('vulkan')  # feature: build the Vulkan backend
+vulkan_link = get_option('vk-proc-addr')  # feature: link against the Vulkan loader
+vulkan_loader = dependency('vulkan', required: false)  # optional system loader
+vulkan_headers = vulkan_loader.partial_dependency(includes: true, compile_args: true)
+registry_xml = get_option('vulkan-registry')  # registry path handed to utils_gen.py
+
+# Prefer our bundled Vulkan headers (when the directory exists) for portability
+vulkan_headers_dir = thirdparty/'Vulkan-Headers'
+vulkan_headers_inc = include_directories()  # stays empty without bundled headers
+if fs.is_dir(vulkan_headers_dir/'include')
+  vulkan_headers = declare_dependency()  # replaces the system header flags
+  vulkan_headers_inc = include_directories('../../3rdparty/Vulkan-Headers/include')
+  # Force the use of this vk.xml because it has to be in sync with the headers
+  registry_xml = vulkan_headers_dir/'registry/vk.xml'
+endif
+
+vulkan_build = vulkan_build.require(  # headers must define VK_VERSION_1_3
+  cc.has_header_symbol('vulkan/vulkan_core.h', 'VK_VERSION_1_3',
+                       include_directories: vulkan_headers_inc,
+                       dependencies: vulkan_headers),
+  error_message: 'vulkan.h was not found on the system, nor inside ' +
+                 '`3rdparty/Vulkan-Headers`. Please run `git submodule update --init` ' +
+                 'followed by `meson --wipe`.')
+components.set('vulkan', vulkan_build.allowed())
+
+vulkan_link = vulkan_link.require(vulkan_loader.found() and vulkan_build.allowed())
+components.set('vk-proc-addr', vulkan_link.allowed())
+
+build_deps += vulkan_headers  # added unconditionally, i.e. also for the stubs build
+
+if vulkan_build.allowed()
+  sources += [
+    'vulkan/command.c',
+    'vulkan/context.c',
+    'vulkan/formats.c',
+    'vulkan/gpu.c',
+    'vulkan/gpu_buf.c',
+    'vulkan/gpu_tex.c',
+    'vulkan/gpu_pass.c',
+    'vulkan/malloc.c',
+    'vulkan/swapchain.c',
+    'vulkan/utils.c',
+  ]
+
+  datadir = get_option('prefix') / get_option('datadir')
+  sources += custom_target('utils_gen.c',  # utils_gen.c is generated by utils_gen.py
+    input: 'utils_gen.py',
+    output: 'utils_gen.c',
+    command: [python, '@INPUT@', datadir, registry_xml, '@OUTPUT@'],
+    env: python_env,
+  )
+
+  if vulkan_link.allowed()
+    build_deps += vulkan_loader
+    tests += 'vulkan.c'  # the test is only added when the loader gets linked
+  endif
+else
+  sources += 'vulkan/stubs.c'  # Vulkan disabled: build the API stubs instead
+endif
diff --git a/src/vulkan/stubs.c b/src/vulkan/stubs.c
new file mode 100644
index 0000000..0c0738e
--- /dev/null
+++ b/src/vulkan/stubs.c
@@ -0,0 +1,108 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../common.h"
+#include "log.h"
+
+#include <libplacebo/vulkan.h>
+
+const struct pl_vk_inst_params pl_vk_inst_default_params = {0}; // all fields zero
+const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS }; // macro defined elsewhere
+
+pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params)
+{
+    pl_fatal(log, "libplacebo compiled without Vulkan support!");
+    return NULL; // stub: instance creation always fails, `params` is ignored
+}
+
+void pl_vk_inst_destroy(pl_vk_inst *pinst)
+{
+    pl_vk_inst inst = *pinst; // note: `pinst` itself is dereferenced unchecked
+    pl_assert(!inst); // the creation stub in this file only ever returns NULL
+}
+
+pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params)
+{
+    pl_fatal(log, "libplacebo compiled without Vulkan support!");
+    return NULL; // stub: device creation always fails, `params` is ignored
+}
+
+void pl_vulkan_destroy(pl_vulkan *pvk)
+{
+    pl_vulkan vk = *pvk; // note: `pvk` itself is dereferenced unchecked
+    pl_assert(!vk); // the create/import stubs in this file only ever return NULL
+}
+
+pl_vulkan pl_vulkan_get(pl_gpu gpu)
+{
+    return NULL; // stub: `gpu` is ignored, there is never a pl_vulkan to return
+}
+
+VkPhysicalDevice pl_vulkan_choose_device(pl_log log,
+                              const struct pl_vulkan_device_params *params)
+{
+    pl_err(log, "libplacebo compiled without Vulkan support!"); // error, not fatal
+    return NULL; // stub: no device is ever chosen, `params` is ignored
+}
+
+pl_swapchain pl_vulkan_create_swapchain(pl_vulkan vk,
+                              const struct pl_vulkan_swapchain_params *params)
+{
+    pl_unreachable(); // stub: the create/import stubs never yield a pl_vulkan
+}
+
+bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw)
+{
+    pl_unreachable(); // stub: the swapchain creation stub never returns
+}
+
+pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params)
+{
+    pl_fatal(log, "libplacebo compiled without Vulkan support!");
+    return NULL; // stub: importing always fails, `params` is ignored
+}
+
+pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params)
+{
+    pl_unreachable(); // NOTE(review): presumably as no Vulkan pl_gpu can exist here
+}
+
+VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex,
+                         VkFormat *out_format, VkImageUsageFlags *out_flags)
+{
+    pl_unreachable(); // NOTE(review): presumably as no Vulkan pl_gpu can exist here
+}
+
+bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params)
+{
+    pl_unreachable(); // NOTE(review): presumably as no Vulkan pl_gpu can exist here
+}
+
+void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params)
+{
+    pl_unreachable(); // NOTE(review): presumably as no Vulkan pl_gpu can exist here
+}
+
+VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params)
+{
+    pl_unreachable(); // NOTE(review): presumably as no Vulkan pl_gpu can exist here
+}
+
+void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore)
+{
+    pl_unreachable(); // NOTE(review): presumably as no Vulkan pl_gpu can exist here
+}
diff --git a/src/vulkan/swapchain.c b/src/vulkan/swapchain.c
new file mode 100644
index 0000000..0741fbf
--- /dev/null
+++ b/src/vulkan/swapchain.c
@@ -0,0 +1,911 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "command.h"
+#include "formats.h"
+#include "utils.h"
+#include "gpu.h"
+#include "swapchain.h"
+#include "pl_thread.h"
+
+struct sem_pair { // one pool entry of `priv.sems`
+    VkSemaphore in;  // handed to vkAcquireNextImageKHR, which signals it
+    VkSemaphore out; // fetched by vk_sw_submit_frame; NOTE(review): presumably for present
+};
+
+struct priv { // per-swapchain state, reached through PL_PRIV(sw)
+    struct pl_sw_fns impl;          // set to `vulkan_swapchain` on creation
+
+    pl_mutex lock;                  // taken by vk_sw_start_frame, kept on success
+    struct vk_ctx *vk;
+    VkSurfaceKHR surf;              // the surface given in the creation params
+    PL_ARRAY(VkSurfaceFormatKHR) formats; // format/color space pairs of `surf`
+
+    // current swapchain and metadata:
+    struct pl_vulkan_swapchain_params params; // copy of the creation params
+    VkSwapchainCreateInfoKHR protoInfo; // partially filled-in prototype
+    VkSwapchainKHR swapchain;       // VK_NULL_HANDLE while none exists
+    int cur_width, cur_height;      // extent used for `swapchain`
+    int swapchain_depth;            // from the params, defaulting to 3
+    pl_rc_t frames_in_flight;       // number of frames currently queued
+    bool suboptimal;                // true once VK_SUBOPTIMAL_KHR is returned
+    bool needs_recreate;            // swapchain needs to be recreated
+    struct pl_color_repr color_repr;
+    struct pl_color_space color_space;
+    struct pl_hdr_metadata hdr_metadata; // filtered copy kept by set_hdr_metadata
+
+    // state of the images:
+    PL_ARRAY(pl_tex) images;        // pl_tex wrappers for the VkImages
+    PL_ARRAY(struct sem_pair) sems; // pool of semaphores used to synchronize images
+    int idx_sems;                   // index of next free semaphore pair
+    int last_imgidx;                // the image index last acquired (for submit)
+};
+
+static const struct pl_sw_fns vulkan_swapchain; // forward declaration, copied into `priv.impl`
+
+// Maps a Vulkan color space to a `pl_color_space` in `*out`; false if unsupported
+static bool map_color_space(VkColorSpaceKHR space, struct pl_color_space *out)
+{
+    switch (space) {
+    // Note: Technically against the spec, but usually the correct result:
+    // `SRGB_NONLINEAR` is effectively a catch-all for any typical SDR curve,
+    // which `pl_color_space_monitor` approximates better.
+    case VK_COLOR_SPACE_SRGB_NONLINEAR_KHR:
+        *out = pl_color_space_monitor;
+        return true;
+
+    case VK_COLOR_SPACE_BT709_NONLINEAR_EXT:
+        *out = pl_color_space_monitor;
+        return true;
+    case VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT:
+        *out = (struct pl_color_space) {
+            .primaries = PL_COLOR_PRIM_DISPLAY_P3,
+            .transfer  = PL_COLOR_TRC_BT_1886,
+        };
+        return true;
+    case VK_COLOR_SPACE_DCI_P3_LINEAR_EXT:
+        *out = (struct pl_color_space) {
+            .primaries = PL_COLOR_PRIM_DCI_P3,
+            .transfer  = PL_COLOR_TRC_LINEAR,
+        };
+        return true;
+    case VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT:
+        *out = (struct pl_color_space) {
+            .primaries = PL_COLOR_PRIM_DCI_P3,
+            .transfer  = PL_COLOR_TRC_BT_1886,
+        };
+        return true;
+    case VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT:
+    case VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT:
+        // TODO
+        return false;
+    case VK_COLOR_SPACE_BT709_LINEAR_EXT:
+        *out = (struct pl_color_space) {
+            .primaries = PL_COLOR_PRIM_BT_709, // BT.709 gamut, linear light
+            .transfer  = PL_COLOR_TRC_LINEAR,
+        };
+        return true;
+    case VK_COLOR_SPACE_BT2020_LINEAR_EXT:
+        *out = (struct pl_color_space) {
+            .primaries = PL_COLOR_PRIM_BT_2020,
+            .transfer  = PL_COLOR_TRC_LINEAR,
+        };
+        return true;
+    case VK_COLOR_SPACE_HDR10_ST2084_EXT:
+        *out = (struct pl_color_space) {
+            .primaries = PL_COLOR_PRIM_BT_2020,
+            .transfer  = PL_COLOR_TRC_PQ,
+        };
+        return true;
+    case VK_COLOR_SPACE_DOLBYVISION_EXT:
+        // Unlikely to ever be implemented
+        return false;
+    case VK_COLOR_SPACE_HDR10_HLG_EXT:
+        *out = (struct pl_color_space) {
+            .primaries = PL_COLOR_PRIM_BT_2020,
+            .transfer  = PL_COLOR_TRC_HLG,
+        };
+        return true;
+    case VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT:
+        *out = (struct pl_color_space) {
+            .primaries = PL_COLOR_PRIM_ADOBE,
+            .transfer  = PL_COLOR_TRC_LINEAR,
+        };
+        return true;
+    case VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT:
+        *out = (struct pl_color_space) {
+            .primaries = PL_COLOR_PRIM_ADOBE,
+            .transfer  = PL_COLOR_TRC_GAMMA22,
+        };
+        return true;
+    case VK_COLOR_SPACE_PASS_THROUGH_EXT:
+        *out = pl_color_space_unknown;
+        return true;
+
+#ifdef VK_AMD_display_native_hdr
+    case VK_COLOR_SPACE_DISPLAY_NATIVE_AMD:
+        // TODO
+        return false;
+#endif
+
+    default: return false; // `*out` is left untouched for unsupported spaces
+    }
+}
+
+static bool pick_surf_format(pl_swapchain sw, const struct pl_color_space *hint)
+{   // Selects the best-scoring surface format for `hint`; false if none is usable
+    struct priv *p = PL_PRIV(sw);
+    struct vk_ctx *vk = p->vk;
+    pl_gpu gpu = sw->gpu;
+
+    int best_score = 0, best_id; // best_id is only meaningful once best_score > 0
+    bool wide_gamut = pl_color_primaries_is_wide_gamut(hint->primaries);
+    bool prefer_hdr = pl_color_transfer_is_hdr(hint->transfer);
+
+    for (int i = 0; i < p->formats.num; i++) {
+        // Color space / format whitelist
+        struct pl_color_space space;
+        if (!map_color_space(p->formats.elem[i].colorSpace, &space))
+            continue;
+
+        bool disable10 = !pl_color_transfer_is_hdr(space.transfer) &&
+                         p->params.disable_10bit_sdr;
+
+        switch (p->formats.elem[i].format) {
+        // Only accept floating point formats for linear curves
+        case VK_FORMAT_R16G16B16_SFLOAT:
+        case VK_FORMAT_R16G16B16A16_SFLOAT:
+        case VK_FORMAT_R32G32B32_SFLOAT:
+        case VK_FORMAT_R32G32B32A32_SFLOAT:
+        case VK_FORMAT_R64G64B64_SFLOAT:
+        case VK_FORMAT_R64G64B64A64_SFLOAT:
+            if (space.transfer == PL_COLOR_TRC_LINEAR)
+                break; // accept
+            continue;
+
+        // Only accept 8 bit for non-HDR curves
+        case VK_FORMAT_R8G8B8_UNORM:
+        case VK_FORMAT_B8G8R8_UNORM:
+        case VK_FORMAT_R8G8B8A8_UNORM:
+        case VK_FORMAT_B8G8R8A8_UNORM:
+        case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+            if (!pl_color_transfer_is_hdr(space.transfer))
+                break; // accept
+            continue;
+
+        // Only accept 10 bit formats for non-linear curves (SDR: unless disabled)
+        case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+        case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+            if (space.transfer != PL_COLOR_TRC_LINEAR && !disable10)
+                break; // accept
+            continue;
+
+        // Accept 16-bit formats for everything (SDR: unless disabled)
+        case VK_FORMAT_R16G16B16_UNORM:
+        case VK_FORMAT_R16G16B16A16_UNORM:
+            if (!disable10)
+                break; // accept
+            continue;
+
+        default: continue;
+        }
+
+        // Make sure we can wrap this format to a meaningful, valid pl_fmt
+        for (int n = 0; n < gpu->num_formats; n++) {
+            pl_fmt plfmt = gpu->formats[n];
+            const struct vk_format **pvkfmt = PL_PRIV(plfmt);
+            if ((*pvkfmt)->tfmt != p->formats.elem[i].format)
+                continue;
+
+            enum pl_fmt_caps render_caps = 0;
+            render_caps |= PL_FMT_CAP_RENDERABLE;
+            render_caps |= PL_FMT_CAP_BLITTABLE;
+            if ((plfmt->caps & render_caps) != render_caps)
+                continue; // needs to be both renderable and blittable
+
+            // format valid, use it if it has a higher score
+            int score = 0;
+            for (int c = 0; c < 3; c++) // summed depth of the first 3 components
+                score += plfmt->component_depth[c];
+            if (pl_color_primaries_is_wide_gamut(space.primaries) == wide_gamut)
+                score += 1000; // same gamut class (wide or not) as the hint
+            if (space.primaries == hint->primaries)
+                score += 2000; // exact primaries match
+            if (pl_color_transfer_is_hdr(space.transfer) == prefer_hdr)
+                score += 10000; // same SDR/HDR class as the hint
+            if (space.transfer == hint->transfer)
+                score += 20000; // exact transfer match
+
+            switch (plfmt->type) {
+            case PL_FMT_UNKNOWN: break;
+            case PL_FMT_UINT: break;
+            case PL_FMT_SINT: break;
+            case PL_FMT_UNORM: score += 500; break;
+            case PL_FMT_SNORM: score += 400; break;
+            case PL_FMT_FLOAT: score += 300; break;
+            case PL_FMT_TYPE_COUNT: pl_unreachable();
+            };
+
+            if (score > best_score) {
+                best_score = score;
+                best_id = i;
+                break; // stop scanning pl_fmts for this surface format
+            }
+        }
+    }
+
+    if (!best_score) {
+        PL_ERR(vk, "Failed picking any valid, renderable surface format!");
+        return false;
+    }
+
+    VkSurfaceFormatKHR new_sfmt = p->formats.elem[best_id];
+    if (p->protoInfo.imageFormat != new_sfmt.format ||
+        p->protoInfo.imageColorSpace != new_sfmt.colorSpace)
+    {
+        PL_INFO(vk, "Picked surface configuration %d: %s + %s", best_id,
+                vk_fmt_name(new_sfmt.format),
+                vk_csp_name(new_sfmt.colorSpace));
+
+        p->protoInfo.imageFormat = new_sfmt.format;
+        p->protoInfo.imageColorSpace = new_sfmt.colorSpace;
+        p->needs_recreate = true; // checked by vk_sw_start_frame
+    }
+
+    return true;
+}
+
+static void set_hdr_metadata(struct priv *p, const struct pl_hdr_metadata *metadata)
+{
+    // No-op when the device lacks the entry point for signalling HDR metadata
+    struct vk_ctx *vk = p->vk;
+    if (!vk->SetHdrMetadataEXT)
+        return;
+
+    // Copy over just the subset of fields that can actually be signalled
+    const struct pl_hdr_metadata filtered = {
+        .prim     = metadata->prim,
+        .min_luma = metadata->min_luma,
+        .max_luma = metadata->max_luma,
+        .max_cll  = metadata->max_cll,
+        .max_fall = metadata->max_fall,
+    };
+
+    // Nothing to do if this matches what we already have on record
+    if (pl_hdr_metadata_equal(&filtered, &p->hdr_metadata))
+        return;
+
+    // Store it first, so that it gets re-applied after swapchain recreation
+    p->hdr_metadata = filtered;
+
+    // SDR swapchains (and not yet created ones) get no metadata signalled
+    if (!pl_color_transfer_is_hdr(p->color_space.transfer) || !p->swapchain)
+        return;
+
+    // Hand the filtered values over to the driver
+    const VkHdrMetadataEXT vk_meta = {
+        .sType = VK_STRUCTURE_TYPE_HDR_METADATA_EXT,
+        .displayPrimaryRed   = { filtered.prim.red.x,   filtered.prim.red.y },
+        .displayPrimaryGreen = { filtered.prim.green.x, filtered.prim.green.y },
+        .displayPrimaryBlue  = { filtered.prim.blue.x,  filtered.prim.blue.y },
+        .whitePoint = { filtered.prim.white.x, filtered.prim.white.y },
+        .maxLuminance = filtered.max_luma,
+        .minLuminance = filtered.min_luma,
+        .maxContentLightLevel = filtered.max_cll,
+        .maxFrameAverageLightLevel = filtered.max_fall,
+    };
+    vk->SetHdrMetadataEXT(vk->dev, 1, &p->swapchain, &vk_meta);
+
+    // Record the HDR colorimetry metadata that is now in effect
+    p->color_space.hdr = p->hdr_metadata;
+}
+
+// Creates the swapchain object for `params->surface`. The VkSwapchainKHR itself
+// is only created later, by `vk_sw_recreate`. Returns NULL on failure.
+pl_swapchain pl_vulkan_create_swapchain(pl_vulkan plvk,
+                              const struct pl_vulkan_swapchain_params *params)
+{
+    struct vk_ctx *vk = PL_PRIV(plvk);
+    pl_gpu gpu = plvk->gpu;
+
+    if (!vk->CreateSwapchainKHR) {
+        PL_ERR(gpu, VK_KHR_SWAPCHAIN_EXTENSION_NAME " not enabled!");
+        return NULL;
+    }
+
+    struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv);
+    sw->log = vk->log;
+    sw->gpu = gpu;
+
+    struct priv *p = PL_PRIV(sw);
+    pl_mutex_init(&p->lock);
+    p->impl = vulkan_swapchain;
+    p->params = *params;
+    p->vk = vk;
+    p->surf = params->surface;
+    p->swapchain_depth = PL_DEF(params->swapchain_depth, 3);
+    pl_assert(p->swapchain_depth > 0);
+    atomic_init(&p->frames_in_flight, 0);
+    p->last_imgidx = -1; // no image acquired yet
+    p->protoInfo = (VkSwapchainCreateInfoKHR) {
+        .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR,
+        .surface = p->surf,
+        .imageArrayLayers = 1, // non-stereoscopic
+        .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE,
+        .minImageCount = p->swapchain_depth + 1, // +1 for the FB
+        .presentMode = params->present_mode,
+        .clipped = true,
+    };
+
+    // These fields will be updated by `vk_sw_recreate`
+    p->color_space = pl_color_space_unknown;
+    p->color_repr = (struct pl_color_repr) {
+        .sys    = PL_COLOR_SYSTEM_RGB,
+        .levels = PL_COLOR_LEVELS_FULL,
+        .alpha  = PL_ALPHA_UNKNOWN,
+    };
+
+    // Make sure the swapchain present mode is supported
+    VkPresentModeKHR *modes = NULL;
+    uint32_t num_modes = 0, num_formats = 0; // counts for the two queries below
+    VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, NULL));
+    modes = pl_calloc_ptr(NULL, num_modes, modes);
+    VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, modes));
+
+    bool supported = false;
+    for (uint32_t i = 0; i < num_modes; i++)
+        supported |= (modes[i] == p->protoInfo.presentMode);
+    pl_free_ptr(&modes); // leaves `modes` NULL, so the error path can't double free
+    if (!supported) {
+        PL_WARN(vk, "Requested swap mode unsupported by this device, falling "
+                "back to VK_PRESENT_MODE_FIFO_KHR");
+        p->protoInfo.presentMode = VK_PRESENT_MODE_FIFO_KHR;
+    }
+
+    // Enumerate the supported surface color spaces
+    VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, NULL));
+    PL_ARRAY_RESIZE(sw, p->formats, num_formats);
+    VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, p->formats.elem));
+    p->formats.num = num_formats;
+
+    PL_INFO(gpu, "Available surface configurations:");
+    for (int i = 0; i < p->formats.num; i++) {
+        PL_INFO(gpu, "    %d: %-40s %s", i,
+                vk_fmt_name(p->formats.elem[i].format),
+                vk_csp_name(p->formats.elem[i].colorSpace));
+    }
+
+    // Ensure there exists at least some valid renderable surface format
+    if (!pick_surf_format(sw, &(struct pl_color_space) {0}))
+        goto error;
+
+    return sw;
+
+error:
+    pl_free(modes);
+    pl_mutex_destroy(&p->lock); // pairs with the pl_mutex_init above
+    pl_free(sw);
+    return NULL;
+}
+
+static void vk_sw_destroy(pl_swapchain sw)
+{
+    struct priv *priv = PL_PRIV(sw);
+    struct vk_ctx *vk = priv->vk;
+
+    // Drain all pending work before touching any swapchain resources
+    pl_gpu_flush(sw->gpu);
+    vk_wait_idle(vk);
+
+    // Vulkan provides no means of telling when a queue presentation command
+    // has completed, so destroying swapchain-bound resources is undefined
+    // behavior per the spec. As a mitigation, `vkQueueWaitIdle` every queue
+    // that might still have outstanding presentation calls, hoping that the
+    // driver understands we want to wait for the device to be truly idle.
+    for (int q = 0; q < vk->pool_graphics->num_queues; q++)
+        vk->QueueWaitIdle(vk->pool_graphics->queues[q]);
+
+    for (int n = 0; n < priv->images.num; n++)
+        pl_tex_destroy(sw->gpu, &priv->images.elem[n]);
+    for (int n = 0; n < priv->sems.num; n++) {
+        vk->DestroySemaphore(vk->dev, priv->sems.elem[n].in, PL_VK_ALLOC);
+        vk->DestroySemaphore(vk->dev, priv->sems.elem[n].out, PL_VK_ALLOC);
+    }
+
+    vk->DestroySwapchainKHR(vk->dev, priv->swapchain, PL_VK_ALLOC);
+    pl_mutex_destroy(&priv->lock);
+    pl_free((void *) sw);
+}
+
+static int vk_sw_latency(pl_swapchain sw)
+{
+    // The latency we report is simply the configured swapchain depth
+    return ((const struct priv *) PL_PRIV(sw))->swapchain_depth;
+}
+
+static bool update_swapchain_info(struct priv *p, VkSwapchainCreateInfoKHR *info,
+                                  int w, int h)
+{   // Refreshes `info` from the current surface capabilities; false on failure
+    struct vk_ctx *vk = p->vk;
+
+    // Query the supported capabilities and update this struct as needed
+    VkSurfaceCapabilitiesKHR caps = {0};
+    VK(vk->GetPhysicalDeviceSurfaceCapabilitiesKHR(vk->physd, p->surf, &caps));
+
+    // Check for hidden/invisible window (note: this tests `currentExtent`)
+    if (!caps.currentExtent.width || !caps.currentExtent.height) {
+        PL_DEBUG(vk, "maxImageExtent reported as 0x0, hidden window? skipping");
+        return false; // only logged at debug level, unlike the `error` paths
+    }
+
+    // Sorted by preference
+    static const struct { VkCompositeAlphaFlagsKHR vk_mode;
+                          enum pl_alpha_mode pl_mode;
+                        } alphaModes[] = {
+        {VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR, PL_ALPHA_INDEPENDENT},
+        {VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR,  PL_ALPHA_PREMULTIPLIED},
+        {VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR,          PL_ALPHA_UNKNOWN},
+        {VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR,         PL_ALPHA_UNKNOWN},
+    };
+
+    for (int i = 0; i < PL_ARRAY_SIZE(alphaModes); i++) {
+        if (caps.supportedCompositeAlpha & alphaModes[i].vk_mode) {
+            info->compositeAlpha = alphaModes[i].vk_mode;
+            p->color_repr.alpha = alphaModes[i].pl_mode; // reported to users per frame
+            PL_DEBUG(vk, "Requested alpha compositing mode: %s",
+                     vk_alpha_mode(info->compositeAlpha));
+            break; // first (= most preferred) supported mode wins
+        }
+    }
+
+    if (!info->compositeAlpha) {
+        PL_ERR(vk, "Failed picking alpha compositing mode (caps: 0x%x)",
+               caps.supportedCompositeAlpha);
+        goto error;
+    }
+
+    // Note: We could probably also allow picking a surface transform that
+    // flips the framebuffer and set `pl_swapchain_frame.flipped`, but this
+    // doesn't appear to be necessary for any vulkan implementations.
+    static const VkSurfaceTransformFlagsKHR rotModes[] = {
+        VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR,
+        VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR,
+    };
+
+    for (int i = 0; i < PL_ARRAY_SIZE(rotModes); i++) {
+        if (caps.supportedTransforms & rotModes[i]) {
+            info->preTransform = rotModes[i];
+            PL_DEBUG(vk, "Requested surface transform: %s",
+                     vk_surface_transform(info->preTransform));
+            break; // first supported entry wins
+        }
+    }
+
+    if (!info->preTransform) {
+        PL_ERR(vk, "Failed picking surface transform mode (caps: 0x%x)",
+               caps.supportedTransforms);
+        goto error;
+    }
+
+    // Image count as required
+    PL_DEBUG(vk, "Requested image count: %d (min %d max %d)",
+             (int) info->minImageCount, (int) caps.minImageCount,
+             (int) caps.maxImageCount);
+
+    info->minImageCount = PL_MAX(info->minImageCount, caps.minImageCount);
+    if (caps.maxImageCount) // a maximum of 0 is not used for clamping
+        info->minImageCount = PL_MIN(info->minImageCount, caps.maxImageCount);
+
+    PL_DEBUG(vk, "Requested image size: %dx%d (min %dx%d < cur %dx%d < max %dx%d)",
+             w, h, caps.minImageExtent.width, caps.minImageExtent.height,
+             caps.currentExtent.width, caps.currentExtent.height,
+             caps.maxImageExtent.width, caps.maxImageExtent.height);
+
+    // Default a zero w/h to the reported extent (unless that is 0xFFFFFFFF)
+    if (caps.currentExtent.width != 0xFFFFFFFF)
+        w = PL_DEF(w, caps.currentExtent.width);
+    if (caps.currentExtent.height != 0xFFFFFFFF)
+        h = PL_DEF(h, caps.currentExtent.height);
+
+    // Otherwise, re-use the existing size if available
+    w = PL_DEF(w, info->imageExtent.width);
+    h = PL_DEF(h, info->imageExtent.height);
+
+    if (!w || !h) {
+        PL_ERR(vk, "Failed resizing swapchain: unknown size?");
+        goto error;
+    }
+
+    // Clamp the extent based on the supported limits
+    w = PL_CLAMP(w, caps.minImageExtent.width,  caps.maxImageExtent.width);
+    h = PL_CLAMP(h, caps.minImageExtent.height, caps.maxImageExtent.height);
+    info->imageExtent = (VkExtent2D) { w, h };
+
+    // We just request whatever makes sense, and let the pl_vk decide what
+    // pl_tex_params that translates to. That said, we still need to intersect
+    // the swapchain usage flags with the format usage flags
+    VkImageUsageFlags req_flags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
+                                  VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+    VkImageUsageFlags opt_flags = VK_IMAGE_USAGE_STORAGE_BIT;
+
+    info->imageUsage = caps.supportedUsageFlags & (req_flags | opt_flags);
+    VkFormatProperties fmtprop = {0};
+    vk->GetPhysicalDeviceFormatProperties(vk->physd, info->imageFormat, &fmtprop);
+
+#define CHECK(usage, feature) \
+    if (!((fmtprop.optimalTilingFeatures & VK_FORMAT_FEATURE_##feature##_BIT))) \
+        info->imageUsage &= ~VK_IMAGE_USAGE_##usage##_BIT
+
+    CHECK(COLOR_ATTACHMENT, COLOR_ATTACHMENT); // drop usages the format lacks
+    CHECK(TRANSFER_DST, TRANSFER_DST);
+    CHECK(STORAGE, STORAGE_IMAGE);
+
+    if ((info->imageUsage & req_flags) != req_flags) {
+        PL_ERR(vk, "The swapchain doesn't support rendering and blitting!");
+        goto error;
+    }
+
+    return true;
+
+error:
+    return false;
+}
+
+static void destroy_swapchain(struct vk_ctx *vk, void *swapchain)
+{   // vk_dev_callback target: `swapchain` is a handle packed by vk_wrap_handle
+    vk->DestroySwapchainKHR(vk->dev, vk_unwrap_handle(swapchain), PL_VK_ALLOC);
+}
+
+static bool vk_sw_recreate(pl_swapchain sw, int w, int h)
+{   // (Re)creates the swapchain and its image wrappers; a 0 for w/h = derive it
+    pl_gpu gpu = sw->gpu;
+    struct priv *p = PL_PRIV(sw);
+    struct vk_ctx *vk = p->vk;
+
+    VkImage *vkimages = NULL;
+    uint32_t num_images = 0;
+
+    if (!update_swapchain_info(p, &p->protoInfo, w, h))
+        return false; // the existing swapchain (if any) is left untouched
+
+    VkSwapchainCreateInfoKHR sinfo = p->protoInfo;
+#ifdef VK_EXT_full_screen_exclusive
+    // Explicitly disallow full screen exclusive mode if possible
+    static const VkSurfaceFullScreenExclusiveInfoEXT fsinfo = {
+        .sType = VK_STRUCTURE_TYPE_SURFACE_FULL_SCREEN_EXCLUSIVE_INFO_EXT,
+        .fullScreenExclusive = VK_FULL_SCREEN_EXCLUSIVE_DISALLOWED_EXT,
+    };
+    if (vk->AcquireFullScreenExclusiveModeEXT)
+        vk_link_struct(&sinfo, &fsinfo);
+#endif
+
+    p->suboptimal = false;
+    p->needs_recreate = false;
+    p->cur_width = sinfo.imageExtent.width;
+    p->cur_height = sinfo.imageExtent.height;
+
+    PL_DEBUG(sw, "(Re)creating swapchain of size %dx%d",
+             sinfo.imageExtent.width,
+             sinfo.imageExtent.height);
+
+#ifdef PL_HAVE_UNIX
+    if (vk->props.vendorID == VK_VENDOR_ID_NVIDIA) { // NOTE(review): rationale not stated here
+        vk->DeviceWaitIdle(vk->dev);
+        vk_wait_idle(vk);
+    }
+#endif
+
+    // Calling `vkCreateSwapchainKHR` puts sinfo.oldSwapchain into a retired
+    // state whether the call succeeds or not, so we always need to garbage
+    // collect it afterwards - asynchronously as it may still be in use
+    sinfo.oldSwapchain = p->swapchain;
+    p->swapchain = VK_NULL_HANDLE;
+    VkResult res = vk->CreateSwapchainKHR(vk->dev, &sinfo, PL_VK_ALLOC, &p->swapchain);
+    vk_dev_callback(vk, (vk_cb) destroy_swapchain, vk, vk_wrap_handle(sinfo.oldSwapchain));
+    PL_VK_ASSERT(res, "vk->CreateSwapchainKHR(...)"); // checked only after queuing the cleanup
+
+    // Get the new swapchain images
+    VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, NULL));
+    vkimages = pl_calloc_ptr(NULL, num_images, vkimages);
+    VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, vkimages));
+
+    for (int i = 0; i < num_images; i++)
+        PL_VK_NAME(IMAGE, vkimages[i], "swapchain");
+
+    // If needed, allocate some more semaphores
+    while (num_images > p->sems.num) { // the pool only grows, one pair per image
+        VkSemaphore sem_in = VK_NULL_HANDLE, sem_out = VK_NULL_HANDLE;
+        static const VkSemaphoreCreateInfo seminfo = {
+            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+        };
+        VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_in));
+        VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_out)); // NOTE(review): sem_in may leak if this fails
+        PL_VK_NAME(SEMAPHORE, sem_in, "swapchain in");
+        PL_VK_NAME(SEMAPHORE, sem_out, "swapchain out");
+
+        PL_ARRAY_APPEND(sw, p->sems, (struct sem_pair) {
+            .in = sem_in,
+            .out = sem_out,
+        });
+    }
+
+    // Recreate the pl_tex wrappers
+    for (int i = 0; i < p->images.num; i++)
+        pl_tex_destroy(gpu, &p->images.elem[i]);
+    p->images.num = 0;
+
+    for (int i = 0; i < num_images; i++) {
+        const VkExtent2D *ext = &sinfo.imageExtent;
+        pl_tex tex = pl_vulkan_wrap(gpu, pl_vulkan_wrap_params(
+            .image = vkimages[i],
+            .width = ext->width,
+            .height = ext->height,
+            .format = sinfo.imageFormat,
+            .usage = sinfo.imageUsage,
+        ));
+        if (!tex)
+            goto error; // wrappers appended so far stay in `p->images`
+        PL_ARRAY_APPEND(sw, p->images, tex);
+    }
+
+    pl_assert(num_images > 0);
+    int bits = 0;
+
+    // The channel with the most bits is probably the most authoritative about
+    // the actual color information (consider e.g. a2bgr10). Slight downside
+    // in that it results in rounding r/b for e.g. rgb565, but we don't pick
+    // surfaces with fewer than 8 bits anyway, so let's not care for now.
+    pl_fmt fmt = p->images.elem[0]->params.format;
+    for (int i = 0; i < fmt->num_components; i++)
+        bits = PL_MAX(bits, fmt->component_depth[i]);
+
+    p->color_repr.bits.sample_depth = bits;
+    p->color_repr.bits.color_depth = bits;
+
+    // Note: `p->color_space.hdr` is (re-)applied by `set_hdr_metadata`
+    map_color_space(sinfo.imageColorSpace, &p->color_space); // result ignored
+
+    // Forcibly re-apply HDR metadata, bypassing the no-op check
+    struct pl_hdr_metadata metadata = p->hdr_metadata;
+    p->hdr_metadata = pl_hdr_metadata_empty;
+    set_hdr_metadata(p, &metadata);
+
+    pl_free(vkimages);
+    return true;
+
+error:
+    PL_ERR(vk, "Failed (re)creating swapchain!");
+    pl_free(vkimages);
+    vk->DestroySwapchainKHR(vk->dev, p->swapchain, PL_VK_ALLOC);
+    p->swapchain = VK_NULL_HANDLE; // makes vk_sw_start_frame retry the recreation
+    p->cur_width = p->cur_height = 0;
+    return false;
+}
+
+static bool vk_sw_start_frame(pl_swapchain sw,
+                              struct pl_swapchain_frame *out_frame)
+{   // Acquires the next image into `*out_frame`; the lock stays held on success
+    struct priv *p = PL_PRIV(sw);
+    struct vk_ctx *vk = p->vk;
+    pl_mutex_lock(&p->lock);
+
+    bool recreate = !p->swapchain || p->needs_recreate;
+    if (p->suboptimal && !p->params.allow_suboptimal)
+        recreate = true;
+
+    if (recreate && !vk_sw_recreate(sw, 0, 0)) { // 0x0: let the size be derived
+        pl_mutex_unlock(&p->lock);
+        return false;
+    }
+
+    VkSemaphore sem_in = p->sems.elem[p->idx_sems].in;
+    PL_TRACE(vk, "vkAcquireNextImageKHR signals 0x%"PRIx64, (uint64_t) sem_in);
+
+    for (int attempts = 0; attempts < 2; attempts++) { // i.e. at most one retry
+        uint32_t imgidx = 0;
+        VkResult res = vk->AcquireNextImageKHR(vk->dev, p->swapchain, UINT64_MAX,
+                                               sem_in, VK_NULL_HANDLE, &imgidx);
+
+        switch (res) {
+        case VK_SUBOPTIMAL_KHR:
+            p->suboptimal = true; // may trigger a recreation on the next call
+            // fall through
+        case VK_SUCCESS:
+            p->last_imgidx = imgidx; // consumed by vk_sw_submit_frame
+            pl_vulkan_release_ex(sw->gpu, pl_vulkan_release_params(
+                .tex        = p->images.elem[imgidx],
+                .layout     = VK_IMAGE_LAYOUT_UNDEFINED,
+                .qf         = VK_QUEUE_FAMILY_IGNORED,
+                .semaphore  = { sem_in },
+            ));
+            *out_frame = (struct pl_swapchain_frame) {
+                .fbo = p->images.elem[imgidx],
+                .flipped = false,
+                .color_repr = p->color_repr,
+                .color_space = p->color_space,
+            };
+            // keep lock held
+            return true;
+
+        case VK_ERROR_OUT_OF_DATE_KHR: {
+            // In these cases try recreating the swapchain
+            if (!vk_sw_recreate(sw, 0, 0)) {
+                pl_mutex_unlock(&p->lock);
+                return false;
+            }
+            continue; // retry the acquire on the new swapchain
+        }
+
+        default:
+            PL_ERR(vk, "Failed acquiring swapchain image: %s", vk_res_str(res));
+            pl_mutex_unlock(&p->lock);
+            return false;
+        }
+    }
+
+    // If we've exhausted the number of attempts to recreate the swapchain,
+    // just give up silently and let the user retry some time later.
+    pl_mutex_unlock(&p->lock);
+    return false;
+}
+
+static void present_cb(struct priv *p, void *arg)
+{
+    (void) pl_rc_deref(&p->frames_in_flight); // undoes submit_frame's pl_rc_ref
+}
+
+static bool vk_sw_submit_frame(pl_swapchain sw)
+{
+    // Presents the image acquired by vk_sw_start_frame and releases `p->lock`,
+    // which is still held from there. An out-of-date swapchain is not an error.
+    pl_gpu gpu = sw->gpu;
+    struct priv *p = PL_PRIV(sw);
+    struct vk_ctx *vk = p->vk;
+    pl_assert(p->last_imgidx >= 0);
+    pl_assert(p->swapchain);
+    uint32_t idx = p->last_imgidx;
+    VkSemaphore sem_out = p->sems.elem[p->idx_sems++].out;
+    p->idx_sems %= p->sems.num;
+    p->last_imgidx = -1;
+
+    bool held = pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+        .tex        = p->images.elem[idx],
+        .layout     = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
+        .qf         = VK_QUEUE_FAMILY_IGNORED,
+        .semaphore  = { sem_out },
+    ));
+    if (!held) {
+        PL_ERR(gpu, "Failed holding swapchain image for presentation");
+        pl_mutex_unlock(&p->lock);
+        return false;
+    }
+
+    struct vk_cmd *cmd = pl_vk_steal_cmd(gpu);
+    if (!cmd) {
+        pl_mutex_unlock(&p->lock);
+        return false;
+    }
+    // Track the frame as in flight; `present_cb` drops this reference again
+    pl_rc_ref(&p->frames_in_flight);
+    vk_cmd_callback(cmd, (vk_cb) present_cb, p, NULL);
+    if (!vk_cmd_submit(&cmd)) {
+        pl_mutex_unlock(&p->lock);
+        return false;
+    }
+    // Remember the current graphics queue before vk_rotate_queues() is called
+    struct vk_cmdpool *pool = vk->pool_graphics;
+    int qidx = pool->idx_queues;
+    VkQueue queue = pool->queues[qidx];
+    vk_rotate_queues(p->vk);
+    vk_malloc_garbage_collect(vk->ma);
+
+    VkPresentInfoKHR pinfo = {
+        .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
+        .waitSemaphoreCount = 1,
+        .pWaitSemaphores = &sem_out,
+        .swapchainCount = 1,
+        .pSwapchains = &p->swapchain,
+        .pImageIndices = &idx,
+    };
+
+    PL_TRACE(vk, "vkQueuePresentKHR waits on 0x%"PRIx64, (uint64_t) sem_out);
+    vk->lock_queue(vk->queue_ctx, pool->qf, qidx);
+    VkResult res = vk->QueuePresentKHR(queue, &pinfo);
+    vk->unlock_queue(vk->queue_ctx, pool->qf, qidx);
+    if (res == VK_SUBOPTIMAL_KHR)
+        p->suboptimal = true; // shared flag: must be written before unlocking
+    pl_mutex_unlock(&p->lock);
+
+    switch (res) {
+    case VK_SUBOPTIMAL_KHR:
+    case VK_SUCCESS:
+        return true;
+
+    case VK_ERROR_OUT_OF_DATE_KHR:
+        // We can silently ignore this error, since the next start_frame will
+        // recreate the swapchain automatically.
+        return true;
+
+    default:
+        PL_ERR(vk, "Failed presenting to queue %p: %s", (void *) queue,
+               vk_res_str(res));
+        return false;
+    }
+}
+
+static void vk_sw_swap_buffers(pl_swapchain sw)
+{
+    struct priv *p = PL_PRIV(sw);
+    for (;;) { // throttle until fewer than `swapchain_depth` frames are in flight
+        pl_mutex_lock(&p->lock);
+        bool full = pl_rc_count(&p->frames_in_flight) >= p->swapchain_depth;
+        pl_mutex_unlock(&p->lock); // never block while holding the mutex
+        if (!full)
+            return;
+        vk_poll_commands(p->vk, UINT64_MAX);
+    }
+}
+
+static bool vk_sw_resize(pl_swapchain sw, int *width, int *height)
+{
+    struct priv *p = PL_PRIV(sw);
+    pl_mutex_lock(&p->lock);
+    // Rebuild when flagged/suboptimal, or when a nonzero requested dimension
+    // differs from the current one (a request of 0 never counts as a change).
+    bool stale = p->suboptimal || p->needs_recreate;
+    if (*width && *width != p->cur_width)
+        stale = true;
+    if (*height && *height != p->cur_height)
+        stale = true;
+    bool ok = !stale || vk_sw_recreate(sw, *width, *height);
+
+    // Hand p->cur_width / p->cur_height back to the caller in every case
+    *width = p->cur_width;
+    *height = p->cur_height;
+    pl_mutex_unlock(&p->lock);
+    return ok;
+}
+
+static void vk_sw_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp)
+{
+    struct priv *p = PL_PRIV(sw);
+    pl_mutex_lock(&p->lock);
+    // Pick a surface format for the hinted color space, then pass on its HDR part.
+    // This should never fail if the swapchain already exists
+    bool ok = pick_surf_format(sw, csp);
+    set_hdr_metadata(p, &csp->hdr);
+    pl_assert(ok);
+
+    pl_mutex_unlock(&p->lock);
+}
+
+bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw)
+{
+    struct priv *p = PL_PRIV(sw);
+    return p->suboptimal; // plain read of the flag, `p->lock` is not taken here
+}
+
+static const struct pl_sw_fns vulkan_swapchain = {
+    .destroy            = vk_sw_destroy,
+    .latency            = vk_sw_latency,
+    .resize             = vk_sw_resize,
+    .colorspace_hint    = vk_sw_colorspace_hint,
+    .start_frame        = vk_sw_start_frame,  // returns with the lock held on success
+    .submit_frame       = vk_sw_submit_frame, // releases the lock kept by start_frame
+    .swap_buffers       = vk_sw_swap_buffers,
+};
diff --git a/src/vulkan/utils.c b/src/vulkan/utils.c
new file mode 100644
index 0000000..914f9e4
--- /dev/null
+++ b/src/vulkan/utils.c
@@ -0,0 +1,181 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "utils.h"
+
+VkExternalMemoryHandleTypeFlagBitsKHR
+vk_mem_handle_type(enum pl_handle_type handle_type)
+{
+    if (!handle_type)
+        return 0;
+    // Map each pl_handle_type onto its VK_EXTERNAL_MEMORY_HANDLE_TYPE_* bit
+    switch (handle_type) {
+    case PL_HANDLE_FD:
+        return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+    case PL_HANDLE_WIN32:
+        return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR;
+    case PL_HANDLE_WIN32_KMT:
+        return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR;
+    case PL_HANDLE_DMA_BUF:
+        return VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT;
+    case PL_HANDLE_HOST_PTR:
+        return VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
+    case PL_HANDLE_MTL_TEX:
+    case PL_HANDLE_IOSURFACE:
+        return 0; // no external memory handle bit is returned for these
+    }
+
+    pl_unreachable(); // only reached by values matching none of the cases
+}
+
+VkExternalSemaphoreHandleTypeFlagBitsKHR
+vk_sync_handle_type(enum pl_handle_type handle_type)
+{
+    if (!handle_type)
+        return 0;
+    // Map each pl_handle_type onto its VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_* bit
+    switch (handle_type) {
+    case PL_HANDLE_FD:
+        return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+    case PL_HANDLE_WIN32:
+        return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR;
+    case PL_HANDLE_WIN32_KMT:
+        return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR;
+    case PL_HANDLE_DMA_BUF:
+    case PL_HANDLE_HOST_PTR:
+    case PL_HANDLE_MTL_TEX:
+    case PL_HANDLE_IOSURFACE:
+        return 0; // no external semaphore handle bit is returned for these
+    }
+
+    pl_unreachable(); // only reached by values matching none of the cases
+}
+
+bool vk_external_mem_check(struct vk_ctx *vk,
+                           const VkExternalMemoryPropertiesKHR *props,
+                           enum pl_handle_type handle_type,
+                           bool import)
+{
+    const VkExternalMemoryFeatureFlagsKHR feats = props->externalMemoryFeatures;
+    const VkExternalMemoryHandleTypeFlagBitsKHR vk_type = vk_mem_handle_type(handle_type);
+
+    // Importing needs the IMPORTABLE feature bit ...
+    if (import && !(feats & VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_KHR)) {
+        PL_DEBUG(vk, "Handle type %s (0x%x) is not importable",
+                 vk_handle_name(vk_type), (unsigned int) handle_type);
+        return false;
+    }
+
+    // ... while everything else is checked against the EXPORTABLE bit
+    if (!import && !(feats & VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_KHR)) {
+        PL_DEBUG(vk, "Handle type %s (0x%x) is not exportable",
+                 vk_handle_name(vk_type), (unsigned int) handle_type);
+        return false;
+    }
+
+    return true;
+}
+
+const enum pl_handle_type vk_mem_handle_list[] = {
+        PL_HANDLE_HOST_PTR, // listed unconditionally, unlike the entries below
+#ifdef PL_HAVE_UNIX
+        PL_HANDLE_FD,
+        PL_HANDLE_DMA_BUF,
+#endif
+#ifdef PL_HAVE_WIN32
+        PL_HANDLE_WIN32,
+        PL_HANDLE_WIN32_KMT,
+#endif
+        0 // the list always ends with a zero entry
+};
+
+const enum pl_handle_type vk_sync_handle_list[] = {
+#ifdef PL_HAVE_UNIX
+        PL_HANDLE_FD,
+#endif
+#ifdef PL_HAVE_WIN32
+        PL_HANDLE_WIN32,
+        PL_HANDLE_WIN32_KMT,
+#endif
+        0 // closing zero entry; the only one if neither macro above is defined
+};
+
+const void *vk_find_struct(const void *chain, VkStructureType stype)
+{
+    // Follow the pNext links and stop at the first struct whose sType is the
+    // requested one; the chain head itself is a candidate as well.
+    for (const VkBaseInStructure *cur = chain; cur; cur = cur->pNext) {
+        if (cur->sType == stype)
+            return cur;
+    }
+
+    // Ran off the end of the chain (or `chain` was NULL to begin with)
+    return NULL;
+}
+
+void vk_link_struct(void *chain, const void *in)
+{
+    // Linking nothing is a no-op; otherwise `in` is hooked onto the chain's tail.
+    // `chain` itself is dereferenced without a NULL check.
+    if (in) {
+        VkBaseOutStructure *tail;
+        for (tail = chain; tail->pNext; tail = tail->pNext)
+            ; // seek the last element
+        tail->pNext = (void *) in;
+    }
+}
+
+void *vk_struct_memdup(void *alloc, const void *pin)
+{
+    if (!pin)
+        return NULL;
+    // The number of bytes to copy is looked up from the struct's own sType
+    const VkBaseInStructure *in = pin;
+    size_t size = vk_struct_size(in->sType);
+    pl_assert(size); // vk_struct_size() reports unknown structs as 0
+    // Duplicate this one struct only; the copy's pNext link is cleared
+    VkBaseOutStructure *out = pl_memdup(alloc, in, size);
+    out->pNext = NULL;
+    return out;
+}
+
+void *vk_chain_memdup(void *alloc, const void *pin)
+{
+    VkBaseOutStructure *head = NULL, **link = &head;
+    // Duplicate the structs front to back; `link` always points at the pNext
+    // slot (or `head`) that the next copy has to be stored into.
+    for (const VkBaseInStructure *in = pin; in; in = in->pNext) {
+        *link = vk_struct_memdup(alloc, in);
+        pl_assert(*link);
+        link = &(*link)->pNext;
+    }
+    return head;
+}
+
+void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype)
+{
+    VkBaseOutStructure *out = chain; // `chain` is the (non-NULL) head struct
+    while (out->sType != stype && out->pNext)
+        out = out->pNext;
+    if (out->sType == stype)
+        return out; // already part of the chain: hand back the existing struct
+    size_t size = vk_struct_size(stype);
+    pl_assert(size); // unknown sType: never write into a zero-byte allocation
+    VkBaseOutStructure *s = pl_zalloc(alloc, size);
+    s->sType = stype;
+    return out->pNext = s; // `out` is the tail here, so this appends
+}
diff --git a/src/vulkan/utils.h b/src/vulkan/utils.h
new file mode 100644
index 0000000..cb1c5f5
--- /dev/null
+++ b/src/vulkan/utils.h
@@ -0,0 +1,136 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// Return a human-readable name for various vulkan enums
+const char *vk_res_str(VkResult res);
+const char *vk_fmt_name(VkFormat fmt);
+const char *vk_csp_name(VkColorSpaceKHR csp);
+const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle);
+const char *vk_obj_type(VkObjectType obj);
+const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha);
+const char *vk_surface_transform(VkSurfaceTransformFlagsKHR transform);
+
+// Return the size of an arbitrary vulkan struct. Returns 0 for unknown structs
+size_t vk_struct_size(VkStructureType stype);
+
+// Returns the vulkan API version which a given extension was promoted to, or 0
+// if the extension is not promoted.
+uint32_t vk_ext_promoted_ver(const char *extension);
+
+// Enum translation boilerplate
+VkExternalMemoryHandleTypeFlagBitsKHR vk_mem_handle_type(enum pl_handle_type);
+VkExternalSemaphoreHandleTypeFlagBitsKHR vk_sync_handle_type(enum pl_handle_type);
+
+// Bitmask of all access flags that imply a read/write operation, respectively
+extern const VkAccessFlags2 vk_access_read;
+extern const VkAccessFlags2 vk_access_write;
+
+// Check for compatibility of a VkExternalMemoryProperties
+bool vk_external_mem_check(struct vk_ctx *vk,
+                           const VkExternalMemoryPropertiesKHR *props,
+                           enum pl_handle_type handle_type,
+                           bool check_import);
+
+// Static lists of external handle types we should try probing for
+extern const enum pl_handle_type vk_mem_handle_list[];
+extern const enum pl_handle_type vk_sync_handle_list[];
+
+// Find a structure in a pNext chain, or NULL
+const void *vk_find_struct(const void *chain, VkStructureType stype);
+
+// Link a structure into a pNext chain
+void vk_link_struct(void *chain, const void *in);
+
+// Make a copy of a structure, not including the pNext chain
+void *vk_struct_memdup(void *alloc, const void *in);
+
+// Make a deep copy of an entire pNext chain
+void *vk_chain_memdup(void *alloc, const void *in);
+
+// Find a structure in a pNext chain, or allocate + link it if absent.
+void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype);
+
+// Renormalize input features into a state consistent for a given API version.
+// If `api_ver` is specified as 0, *both* meta-structs and extension structs
+// will be emitted. Note: `out` should be initialized by the user. In
+// particular, if it already contains a valid features chain, then this
+// function will effectively act as a union.
+void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *in,
+                           uint32_t api_ver, VkPhysicalDeviceFeatures2 *out);
+
+// Convenience macros to simplify a lot of common boilerplate
+#define PL_VK_ASSERT(res, str)                            \
+    do {                                                  \
+        if ((res) != VK_SUCCESS) {                        \
+            PL_ERR(vk, str ": %s (%s:%d)", /* `str`: string literal */ \
+                   vk_res_str(res), __FILE__, __LINE__);  \
+            goto error; /* needs `vk` and `error:` in scope */ \
+        }                                                 \
+    } while (0)
+
+#define VK(cmd)                                           \
+    do {                                                  \
+        PL_TRACE(vk, #cmd); /* logs the call's source text */ \
+        VkResult _res = (cmd);                            \
+        PL_VK_ASSERT(_res, #cmd); /* `goto error` unless VK_SUCCESS */ \
+    } while (0)
+
+#define PL_VK_NAME(type, obj, name)                                             \
+    do {                                                                        \
+        if (vk->SetDebugUtilsObjectNameEXT) { /* no-op if the pointer is unset */ \
+            vk->SetDebugUtilsObjectNameEXT(vk->dev, &(VkDebugUtilsObjectNameInfoEXT) { \
+                .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT,    \
+                .objectType = VK_OBJECT_TYPE_##type, /* `type` is the suffix */ \
+                .objectHandle = (uint64_t) (obj),                               \
+                .pObjectName = (name),                                          \
+            });                                                                 \
+        }                                                                       \
+    } while (0)
+
+// Variant of PL_VK_NAME for dispatchable handles (cast via uintptr_t first)
+#define PL_VK_NAME_HANDLE(type, obj, name) \
+    PL_VK_NAME(type, (uintptr_t) (obj), name)
+
+// Helper functions to wrap and unwrap non-dispatchable handles into pointers.
+// Note that wrap/unwrap must always be used linearly (the fallback mallocs/frees).
+#if VK_USE_64_BIT_PTR_DEFINES == 1 // the handle is passed through unchanged
+#define vk_wrap_handle(h) (h)
+#define vk_unwrap_handle(h) (h)
+#elif UINTPTR_MAX >= UINT64_MAX // a pointer is wide enough for the 64-bit value
+#define vk_wrap_handle(h) ((void *) (uintptr_t) (h))
+#define vk_unwrap_handle(h) ((uint64_t) (uintptr_t) (h))
+#else // otherwise the 64-bit value is boxed on the heap
+static inline void *vk_wrap_handle(uint64_t h)
+{
+    uint64_t *wrapper = malloc(sizeof(h));
+    assert(wrapper);
+    *wrapper = h;
+    return wrapper; // owned by the caller until passed to vk_unwrap_handle
+}
+
+static inline uint64_t vk_unwrap_handle(void *h)
+{
+    uint64_t *wrapper = h;
+    uint64_t ret = *wrapper;
+    free(wrapper); // `h` must not be used (or unwrapped) again after this
+    return ret;
+}
+#endif
diff --git a/src/vulkan/utils_gen.c.j2 b/src/vulkan/utils_gen.c.j2
new file mode 100644
index 0000000..6db0454
--- /dev/null
+++ b/src/vulkan/utils_gen.c.j2
@@ -0,0 +1,137 @@
+#define VK_ENABLE_BETA_EXTENSIONS
+#include "vulkan/utils.h"
+
+const char *vk_res_str(VkResult res)
+{
+    switch (res) { // one case per entry of the `vkresults` template variable
+{% for res in vkresults %}
+    case {{ res }}: return "{{ res }}";
+{% endfor %}
+
+    default: return "unknown error"; // no case was generated for this value
+    }
+}
+
+const char *vk_fmt_name(VkFormat fmt)
+{
+    switch (fmt) { // one case per entry of the `vkformats` template variable
+{% for fmt in vkformats %}
+    case {{ fmt }}: return "{{ fmt }}";
+{% endfor %}
+
+    default: return "unknown format"; // no case was generated for this value
+    }
+}
+
+const char *vk_csp_name(VkColorSpaceKHR csp)
+{
+    switch (csp) { // one case per entry of the `vkspaces` template variable
+{% for csp in vkspaces %}
+    case {{ csp }}: return "{{ csp }}";
+{% endfor %}
+
+    default: return "unknown color space"; // no case was generated for this value
+    }
+}
+
+const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle)
+{
+    switch (handle) { // one case per entry of the `vkhandles` template variable
+{% for handle in vkhandles %}
+    case {{ handle }}: return "{{ handle }}";
+{% endfor %}
+
+    default: return "unknown handle type"; // includes 0 and combined bits
+    }
+}
+
+const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha)
+{
+    switch (alpha) { // one case per entry of the `vkalphas` template variable
+{% for mode in vkalphas %}
+    case {{ mode }}: return "{{ mode }}";
+{% endfor %}
+
+    default: return "unknown alpha mode"; // anything without a generated case
+    }
+}
+
+const char *vk_surface_transform(VkSurfaceTransformFlagsKHR tf)
+{
+    switch (tf) { // one case per entry of the `vktransforms` template variable
+{% for tf in vktransforms %}
+    case {{ tf }}: return "{{ tf }}";
+{% endfor %}
+
+    default: return "unknown surface transform"; // anything without a generated case
+    }
+}
+
+
+const char *vk_obj_type(VkObjectType obj)
+{
+    switch (obj) { // each `vkobjects` entry pairs an enum value with a name
+{% for obj in vkobjects %}
+    case {{ obj.enum }}: return "{{ obj.name }}";
+{% endfor %}
+
+    default: return "unknown object"; // no case was generated for this value
+    }
+}
+
+size_t vk_struct_size(VkStructureType stype)
+{
+    switch (stype) { // each `vkstructs` entry maps an sType to sizeof(struct)
+{% for struct in vkstructs %}
+    case {{ struct.stype }}: return sizeof({{ struct.name }});
+{% endfor %}
+
+    default: return 0; // sType without a generated case
+    }
+}
+
+uint32_t vk_ext_promoted_ver(const char *extension)
+{
+{% for ext in vkexts %}
+{%  if ext.promoted_ver %}
+    if (!strcmp(extension, "{{ ext.name }}"))
+        return {{ ext.promoted_ver }};
+{%  endif %}
+{% endfor %}
+    return 0; // none of the generated comparisons matched `extension`
+}
+
+void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *fin,
+                           uint32_t api_ver, VkPhysicalDeviceFeatures2 *out)
+{
+    for (const VkBaseInStructure *in = (void *) fin; in; in = in->pNext) { // every input struct
+        switch (in->sType) {
+        default: break; // structs without a generated case are ignored
+{% for fs in vkfeatures %}
+        case {{ fs.stype }}: {
+            const {{ fs.name }} *i = (const void *) in;
+{% for f in fs.features %}
+            if (i->{{ f.name }}) {
+{% for r in f.replacements %}
+{% if r.core_ver %}
+               if (!api_ver || api_ver >= {{ r.core_ver }})
+{% elif r.max_ver %}
+               if (!api_ver || api_ver < {{ r.max_ver }})
+{% endif %}
+{% if fs.is_base %}
+                out->{{ f.name }} = true;
+{% else %}
+                (({{ r.name }} *) vk_chain_alloc(alloc, out, {{ r.stype }}))->{{ f.name }} = true;
+{% endif %}
+{% endfor %}
+            }
+{% endfor %}
+            break;
+        }
+{% endfor %}
+        }
+    }
+}
+
+const VkAccessFlags2 vk_access_read = {{ '0x%x' % vkaccess.read }}LLU; // `vkaccess.read`, as hex
+const VkAccessFlags2 vk_access_write = {{ '0x%x' % vkaccess.write }}LLU; // `vkaccess.write`, as hex
diff --git a/src/vulkan/utils_gen.py b/src/vulkan/utils_gen.py
new file mode 100644
index 0000000..a8652fd
--- /dev/null
+++ b/src/vulkan/utils_gen.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+#
+# This file is part of libplacebo.
+#
+# libplacebo is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# libplacebo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with libplacebo.  If not, see <http://www.gnu.org/licenses/>.
+
+import os.path
+import re
+import sys
+import xml.etree.ElementTree as ET
+
+try:
+    import jinja2
+except ModuleNotFoundError:
+    print('Module \'jinja2\' not found, please install \'python3-Jinja2\' or '
+          'an equivalent package on your system! Alternatively, run '
+          '`git submodule update --init` followed by `meson --wipe`.',
+          file=sys.stderr)
+    sys.exit(1)
+
+TEMPLATE = jinja2.Environment(
+    loader = jinja2.FileSystemLoader(searchpath=os.path.dirname(__file__)),
+    trim_blocks=True, # strip the newline that follows each {% ... %} block tag
+).get_template('utils_gen.c.j2')
+
+class Obj:
+    def __init__(self, **attrs): # plain attribute bag: one attribute per keyword
+        vars(self).update(attrs)
+
+class VkXML(ET.ElementTree): # registry tree with helpers that hide blacklisted names
+    def blacklist_block(self, req): # blacklist each <type>/<enum> named inside `req`
+        for t in req.iterfind('type'):
+            self.blacklist_types.add(t.attrib['name'])
+        for e in req.iterfind('enum'):
+            self.blacklist_enums.add(e.attrib['name'])
+
+    def __init__(self, *args, **kwargs):
+        # Arguments are forwarded to ET.ElementTree; the blacklists are filled here
+        super().__init__(*args, **kwargs)
+        self.blacklist_types = set()
+        self.blacklist_enums = set()
+
+        for f in self.iterfind('feature'):
+            # Feature block for non-Vulkan API
+            if not 'vulkan' in f.attrib['api'].split(','):
+                for r in f.iterfind('require'):
+                    self.blacklist_block(r)
+
+        for e in self.iterfind('extensions/extension'):
+            # Entire extension is unsupported on vulkan or platform-specific
+            if not 'vulkan' in e.attrib['supported'].split(',') or 'platform' in e.attrib:
+                for r in e.iterfind('require'):
+                    self.blacklist_block(r)
+                continue
+
+            # Only individual <require> blocks are API-specific
+            for r in e.iterfind('require[@api]'):
+                if not 'vulkan' in r.attrib['api'].split(','):
+                    self.blacklist_block(r)
+
+    def findall_enum(self, name):
+        for e in self.iterfind('enums[@name="{0}"]/enum'.format(name)): # values listed under the enum itself
+            if not 'alias' in e.attrib:
+                if not e.attrib['name'] in self.blacklist_enums:
+                    yield e
+        for e in self.iterfind('.//enum[@extends="{0}"]'.format(name)): # values that extend it from elsewhere
+            if not 'alias' in e.attrib:
+                if not e.attrib['name'] in self.blacklist_enums:
+                    yield e
+
+    def findall_type(self, category):
+        for t in self.iterfind('types/type[@category="{0}"]'.format(category)):
+            name = t.attrib.get('name') or t.find('name').text # attribute first, else <name> child
+            if name in self.blacklist_types:
+                continue
+            yield t
+
+
+def get_vkenum(registry, enum):
+    # Lazily yields the `name` attribute of every element findall_enum() returns
+    yield from (e.attrib['name'] for e in registry.findall_enum(enum))
+
+def get_vkobjects(registry):
+    for handle in registry.findall_type('handle'):
+        objtype = handle.attrib.get('objtypeenum')
+        if objtype is not None: # handles lacking this attribute are left out
+            yield Obj(enum = objtype, name = handle.find('name').text)
+
+def get_vkstructs(registry):
+    # Yields an Obj(stype, name) for every struct whose first `sType` member
+    # carries a `values` attribute; all other structs are skipped.
+    for struct in registry.findall_type('struct'):
+        members = struct.iterfind('member')
+        stype = next((m for m in members if m.find('name').text == 'sType'), None)
+        if stype is None:
+            continue
+        values = stype.attrib.get('values')
+        if values is not None:
+            yield Obj(stype = values, name = struct.attrib['name'])
+
+def get_vkaccess(registry):
+    # OR together the bits of all VkAccessFlagBits2 names with _READ_ / _WRITE_
+    read = write = 0
+    for e in registry.findall_enum('VkAccessFlagBits2'):
+        name = e.attrib['name']
+        read |= (1 << int(e.attrib['bitpos'])) if '_READ_' in name else 0
+        write |= (1 << int(e.attrib['bitpos'])) if '_WRITE_' in name else 0
+    return Obj(read = read, write = write)
+
+def get_vkexts(registry):
+    # promotedto="VK_VERSION_x_y" becomes VK_API_VERSION_x_y, anything else None
+    for ext in registry.iterfind('extensions/extension'):
+        m = re.match(r'VK_VERSION_(\d)_(\d)', ext.attrib.get('promotedto', ''))
+        version = 'VK_API_VERSION_{0}_{1}'.format(m[1], m[2]) if m else None
+        yield Obj(name = ext.attrib['name'],
+                  promoted_ver = version)
+
+def get_vkfeatures(registry):
+    # Yields one Obj per feature struct: the base VkPhysicalDeviceFeatures
+    # (reported as VkPhysicalDeviceFeatures2) plus every struct extending it.
+    structs = []
+    featuremap = {} # features -> [struct]
+    for t in registry.findall_type('struct'):
+        sname = t.attrib['name']
+        is_base = sname == 'VkPhysicalDeviceFeatures'
+        extends = t.attrib.get('structextends', '').split(',') # exact names, no substring match
+        if is_base:
+            sname = 'VkPhysicalDeviceFeatures2'
+            stype = 'VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2'
+        elif not 'VkPhysicalDeviceFeatures2' in extends:
+            continue
+
+        features = []
+        for f in t.iterfind('member'):
+            if f.find('type').text == 'VkStructureType':
+                stype = f.attrib['values']
+            elif f.find('type').text == 'VkBool32':
+                fname = f.find('name').text
+                if is_base:
+                    fname = 'features.' + fname
+                features.append(Obj(name = fname))
+        # VkPhysicalDeviceVulkanXYFeatures structs get core_ver VK_API_VERSION_X_Y
+        core_ver = None
+        if res := re.match(r'VkPhysicalDeviceVulkan(\d)(\d)Features', sname):
+            core_ver = 'VK_API_VERSION_{0}_{1}'.format(res[1], res[2])
+        struct = Obj(name       = sname,
+                     stype      = stype,
+                     core_ver   = core_ver,
+                     is_base    = is_base,
+                     features   = features)
+        structs.append(struct)
+        for f in features:
+            featuremap.setdefault(f.name, []).append(struct)
+
+    for s in structs:
+        for f in s.features:
+            f.replacements = featuremap[f.name] # every struct exposing this feature name
+            core_ver = next(( r.core_ver for r in f.replacements if r.core_ver ), None)
+            for r in f.replacements:
+                if not r.core_ver:
+                    r.max_ver = core_ver
+
+    yield from structs
+
+def find_registry_xml(datadir):
+    registry_paths = [ # candidate locations, probed in this order
+        '{0}/vulkan/registry/vk.xml'.format(datadir),
+        '$MINGW_PREFIX/share/vulkan/registry/vk.xml',
+        '%VULKAN_SDK%/share/vulkan/registry/vk.xml',
+        '$VULKAN_SDK/share/vulkan/registry/vk.xml',
+        '/usr/share/vulkan/registry/vk.xml',
+    ]
+    # Expand environment variables in each candidate; the first existing file wins
+    for p in registry_paths:
+        path = os.path.expandvars(p)
+        if os.path.isfile(path):
+            print('Found vk.xml: {0}'.format(path))
+            return path
+    # Nothing matched: explain how to pass the location explicitly, then exit(1)
+    print('Could not find the vulkan registry (vk.xml), please specify its '
+          'location manually using the -Dvulkan-registry=/path/to/vk.xml '
+          'option!', file=sys.stderr)
+    sys.exit(1)
+
+if __name__ == '__main__':
+    # Arguments: <datadir> <path to vk.xml, may be empty> <output file>
+    assert len(sys.argv) == 4
+    datadir = sys.argv[1]
+    xmlfile = sys.argv[2]
+    outfile = sys.argv[3]
+    if not xmlfile: # empty argument: search the well-known locations instead
+        xmlfile = find_registry_xml(datadir)
+
+    registry = VkXML(ET.parse(xmlfile).getroot()) # ElementTree() expects the root Element
+    with open(outfile, 'w') as f:
+        f.write(TEMPLATE.render(
+            vkresults = get_vkenum(registry, 'VkResult'),
+            vkformats = get_vkenum(registry, 'VkFormat'),
+            vkspaces  = get_vkenum(registry, 'VkColorSpaceKHR'),
+            vkhandles = get_vkenum(registry, 'VkExternalMemoryHandleTypeFlagBits'),
+            vkalphas  = get_vkenum(registry, 'VkCompositeAlphaFlagBitsKHR'),
+            vktransforms = get_vkenum(registry, 'VkSurfaceTransformFlagBitsKHR'),
+            vkobjects = get_vkobjects(registry),
+            vkstructs = get_vkstructs(registry),
+            vkaccess = get_vkaccess(registry),
+            vkexts = get_vkexts(registry),
+            vkfeatures = get_vkfeatures(registry),
+        ))