summaryrefslogtreecommitdiffstats
path: root/src/vulkan
diff options
context:
space:
mode:
Diffstat (limited to 'src/vulkan')
-rw-r--r--src/vulkan/command.c571
-rw-r--r--src/vulkan/command.h142
-rw-r--r--src/vulkan/common.h234
-rw-r--r--src/vulkan/context.c1704
-rw-r--r--src/vulkan/formats.c616
-rw-r--r--src/vulkan/formats.h34
-rw-r--r--src/vulkan/gpu.c924
-rw-r--r--src/vulkan/gpu.h175
-rw-r--r--src/vulkan/gpu_buf.c470
-rw-r--r--src/vulkan/gpu_pass.c964
-rw-r--r--src/vulkan/gpu_tex.c1453
-rw-r--r--src/vulkan/malloc.c1058
-rw-r--r--src/vulkan/malloc.h72
-rw-r--r--src/vulkan/meson.build59
-rw-r--r--src/vulkan/stubs.c108
-rw-r--r--src/vulkan/swapchain.c911
-rw-r--r--src/vulkan/utils.c181
-rw-r--r--src/vulkan/utils.h136
-rw-r--r--src/vulkan/utils_gen.c.j2137
-rw-r--r--src/vulkan/utils_gen.py219
20 files changed, 10168 insertions, 0 deletions
diff --git a/src/vulkan/command.c b/src/vulkan/command.c
new file mode 100644
index 0000000..5020aff
--- /dev/null
+++ b/src/vulkan/command.c
@@ -0,0 +1,571 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "command.h"
+#include "utils.h"
+
+// returns VK_SUCCESS (completed), VK_TIMEOUT (not yet completed) or an error
+static VkResult vk_cmd_poll(struct vk_cmd *cmd, uint64_t timeout)
+{
+ struct vk_ctx *vk = cmd->pool->vk;
+ return vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+ .semaphoreCount = 1,
+ .pSemaphores = &cmd->sync.sem,
+ .pValues = &cmd->sync.value,
+ }, timeout);
+}
+
+static void flush_callbacks(struct vk_ctx *vk)
+{
+ while (vk->num_pending_callbacks) {
+ const struct vk_callback *cb = vk->pending_callbacks++;
+ vk->num_pending_callbacks--;
+ cb->run(cb->priv, cb->arg);
+ }
+}
+
+static void vk_cmd_reset(struct vk_cmd *cmd)
+{
+ struct vk_ctx *vk = cmd->pool->vk;
+
+ // Flush possible callbacks left over from a previous command still in the
+ // process of being reset, whose callback triggered this command being
+ // reset.
+ flush_callbacks(vk);
+ vk->pending_callbacks = cmd->callbacks.elem;
+ vk->num_pending_callbacks = cmd->callbacks.num;
+ flush_callbacks(vk);
+
+ cmd->callbacks.num = 0;
+ cmd->deps.num = 0;
+ cmd->sigs.num = 0;
+}
+
+static void vk_cmd_destroy(struct vk_cmd *cmd)
+{
+ if (!cmd)
+ return;
+
+ struct vk_ctx *vk = cmd->pool->vk;
+ vk_cmd_poll(cmd, UINT64_MAX);
+ vk_cmd_reset(cmd);
+ vk->DestroySemaphore(vk->dev, cmd->sync.sem, PL_VK_ALLOC);
+ vk->FreeCommandBuffers(vk->dev, cmd->pool->pool, 1, &cmd->buf);
+
+ pl_free(cmd);
+}
+
+static struct vk_cmd *vk_cmd_create(struct vk_cmdpool *pool)
+{
+ struct vk_ctx *vk = pool->vk;
+ struct vk_cmd *cmd = pl_zalloc_ptr(NULL, cmd);
+ cmd->pool = pool;
+
+ VkCommandBufferAllocateInfo ainfo = {
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+ .commandPool = pool->pool,
+ .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+ .commandBufferCount = 1,
+ };
+
+ VK(vk->AllocateCommandBuffers(vk->dev, &ainfo, &cmd->buf));
+
+ static const VkSemaphoreTypeCreateInfo stinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+ .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
+ .initialValue = 0,
+ };
+
+ static const VkSemaphoreCreateInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+ .pNext = &stinfo,
+ };
+
+ VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &cmd->sync.sem));
+ PL_VK_NAME(SEMAPHORE, cmd->sync.sem, "cmd");
+
+ return cmd;
+
+error:
+ vk_cmd_destroy(cmd);
+ vk->failed = true;
+ return NULL;
+}
+
+void vk_dev_callback(struct vk_ctx *vk, vk_cb callback,
+ const void *priv, const void *arg)
+{
+ pl_mutex_lock(&vk->lock);
+ if (vk->cmds_pending.num > 0) {
+ struct vk_cmd *last_cmd = vk->cmds_pending.elem[vk->cmds_pending.num - 1];
+ vk_cmd_callback(last_cmd, callback, priv, arg);
+ } else {
+ // The device was already idle, so we can just immediately call it
+ callback((void *) priv, (void *) arg);
+ }
+ pl_mutex_unlock(&vk->lock);
+}
+
+void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback,
+ const void *priv, const void *arg)
+{
+ PL_ARRAY_APPEND(cmd, cmd->callbacks, (struct vk_callback) {
+ .run = callback,
+ .priv = (void *) priv,
+ .arg = (void *) arg,
+ });
+}
+
+void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep)
+{
+ PL_ARRAY_APPEND(cmd, cmd->deps, (VkSemaphoreSubmitInfo) {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+ .semaphore = dep.sem,
+ .value = dep.value,
+ .stageMask = stage,
+ });
+}
+
+void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig)
+{
+ VkSemaphoreSubmitInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
+ .semaphore = sig.sem,
+ .value = sig.value,
+ .stageMask = stage,
+ };
+
+ // Try updating existing semaphore signal operations in-place
+ for (int i = 0; i < cmd->sigs.num; i++) {
+ if (cmd->sigs.elem[i].semaphore == sig.sem) {
+ pl_assert(sig.value > cmd->sigs.elem[i].value);
+ cmd->sigs.elem[i] = sinfo;
+ return;
+ }
+ }
+
+ PL_ARRAY_APPEND(cmd, cmd->sigs, sinfo);
+}
+
+#define SET(FLAG, CHECK) \
+ if (flags2 & (CHECK)) \
+ flags |= FLAG
+
+static VkAccessFlags lower_access2(VkAccessFlags2 flags2)
+{
+ VkAccessFlags flags = flags2 & VK_ACCESS_FLAG_BITS_MAX_ENUM;
+ SET(VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_READ_BIT);
+ SET(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT);
+ return flags;
+}
+
+static VkPipelineStageFlags lower_stage2(VkPipelineStageFlags2 flags2)
+{
+ VkPipelineStageFlags flags = flags2 & VK_PIPELINE_STAGE_FLAG_BITS_MAX_ENUM;
+ SET(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_2_COPY_BIT |
+ VK_PIPELINE_STAGE_2_RESOLVE_BIT |
+ VK_PIPELINE_STAGE_2_BLIT_BIT |
+ VK_PIPELINE_STAGE_2_CLEAR_BIT);
+ SET(VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
+ VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT);
+ return flags;
+}
+
+#undef SET
+
+void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info)
+{
+ struct vk_ctx *vk = cmd->pool->vk;
+ if (vk->CmdPipelineBarrier2KHR) {
+ vk->CmdPipelineBarrier2KHR(cmd->buf, info);
+ return;
+ }
+
+ pl_assert(!info->pNext);
+ pl_assert(info->memoryBarrierCount == 0);
+ pl_assert(info->bufferMemoryBarrierCount + info->imageMemoryBarrierCount == 1);
+
+ if (info->bufferMemoryBarrierCount) {
+
+ const VkBufferMemoryBarrier2 *barr2 = info->pBufferMemoryBarriers;
+ const VkBufferMemoryBarrier barr = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+ .pNext = barr2->pNext,
+ .srcAccessMask = lower_access2(barr2->srcAccessMask),
+ .dstAccessMask = lower_access2(barr2->dstAccessMask),
+ .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex,
+ .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex,
+ .buffer = barr2->buffer,
+ .offset = barr2->offset,
+ .size = barr2->size,
+ };
+
+ vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask),
+ lower_stage2(barr2->dstStageMask),
+ info->dependencyFlags,
+ 0, NULL, 1, &barr, 0, NULL);
+
+ } else {
+
+ const VkImageMemoryBarrier2 *barr2 = info->pImageMemoryBarriers;
+ const VkImageMemoryBarrier barr = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+ .pNext = barr2->pNext,
+ .srcAccessMask = lower_access2(barr2->srcAccessMask),
+ .dstAccessMask = lower_access2(barr2->dstAccessMask),
+ .oldLayout = barr2->oldLayout,
+ .newLayout = barr2->newLayout,
+ .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex,
+ .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex,
+ .image = barr2->image,
+ .subresourceRange = barr2->subresourceRange,
+ };
+
+ vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask),
+ lower_stage2(barr2->dstStageMask),
+ info->dependencyFlags,
+ 0, NULL, 0, NULL, 1, &barr);
+ }
+}
+
+struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem,
+ VkPipelineStageFlags2 stage,
+ VkAccessFlags2 access, bool is_trans)
+{
+ bool is_write = (access & vk_access_write) || is_trans;
+
+ // Writes need to be synchronized against the last *read* (which is
+ // transitively synchronized against the last write), reads only
+ // need to be synchronized against the last write.
+ struct vk_sync_scope last = sem->write;
+ if (is_write && sem->read.access)
+ last = sem->read;
+
+ if (last.queue != cmd->queue) {
+ if (!is_write && sem->read.queue == cmd->queue) {
+ // No semaphore needed in this case because the implicit submission
+ // order execution dependencies already transitively imply a wait
+ // for the previous write
+ } else if (last.sync.sem) {
+ // Image barrier still needs to depend on this stage for implicit
+ // ordering guarantees to apply properly
+ vk_cmd_dep(cmd, stage, last.sync);
+ last.stage = stage;
+ }
+
+ // Last access is on different queue, so no pipeline barrier needed
+ last.access = 0;
+ }
+
+ if (!is_write && sem->read.queue == cmd->queue &&
+ (sem->read.stage & stage) == stage &&
+ (sem->read.access & access) == access)
+ {
+ // A past pipeline barrier already covers this access transitively, so
+ // we don't need to emit another pipeline barrier at all
+ last.access = 0;
+ }
+
+ if (is_write) {
+ sem->write = (struct vk_sync_scope) {
+ .sync = cmd->sync,
+ .queue = cmd->queue,
+ .stage = stage,
+ .access = access,
+ };
+
+ sem->read = (struct vk_sync_scope) {
+ .sync = cmd->sync,
+ .queue = cmd->queue,
+ // no stage or access scope, because no reads happened yet
+ };
+ } else if (sem->read.queue == cmd->queue) {
+ // Coalesce multiple same-queue reads into a single access scope
+ sem->read.sync = cmd->sync;
+ sem->read.stage |= stage;
+ sem->read.access |= access;
+ } else {
+ sem->read = (struct vk_sync_scope) {
+ .sync = cmd->sync,
+ .queue = cmd->queue,
+ .stage = stage,
+ .access = access,
+ };
+ }
+
+ // We never need to include pipeline barriers for reads, only writes
+ last.access &= vk_access_write;
+ return last;
+}
+
+struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum,
+ VkQueueFamilyProperties props)
+{
+ struct vk_cmdpool *pool = pl_alloc_ptr(NULL, pool);
+ *pool = (struct vk_cmdpool) {
+ .vk = vk,
+ .props = props,
+ .qf = qf,
+ .queues = pl_calloc(pool, qnum, sizeof(VkQueue)),
+ .num_queues = qnum,
+ };
+
+ for (int n = 0; n < qnum; n++)
+ vk->GetDeviceQueue(vk->dev, qf, n, &pool->queues[n]);
+
+ VkCommandPoolCreateInfo cinfo = {
+ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+ .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
+ VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+ .queueFamilyIndex = qf,
+ };
+
+ VK(vk->CreateCommandPool(vk->dev, &cinfo, PL_VK_ALLOC, &pool->pool));
+ return pool;
+
+error:
+ vk_cmdpool_destroy(pool);
+ vk->failed = true;
+ return NULL;
+}
+
+void vk_cmdpool_destroy(struct vk_cmdpool *pool)
+{
+ if (!pool)
+ return;
+
+ for (int i = 0; i < pool->cmds.num; i++)
+ vk_cmd_destroy(pool->cmds.elem[i]);
+
+ struct vk_ctx *vk = pool->vk;
+ vk->DestroyCommandPool(vk->dev, pool->pool, PL_VK_ALLOC);
+ pl_free(pool);
+}
+
+struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag)
+{
+ struct vk_ctx *vk = pool->vk;
+
+ // Garbage collect the cmdpool first, to increase the chances of getting
+ // an already-available command buffer.
+ vk_poll_commands(vk, 0);
+
+ struct vk_cmd *cmd = NULL;
+ pl_mutex_lock(&vk->lock);
+ if (!PL_ARRAY_POP(pool->cmds, &cmd)) {
+ cmd = vk_cmd_create(pool);
+ if (!cmd) {
+ pl_mutex_unlock(&vk->lock);
+ goto error;
+ }
+ }
+
+ cmd->qindex = pool->idx_queues;
+ cmd->queue = pool->queues[cmd->qindex];
+ pl_mutex_unlock(&vk->lock);
+
+ VkCommandBufferBeginInfo binfo = {
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+ .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
+ };
+
+ VK(vk->BeginCommandBuffer(cmd->buf, &binfo));
+
+ debug_tag = PL_DEF(debug_tag, "vk_cmd");
+ PL_VK_NAME_HANDLE(COMMAND_BUFFER, cmd->buf, debug_tag);
+ PL_VK_NAME(SEMAPHORE, cmd->sync.sem, debug_tag);
+
+ cmd->sync.value++;
+ vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, cmd->sync);
+ return cmd;
+
+error:
+ // Something has to be seriously messed up if we get to this point
+ vk_cmd_destroy(cmd);
+ vk->failed = true;
+ return NULL;
+}
+
+static VkResult vk_queue_submit2(struct vk_ctx *vk, VkQueue queue,
+ const VkSubmitInfo2 *info2, VkFence fence)
+{
+ if (vk->QueueSubmit2KHR)
+ return vk->QueueSubmit2KHR(queue, 1, info2, fence);
+
+ const uint32_t num_deps = info2->waitSemaphoreInfoCount;
+ const uint32_t num_sigs = info2->signalSemaphoreInfoCount;
+ const uint32_t num_cmds = info2->commandBufferInfoCount;
+
+ void *tmp = pl_tmp(NULL);
+ VkSemaphore *deps = pl_calloc_ptr(tmp, num_deps, deps);
+ VkPipelineStageFlags *masks = pl_calloc_ptr(tmp, num_deps, masks);
+ uint64_t *depvals = pl_calloc_ptr(tmp, num_deps, depvals);
+ VkSemaphore *sigs = pl_calloc_ptr(tmp, num_sigs, sigs);
+ uint64_t *sigvals = pl_calloc_ptr(tmp, num_sigs, sigvals);
+ VkCommandBuffer *cmds = pl_calloc_ptr(tmp, num_cmds, cmds);
+
+ for (int i = 0; i < num_deps; i++) {
+ deps[i] = info2->pWaitSemaphoreInfos[i].semaphore;
+ masks[i] = info2->pWaitSemaphoreInfos[i].stageMask;
+ depvals[i] = info2->pWaitSemaphoreInfos[i].value;
+ }
+ for (int i = 0; i < num_sigs; i++) {
+ sigs[i] = info2->pSignalSemaphoreInfos[i].semaphore;
+ sigvals[i] = info2->pSignalSemaphoreInfos[i].value;
+ }
+ for (int i = 0; i < num_cmds; i++)
+ cmds[i] = info2->pCommandBufferInfos[i].commandBuffer;
+
+ const VkTimelineSemaphoreSubmitInfo tinfo = {
+ .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
+ .pNext = info2->pNext,
+ .waitSemaphoreValueCount = num_deps,
+ .pWaitSemaphoreValues = depvals,
+ .signalSemaphoreValueCount = num_sigs,
+ .pSignalSemaphoreValues = sigvals,
+ };
+
+ const VkSubmitInfo info = {
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+ .pNext = &tinfo,
+ .waitSemaphoreCount = num_deps,
+ .pWaitSemaphores = deps,
+ .pWaitDstStageMask = masks,
+ .commandBufferCount = num_cmds,
+ .pCommandBuffers = cmds,
+ .signalSemaphoreCount = num_sigs,
+ .pSignalSemaphores = sigs,
+ };
+
+ VkResult res = vk->QueueSubmit(queue, 1, &info, fence);
+ pl_free(tmp);
+ return res;
+}
+
+bool vk_cmd_submit(struct vk_cmd **pcmd)
+{
+ struct vk_cmd *cmd = *pcmd;
+ if (!cmd)
+ return true;
+
+ *pcmd = NULL;
+ struct vk_cmdpool *pool = cmd->pool;
+ struct vk_ctx *vk = pool->vk;
+
+ VK(vk->EndCommandBuffer(cmd->buf));
+
+ VkSubmitInfo2 sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
+ .waitSemaphoreInfoCount = cmd->deps.num,
+ .pWaitSemaphoreInfos = cmd->deps.elem,
+ .signalSemaphoreInfoCount = cmd->sigs.num,
+ .pSignalSemaphoreInfos = cmd->sigs.elem,
+ .commandBufferInfoCount = 1,
+ .pCommandBufferInfos = &(VkCommandBufferSubmitInfo) {
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+ .commandBuffer = cmd->buf,
+ },
+ };
+
+ if (pl_msg_test(vk->log, PL_LOG_TRACE)) {
+ PL_TRACE(vk, "Submitting command %p on queue %p (QF %d):",
+ (void *) cmd->buf, (void *) cmd->queue, pool->qf);
+ for (int n = 0; n < cmd->deps.num; n++) {
+ PL_TRACE(vk, " waits on semaphore 0x%"PRIx64" = %"PRIu64,
+ (uint64_t) cmd->deps.elem[n].semaphore, cmd->deps.elem[n].value);
+ }
+ for (int n = 0; n < cmd->sigs.num; n++) {
+ PL_TRACE(vk, " signals semaphore 0x%"PRIx64" = %"PRIu64,
+ (uint64_t) cmd->sigs.elem[n].semaphore, cmd->sigs.elem[n].value);
+ }
+ if (cmd->callbacks.num)
+ PL_TRACE(vk, " signals %d callbacks", cmd->callbacks.num);
+ }
+
+ vk->lock_queue(vk->queue_ctx, pool->qf, cmd->qindex);
+ VkResult res = vk_queue_submit2(vk, cmd->queue, &sinfo, VK_NULL_HANDLE);
+ vk->unlock_queue(vk->queue_ctx, pool->qf, cmd->qindex);
+ PL_VK_ASSERT(res, "vkQueueSubmit2");
+
+ pl_mutex_lock(&vk->lock);
+ PL_ARRAY_APPEND(vk->alloc, vk->cmds_pending, cmd);
+ pl_mutex_unlock(&vk->lock);
+ return true;
+
+error:
+ vk_cmd_reset(cmd);
+ pl_mutex_lock(&vk->lock);
+ PL_ARRAY_APPEND(pool, pool->cmds, cmd);
+ pl_mutex_unlock(&vk->lock);
+ vk->failed = true;
+ return false;
+}
+
+bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout)
+{
+ bool ret = false;
+ pl_mutex_lock(&vk->lock);
+
+ while (vk->cmds_pending.num) {
+ struct vk_cmd *cmd = vk->cmds_pending.elem[0];
+ struct vk_cmdpool *pool = cmd->pool;
+ pl_mutex_unlock(&vk->lock); // don't hold mutex while blocking
+ if (vk_cmd_poll(cmd, timeout) == VK_TIMEOUT)
+ return ret;
+ pl_mutex_lock(&vk->lock);
+ if (!vk->cmds_pending.num || vk->cmds_pending.elem[0] != cmd)
+ continue; // another thread modified this state while blocking
+
+ PL_TRACE(vk, "VkSemaphore signalled: 0x%"PRIx64" = %"PRIu64,
+ (uint64_t) cmd->sync.sem, cmd->sync.value);
+ PL_ARRAY_REMOVE_AT(vk->cmds_pending, 0); // remove before callbacks
+ vk_cmd_reset(cmd);
+ PL_ARRAY_APPEND(pool, pool->cmds, cmd);
+ ret = true;
+
+ // If we've successfully spent some time waiting for at least one
+ // command, disable the timeout. This has the dual purpose of both
+ // making sure we don't over-wait due to repeat timeout application,
+ // but also makes sure we don't block on future commands if we've
+ // already spend time waiting for one.
+ timeout = 0;
+ }
+
+ pl_mutex_unlock(&vk->lock);
+ return ret;
+}
+
+void vk_rotate_queues(struct vk_ctx *vk)
+{
+ pl_mutex_lock(&vk->lock);
+
+ // Rotate the queues to ensure good parallelism across frames
+ for (int i = 0; i < vk->pools.num; i++) {
+ struct vk_cmdpool *pool = vk->pools.elem[i];
+ pool->idx_queues = (pool->idx_queues + 1) % pool->num_queues;
+ PL_TRACE(vk, "QF %d: %d/%d", pool->qf, pool->idx_queues, pool->num_queues);
+ }
+
+ pl_mutex_unlock(&vk->lock);
+}
+
+void vk_wait_idle(struct vk_ctx *vk)
+{
+ while (vk_poll_commands(vk, UINT64_MAX)) ;
+}
diff --git a/src/vulkan/command.h b/src/vulkan/command.h
new file mode 100644
index 0000000..4c70482
--- /dev/null
+++ b/src/vulkan/command.h
@@ -0,0 +1,142 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include "common.h"
+
+// Since lots of vulkan operations need to be done lazily once the affected
+// resources are no longer in use, provide an abstraction for tracking these.
+// In practice, these are only checked and run when submitting new commands, so
+// the actual execution may be delayed by a frame.
+typedef void (*vk_cb)(void *p, void *arg);
+
+struct vk_callback {
+ vk_cb run;
+ void *priv;
+ void *arg;
+};
+
+// Associate a callback with the completion of all currently pending commands.
+// This will essentially run once the device is completely idle.
+void vk_dev_callback(struct vk_ctx *vk, vk_cb callback,
+ const void *priv, const void *arg);
+
+// Helper wrapper around command buffers that also track dependencies,
+// callbacks and synchronization primitives
+//
+// Thread-safety: Unsafe
+struct vk_cmd {
+ struct vk_cmdpool *pool; // pool it was allocated from
+ pl_vulkan_sem sync; // pending execution, tied to lifetime of device
+ VkQueue queue; // the submission queue (for recording/pending)
+ int qindex; // the index of `queue` in `pool`
+ VkCommandBuffer buf; // the command buffer itself
+ // Command dependencies and signals. Not owned by the vk_cmd.
+ PL_ARRAY(VkSemaphoreSubmitInfo) deps;
+ PL_ARRAY(VkSemaphoreSubmitInfo) sigs;
+ // "Callbacks" to fire once a command completes. These are used for
+ // multiple purposes, ranging from resource deallocation to fencing.
+ PL_ARRAY(struct vk_callback) callbacks;
+};
+
+// Associate a callback with the completion of the current command. This
+// function will be run once the command completes, or shortly thereafter.
+void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback,
+ const void *priv, const void *arg);
+
+// Associate a raw dependency for the current command. This semaphore must
+// signal by the corresponding stage before the command may execute.
+void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep);
+
+// Associate a raw signal with the current command. This semaphore will signal
+// after the given stage completes.
+void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig);
+
+// Compatibility wrappers for vkCmdPipelineBarrier2 (works with pre-1.3)
+void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info);
+
+// Synchronization scope
+struct vk_sync_scope {
+ pl_vulkan_sem sync; // semaphore of last access
+ VkQueue queue; // source queue of last access
+ VkPipelineStageFlags2 stage;// stage bitmask of last access
+ VkAccessFlags2 access; // access type bitmask
+};
+
+// Synchronization primitive
+struct vk_sem {
+ struct vk_sync_scope read, write;
+};
+
+// Updates the `vk_sem` state for a given access. If `is_trans` is set, this
+// access is treated as a write (since it alters the resource's state).
+//
+// Returns a struct describing the previous access to a resource. A pipeline
+// barrier is only required if the previous access scope is nonzero.
+struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem,
+ VkPipelineStageFlags2 stage,
+ VkAccessFlags2 access, bool is_trans);
+
+// Command pool / queue family hybrid abstraction
+struct vk_cmdpool {
+ struct vk_ctx *vk;
+ VkQueueFamilyProperties props;
+ int qf; // queue family index
+ VkCommandPool pool;
+ VkQueue *queues;
+ int num_queues;
+ int idx_queues;
+ // Command buffers associated with this queue. These are available for
+ // re-recording
+ PL_ARRAY(struct vk_cmd *) cmds;
+};
+
+// Set up a vk_cmdpool corresponding to a queue family. `qnum` may be less than
+// `props.queueCount`, to restrict the number of queues in this queue family.
+struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum,
+ VkQueueFamilyProperties props);
+
+void vk_cmdpool_destroy(struct vk_cmdpool *pool);
+
+// Fetch a command buffer from a command pool and begin recording to it.
+// Returns NULL on failure.
+struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag);
+
+// Finish recording a command buffer and submit it for execution. This function
+// takes over ownership of **cmd, and sets *cmd to NULL in doing so.
+bool vk_cmd_submit(struct vk_cmd **cmd);
+
+// Block until some commands complete executing. This is the only function that
+// actually processes the callbacks. Will wait at most `timeout` nanoseconds
+// for the completion of any command. The timeout may also be passed as 0, in
+// which case this function will not block, but only poll for completed
+// commands. Returns whether any forward progress was made.
+//
+// This does *not* flush any queued commands, forgetting to do so may result
+// in infinite loops if waiting for the completion of callbacks that were
+// never flushed!
+bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout);
+
+// Rotate through queues in each command pool. Call this once per frame, after
+// submitting all of the command buffers for that frame. Calling this more
+// often than that is possible but bad for performance.
+void vk_rotate_queues(struct vk_ctx *vk);
+
+// Wait until all commands are complete, i.e. the device is idle. This is
+// basically equivalent to calling `vk_poll_commands` with a timeout of
+// UINT64_MAX until it returns `false`.
+void vk_wait_idle(struct vk_ctx *vk);
diff --git a/src/vulkan/common.h b/src/vulkan/common.h
new file mode 100644
index 0000000..31b309e
--- /dev/null
+++ b/src/vulkan/common.h
@@ -0,0 +1,234 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#define VK_NO_PROTOTYPES
+#define VK_ENABLE_BETA_EXTENSIONS // for VK_KHR_portability_subset
+#define VK_USE_PLATFORM_METAL_EXT
+
+#include "../common.h"
+#include "../log.h"
+#include "../pl_thread.h"
+
+#include <libplacebo/vulkan.h>
+
+#ifdef PL_HAVE_WIN32
+#include <windows.h>
+#include <vulkan/vulkan_win32.h>
+#endif
+
+// Vulkan allows the optional use of a custom allocator. We don't need one but
+// mark this parameter with a better name in case we ever decide to change this
+// in the future. (And to make the code more readable)
+#define PL_VK_ALLOC NULL
+
+// Type of a vulkan function that needs to be loaded
+#define PL_VK_FUN(name) PFN_vk##name name
+
+// Load a vulkan instance-level extension function directly (on the stack)
+#define PL_VK_LOAD_FUN(inst, name, get_addr) \
+ PL_VK_FUN(name) = (PFN_vk##name) get_addr(inst, "vk" #name);
+
+#ifndef VK_VENDOR_ID_NVIDIA
+#define VK_VENDOR_ID_NVIDIA 0x10DE
+#endif
+
+// Shared struct used to hold vulkan context information
+struct vk_ctx {
+ pl_mutex lock;
+ pl_vulkan vulkan;
+ void *alloc; // host allocations bound to the lifetime of this vk_ctx
+ struct vk_malloc *ma; // VRAM malloc layer
+ pl_vk_inst internal_instance;
+ pl_log log;
+ VkInstance inst;
+ VkPhysicalDevice physd;
+ VkPhysicalDeviceProperties props;
+ VkPhysicalDeviceFeatures2 features;
+ uint32_t api_ver; // device API version
+ VkDevice dev;
+ bool imported; // device was not created by us
+
+ // Generic error flag for catching "failed" devices
+ bool failed;
+
+ // Enabled extensions
+ PL_ARRAY(const char *) exts;
+
+ // Command pools (one per queue family)
+ PL_ARRAY(struct vk_cmdpool *) pools;
+
+ // Pointers into `pools` (always set)
+ struct vk_cmdpool *pool_graphics;
+ struct vk_cmdpool *pool_compute;
+ struct vk_cmdpool *pool_transfer;
+
+ // Queue locking functions
+ PL_ARRAY(PL_ARRAY(pl_mutex)) queue_locks;
+ void (*lock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx);
+ void (*unlock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx);
+ void *queue_ctx;
+
+ // Pending commands. These are shared for the entire mpvk_ctx to ensure
+ // submission and callbacks are FIFO
+ PL_ARRAY(struct vk_cmd *) cmds_pending; // submitted but not completed
+
+ // Pending callbacks that still need to be drained before processing
+ // callbacks for the next command (in case commands are recursively being
+ // polled from another callback)
+ const struct vk_callback *pending_callbacks;
+ int num_pending_callbacks;
+
+ // Instance-level function pointers
+ PL_VK_FUN(CreateDevice);
+ PL_VK_FUN(EnumerateDeviceExtensionProperties);
+ PL_VK_FUN(GetDeviceProcAddr);
+ PL_VK_FUN(GetInstanceProcAddr);
+ PL_VK_FUN(GetPhysicalDeviceExternalBufferProperties);
+ PL_VK_FUN(GetPhysicalDeviceExternalSemaphoreProperties);
+ PL_VK_FUN(GetPhysicalDeviceFeatures2KHR);
+ PL_VK_FUN(GetPhysicalDeviceFormatProperties);
+ PL_VK_FUN(GetPhysicalDeviceFormatProperties2KHR);
+ PL_VK_FUN(GetPhysicalDeviceImageFormatProperties2KHR);
+ PL_VK_FUN(GetPhysicalDeviceMemoryProperties);
+ PL_VK_FUN(GetPhysicalDeviceProperties);
+ PL_VK_FUN(GetPhysicalDeviceProperties2);
+ PL_VK_FUN(GetPhysicalDeviceQueueFamilyProperties);
+ PL_VK_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR);
+ PL_VK_FUN(GetPhysicalDeviceSurfaceFormatsKHR);
+ PL_VK_FUN(GetPhysicalDeviceSurfacePresentModesKHR);
+ PL_VK_FUN(GetPhysicalDeviceSurfaceSupportKHR);
+
+ // Device-level function pointers
+ PL_VK_FUN(AcquireNextImageKHR);
+ PL_VK_FUN(AllocateCommandBuffers);
+ PL_VK_FUN(AllocateDescriptorSets);
+ PL_VK_FUN(AllocateMemory);
+ PL_VK_FUN(BeginCommandBuffer);
+ PL_VK_FUN(BindBufferMemory);
+ PL_VK_FUN(BindImageMemory);
+ PL_VK_FUN(CmdBeginDebugUtilsLabelEXT);
+ PL_VK_FUN(CmdBeginRenderPass);
+ PL_VK_FUN(CmdBindDescriptorSets);
+ PL_VK_FUN(CmdBindIndexBuffer);
+ PL_VK_FUN(CmdBindPipeline);
+ PL_VK_FUN(CmdBindVertexBuffers);
+ PL_VK_FUN(CmdBlitImage);
+ PL_VK_FUN(CmdClearColorImage);
+ PL_VK_FUN(CmdCopyBuffer);
+ PL_VK_FUN(CmdCopyBufferToImage);
+ PL_VK_FUN(CmdCopyImage);
+ PL_VK_FUN(CmdCopyImageToBuffer);
+ PL_VK_FUN(CmdDispatch);
+ PL_VK_FUN(CmdDraw);
+ PL_VK_FUN(CmdDrawIndexed);
+ PL_VK_FUN(CmdEndDebugUtilsLabelEXT);
+ PL_VK_FUN(CmdEndRenderPass);
+ PL_VK_FUN(CmdPipelineBarrier);
+ PL_VK_FUN(CmdPipelineBarrier2KHR);
+ PL_VK_FUN(CmdPushConstants);
+ PL_VK_FUN(CmdPushDescriptorSetKHR);
+ PL_VK_FUN(CmdResetQueryPool);
+ PL_VK_FUN(CmdSetScissor);
+ PL_VK_FUN(CmdSetViewport);
+ PL_VK_FUN(CmdUpdateBuffer);
+ PL_VK_FUN(CmdWriteTimestamp);
+ PL_VK_FUN(CreateBuffer);
+ PL_VK_FUN(CreateBufferView);
+ PL_VK_FUN(CreateCommandPool);
+ PL_VK_FUN(CreateComputePipelines);
+ PL_VK_FUN(CreateDebugReportCallbackEXT);
+ PL_VK_FUN(CreateDescriptorPool);
+ PL_VK_FUN(CreateDescriptorSetLayout);
+ PL_VK_FUN(CreateFence);
+ PL_VK_FUN(CreateFramebuffer);
+ PL_VK_FUN(CreateGraphicsPipelines);
+ PL_VK_FUN(CreateImage);
+ PL_VK_FUN(CreateImageView);
+ PL_VK_FUN(CreatePipelineCache);
+ PL_VK_FUN(CreatePipelineLayout);
+ PL_VK_FUN(CreateQueryPool);
+ PL_VK_FUN(CreateRenderPass);
+ PL_VK_FUN(CreateSampler);
+ PL_VK_FUN(CreateSemaphore);
+ PL_VK_FUN(CreateShaderModule);
+ PL_VK_FUN(CreateSwapchainKHR);
+ PL_VK_FUN(DestroyBuffer);
+ PL_VK_FUN(DestroyBufferView);
+ PL_VK_FUN(DestroyCommandPool);
+ PL_VK_FUN(DestroyDebugReportCallbackEXT);
+ PL_VK_FUN(DestroyDescriptorPool);
+ PL_VK_FUN(DestroyDescriptorSetLayout);
+ PL_VK_FUN(DestroyDevice);
+ PL_VK_FUN(DestroyFence);
+ PL_VK_FUN(DestroyFramebuffer);
+ PL_VK_FUN(DestroyImage);
+ PL_VK_FUN(DestroyImageView);
+ PL_VK_FUN(DestroyInstance);
+ PL_VK_FUN(DestroyPipeline);
+ PL_VK_FUN(DestroyPipelineCache);
+ PL_VK_FUN(DestroyPipelineLayout);
+ PL_VK_FUN(DestroyQueryPool);
+ PL_VK_FUN(DestroyRenderPass);
+ PL_VK_FUN(DestroySampler);
+ PL_VK_FUN(DestroySemaphore);
+ PL_VK_FUN(DestroyShaderModule);
+ PL_VK_FUN(DestroySwapchainKHR);
+ PL_VK_FUN(DeviceWaitIdle);
+ PL_VK_FUN(EndCommandBuffer);
+ PL_VK_FUN(FlushMappedMemoryRanges);
+ PL_VK_FUN(FreeCommandBuffers);
+ PL_VK_FUN(FreeMemory);
+ PL_VK_FUN(GetBufferMemoryRequirements);
+ PL_VK_FUN(GetDeviceQueue);
+ PL_VK_FUN(GetImageDrmFormatModifierPropertiesEXT);
+ PL_VK_FUN(GetImageMemoryRequirements2);
+ PL_VK_FUN(GetImageSubresourceLayout);
+ PL_VK_FUN(GetMemoryFdKHR);
+ PL_VK_FUN(GetMemoryFdPropertiesKHR);
+ PL_VK_FUN(GetMemoryHostPointerPropertiesEXT);
+ PL_VK_FUN(GetPipelineCacheData);
+ PL_VK_FUN(GetQueryPoolResults);
+ PL_VK_FUN(GetSemaphoreFdKHR);
+ PL_VK_FUN(GetSwapchainImagesKHR);
+ PL_VK_FUN(InvalidateMappedMemoryRanges);
+ PL_VK_FUN(MapMemory);
+ PL_VK_FUN(QueuePresentKHR);
+ PL_VK_FUN(QueueSubmit);
+ PL_VK_FUN(QueueSubmit2KHR);
+ PL_VK_FUN(QueueWaitIdle);
+ PL_VK_FUN(ResetFences);
+ PL_VK_FUN(ResetQueryPool);
+ PL_VK_FUN(SetDebugUtilsObjectNameEXT);
+ PL_VK_FUN(SetHdrMetadataEXT);
+ PL_VK_FUN(UpdateDescriptorSets);
+ PL_VK_FUN(WaitForFences);
+ PL_VK_FUN(WaitSemaphores);
+
+#ifdef PL_HAVE_WIN32
+ PL_VK_FUN(GetMemoryWin32HandleKHR);
+ PL_VK_FUN(GetSemaphoreWin32HandleKHR);
+#endif
+
+#ifdef VK_EXT_metal_objects
+ PL_VK_FUN(ExportMetalObjectsEXT);
+#endif
+#ifdef VK_EXT_full_screen_exclusive
+ PL_VK_FUN(AcquireFullScreenExclusiveModeEXT);
+#endif
+};
diff --git a/src/vulkan/context.c b/src/vulkan/context.c
new file mode 100644
index 0000000..ad8a859
--- /dev/null
+++ b/src/vulkan/context.c
@@ -0,0 +1,1704 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "command.h"
+#include "utils.h"
+#include "gpu.h"
+
+#ifdef PL_HAVE_VK_PROC_ADDR
+VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetInstanceProcAddr(
+ VkInstance instance,
+ const char* pName);
+#endif
+
+const struct pl_vk_inst_params pl_vk_inst_default_params = {0};
+
+struct vk_fun {
+ const char *name;
+ size_t offset;
+ bool device_level;
+};
+
+struct vk_ext {
+ const char *name;
+ const struct vk_fun *funs;
+};
+
+#define PL_VK_INST_FUN(N) \
+ { .name = "vk" #N, \
+ .offset = offsetof(struct vk_ctx, N), \
+ }
+
+#define PL_VK_DEV_FUN(N) \
+ { .name = "vk" #N, \
+ .offset = offsetof(struct vk_ctx, N), \
+ .device_level = true, \
+ }
+
+// Table of optional vulkan instance extensions
+static const char *vk_instance_extensions[] = {
+ VK_KHR_SURFACE_EXTENSION_NAME,
+ VK_EXT_SWAPCHAIN_COLOR_SPACE_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME,
+ VK_KHR_GET_SURFACE_CAPABILITIES_2_EXTENSION_NAME,
+};
+
+// List of mandatory instance-level function pointers, including functions
+// associated with mandatory instance extensions
+static const struct vk_fun vk_inst_funs[] = {
+ PL_VK_INST_FUN(CreateDevice),
+ PL_VK_INST_FUN(EnumerateDeviceExtensionProperties),
+ PL_VK_INST_FUN(GetDeviceProcAddr),
+ PL_VK_INST_FUN(GetPhysicalDeviceExternalBufferProperties),
+ PL_VK_INST_FUN(GetPhysicalDeviceExternalSemaphoreProperties),
+ PL_VK_INST_FUN(GetPhysicalDeviceFeatures2KHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties),
+ PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties2KHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceImageFormatProperties2KHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceMemoryProperties),
+ PL_VK_INST_FUN(GetPhysicalDeviceProperties),
+ PL_VK_INST_FUN(GetPhysicalDeviceProperties2),
+ PL_VK_INST_FUN(GetPhysicalDeviceQueueFamilyProperties),
+
+ // These are not actually mandatory, but they're universal enough that we
+ // just load them unconditionally (in lieu of not having proper support for
+ // loading arbitrary instance extensions). Their use is generally guarded
+ // behind various VkSurfaceKHR values already being provided by the API
+ // user (implying this extension is loaded).
+ PL_VK_INST_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceSurfaceFormatsKHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceSurfacePresentModesKHR),
+ PL_VK_INST_FUN(GetPhysicalDeviceSurfaceSupportKHR),
+};
+
+// Table of vulkan device extensions and functions they load, including
+// functions exported by dependent instance-level extensions
+static const struct vk_ext vk_device_extensions[] = {
+ {
+ .name = VK_KHR_SWAPCHAIN_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(AcquireNextImageKHR),
+ PL_VK_DEV_FUN(CreateSwapchainKHR),
+ PL_VK_DEV_FUN(DestroySwapchainKHR),
+ PL_VK_DEV_FUN(GetSwapchainImagesKHR),
+ PL_VK_DEV_FUN(QueuePresentKHR),
+ {0}
+ },
+ }, {
+ .name = VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(CmdPushDescriptorSetKHR),
+ {0}
+ },
+ }, {
+ .name = VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetMemoryFdKHR),
+ {0}
+ },
+ }, {
+ .name = VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetMemoryFdPropertiesKHR),
+ {0}
+ },
+#ifdef PL_HAVE_WIN32
+ }, {
+ .name = VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetMemoryWin32HandleKHR),
+ {0}
+ },
+#endif
+ }, {
+ .name = VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetMemoryHostPointerPropertiesEXT),
+ {0}
+ },
+ }, {
+ .name = VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetSemaphoreFdKHR),
+ {0}
+ },
+#ifdef PL_HAVE_WIN32
+ }, {
+ .name = VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetSemaphoreWin32HandleKHR),
+ {0}
+ },
+#endif
+ }, {
+ .name = VK_EXT_PCI_BUS_INFO_EXTENSION_NAME,
+ }, {
+ .name = VK_EXT_HDR_METADATA_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(SetHdrMetadataEXT),
+ {0}
+ },
+ }, {
+ .name = VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(GetImageDrmFormatModifierPropertiesEXT),
+ {0}
+ },
+#ifdef VK_KHR_portability_subset
+ }, {
+ .name = VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME,
+#endif
+#ifdef VK_EXT_metal_objects
+ }, {
+ .name = VK_EXT_METAL_OBJECTS_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(ExportMetalObjectsEXT),
+ {0}
+ },
+#endif
+#ifdef VK_EXT_full_screen_exclusive
+ }, {
+ .name = VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(AcquireFullScreenExclusiveModeEXT),
+ {0}
+ },
+#endif
+ }, {
+ .name = VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME,
+ .funs = (const struct vk_fun[]) {
+ PL_VK_DEV_FUN(CmdPipelineBarrier2KHR),
+ PL_VK_DEV_FUN(QueueSubmit2KHR),
+ {0}
+ },
+ },
+};
+
+// Make sure to keep this in sync with the above!
+const char * const pl_vulkan_recommended_extensions[] = {
+ VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
+ VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME,
+ VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
+#ifdef PL_HAVE_WIN32
+ VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
+ VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
+#endif
+ VK_EXT_PCI_BUS_INFO_EXTENSION_NAME,
+ VK_EXT_HDR_METADATA_EXTENSION_NAME,
+ VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME,
+#ifdef VK_KHR_portability_subset
+ VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME,
+#endif
+#ifdef VK_EXT_metal_objects
+ VK_EXT_METAL_OBJECTS_EXTENSION_NAME,
+#endif
+#ifdef VK_EXT_full_screen_exclusive
+ VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME,
+#endif
+ VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME,
+};
+
+const int pl_vulkan_num_recommended_extensions =
+ PL_ARRAY_SIZE(pl_vulkan_recommended_extensions);
+
+// +1 because VK_KHR_swapchain is not automatically pulled in
+static_assert(PL_ARRAY_SIZE(pl_vulkan_recommended_extensions) + 1 ==
+ PL_ARRAY_SIZE(vk_device_extensions),
+ "pl_vulkan_recommended_extensions out of sync with "
+ "vk_device_extensions?");
+
+// Recommended features; keep in sync with libavutil vulkan hwcontext
+static const VkPhysicalDeviceVulkan13Features recommended_vk13 = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES,
+ .computeFullSubgroups = true,
+ .maintenance4 = true,
+ .shaderZeroInitializeWorkgroupMemory = true,
+ .synchronization2 = true,
+};
+
+static const VkPhysicalDeviceVulkan12Features recommended_vk12 = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
+ .pNext = (void *) &recommended_vk13,
+ .bufferDeviceAddress = true,
+ .storagePushConstant8 = true,
+ .shaderInt8 = true,
+ .shaderFloat16 = true,
+ .shaderSharedInt64Atomics = true,
+ .storageBuffer8BitAccess = true,
+ .uniformAndStorageBuffer8BitAccess = true,
+ .vulkanMemoryModel = true,
+ .vulkanMemoryModelDeviceScope = true,
+};
+
+static const VkPhysicalDeviceVulkan11Features recommended_vk11 = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES,
+ .pNext = (void *) &recommended_vk12,
+ .samplerYcbcrConversion = true,
+ .storagePushConstant16 = true,
+};
+
+const VkPhysicalDeviceFeatures2 pl_vulkan_recommended_features = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
+ .pNext = (void *) &recommended_vk11,
+ .features = {
+ .shaderImageGatherExtended = true,
+ .shaderStorageImageReadWithoutFormat = true,
+ .shaderStorageImageWriteWithoutFormat = true,
+
+ // Needed for GPU-assisted validation, but not harmful to enable
+ .fragmentStoresAndAtomics = true,
+ .vertexPipelineStoresAndAtomics = true,
+ .shaderInt64 = true,
+ }
+};
+
+// Required features
+static const VkPhysicalDeviceVulkan12Features required_vk12 = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
+ .hostQueryReset = true,
+ .timelineSemaphore = true,
+};
+
+static const VkPhysicalDeviceVulkan11Features required_vk11 = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES,
+ .pNext = (void *) &required_vk12,
+};
+
+const VkPhysicalDeviceFeatures2 pl_vulkan_required_features = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
+ .pNext = (void *) &required_vk11,
+};
+
+static bool check_required_features(struct vk_ctx *vk)
+{
+ #define CHECK_FEATURE(maj, min, feat) do { \
+ const VkPhysicalDeviceVulkan##maj##min##Features *f; \
+ f = vk_find_struct(&vk->features, \
+ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_##maj##_##min##_FEATURES); \
+ if (!f || !f->feat) { \
+ PL_ERR(vk, "Missing device feature: " #feat); \
+ return false; \
+ } \
+ } while (0)
+
+ CHECK_FEATURE(1, 2, hostQueryReset);
+ CHECK_FEATURE(1, 2, timelineSemaphore);
+
+ #undef CHECK_FEATURE
+ return true;
+}
+
+
+// List of mandatory device-level functions
+//
+// Note: Also includes VK_EXT_debug_utils functions, even though they aren't
+// mandatory, simply because we load that extension in a special way.
+static const struct vk_fun vk_dev_funs[] = {
+ PL_VK_DEV_FUN(AllocateCommandBuffers),
+ PL_VK_DEV_FUN(AllocateDescriptorSets),
+ PL_VK_DEV_FUN(AllocateMemory),
+ PL_VK_DEV_FUN(BeginCommandBuffer),
+ PL_VK_DEV_FUN(BindBufferMemory),
+ PL_VK_DEV_FUN(BindImageMemory),
+ PL_VK_DEV_FUN(CmdBeginDebugUtilsLabelEXT),
+ PL_VK_DEV_FUN(CmdBeginRenderPass),
+ PL_VK_DEV_FUN(CmdBindDescriptorSets),
+ PL_VK_DEV_FUN(CmdBindIndexBuffer),
+ PL_VK_DEV_FUN(CmdBindPipeline),
+ PL_VK_DEV_FUN(CmdBindVertexBuffers),
+ PL_VK_DEV_FUN(CmdBlitImage),
+ PL_VK_DEV_FUN(CmdClearColorImage),
+ PL_VK_DEV_FUN(CmdCopyBuffer),
+ PL_VK_DEV_FUN(CmdCopyBufferToImage),
+ PL_VK_DEV_FUN(CmdCopyImage),
+ PL_VK_DEV_FUN(CmdCopyImageToBuffer),
+ PL_VK_DEV_FUN(CmdDispatch),
+ PL_VK_DEV_FUN(CmdDraw),
+ PL_VK_DEV_FUN(CmdDrawIndexed),
+ PL_VK_DEV_FUN(CmdEndDebugUtilsLabelEXT),
+ PL_VK_DEV_FUN(CmdEndRenderPass),
+ PL_VK_DEV_FUN(CmdPipelineBarrier),
+ PL_VK_DEV_FUN(CmdPushConstants),
+ PL_VK_DEV_FUN(CmdResetQueryPool),
+ PL_VK_DEV_FUN(CmdSetScissor),
+ PL_VK_DEV_FUN(CmdSetViewport),
+ PL_VK_DEV_FUN(CmdUpdateBuffer),
+ PL_VK_DEV_FUN(CmdWriteTimestamp),
+ PL_VK_DEV_FUN(CreateBuffer),
+ PL_VK_DEV_FUN(CreateBufferView),
+ PL_VK_DEV_FUN(CreateCommandPool),
+ PL_VK_DEV_FUN(CreateComputePipelines),
+ PL_VK_DEV_FUN(CreateDescriptorPool),
+ PL_VK_DEV_FUN(CreateDescriptorSetLayout),
+ PL_VK_DEV_FUN(CreateFence),
+ PL_VK_DEV_FUN(CreateFramebuffer),
+ PL_VK_DEV_FUN(CreateGraphicsPipelines),
+ PL_VK_DEV_FUN(CreateImage),
+ PL_VK_DEV_FUN(CreateImageView),
+ PL_VK_DEV_FUN(CreatePipelineCache),
+ PL_VK_DEV_FUN(CreatePipelineLayout),
+ PL_VK_DEV_FUN(CreateQueryPool),
+ PL_VK_DEV_FUN(CreateRenderPass),
+ PL_VK_DEV_FUN(CreateSampler),
+ PL_VK_DEV_FUN(CreateSemaphore),
+ PL_VK_DEV_FUN(CreateShaderModule),
+ PL_VK_DEV_FUN(DestroyBuffer),
+ PL_VK_DEV_FUN(DestroyBufferView),
+ PL_VK_DEV_FUN(DestroyCommandPool),
+ PL_VK_DEV_FUN(DestroyDescriptorPool),
+ PL_VK_DEV_FUN(DestroyDescriptorSetLayout),
+ PL_VK_DEV_FUN(DestroyDevice),
+ PL_VK_DEV_FUN(DestroyFence),
+ PL_VK_DEV_FUN(DestroyFramebuffer),
+ PL_VK_DEV_FUN(DestroyImage),
+ PL_VK_DEV_FUN(DestroyImageView),
+ PL_VK_DEV_FUN(DestroyInstance),
+ PL_VK_DEV_FUN(DestroyPipeline),
+ PL_VK_DEV_FUN(DestroyPipelineCache),
+ PL_VK_DEV_FUN(DestroyPipelineLayout),
+ PL_VK_DEV_FUN(DestroyQueryPool),
+ PL_VK_DEV_FUN(DestroyRenderPass),
+ PL_VK_DEV_FUN(DestroySampler),
+ PL_VK_DEV_FUN(DestroySemaphore),
+ PL_VK_DEV_FUN(DestroyShaderModule),
+ PL_VK_DEV_FUN(DeviceWaitIdle),
+ PL_VK_DEV_FUN(EndCommandBuffer),
+ PL_VK_DEV_FUN(FlushMappedMemoryRanges),
+ PL_VK_DEV_FUN(FreeCommandBuffers),
+ PL_VK_DEV_FUN(FreeMemory),
+ PL_VK_DEV_FUN(GetBufferMemoryRequirements),
+ PL_VK_DEV_FUN(GetDeviceQueue),
+ PL_VK_DEV_FUN(GetImageMemoryRequirements2),
+ PL_VK_DEV_FUN(GetImageSubresourceLayout),
+ PL_VK_DEV_FUN(GetPipelineCacheData),
+ PL_VK_DEV_FUN(GetQueryPoolResults),
+ PL_VK_DEV_FUN(InvalidateMappedMemoryRanges),
+ PL_VK_DEV_FUN(MapMemory),
+ PL_VK_DEV_FUN(QueueSubmit),
+ PL_VK_DEV_FUN(QueueWaitIdle),
+ PL_VK_DEV_FUN(ResetFences),
+ PL_VK_DEV_FUN(ResetQueryPool),
+ PL_VK_DEV_FUN(SetDebugUtilsObjectNameEXT),
+ PL_VK_DEV_FUN(UpdateDescriptorSets),
+ PL_VK_DEV_FUN(WaitForFences),
+ PL_VK_DEV_FUN(WaitSemaphores),
+};
+
+static void load_vk_fun(struct vk_ctx *vk, const struct vk_fun *fun)
+{
+ PFN_vkVoidFunction *pfn = (void *) ((uintptr_t) vk + (ptrdiff_t) fun->offset);
+
+ if (fun->device_level) {
+ *pfn = vk->GetDeviceProcAddr(vk->dev, fun->name);
+ } else {
+ *pfn = vk->GetInstanceProcAddr(vk->inst, fun->name);
+ };
+
+ if (!*pfn) {
+ // Some functions get their extension suffix stripped when promoted
+ // to core. As a very simple work-around to this, try loading the
+ // function a second time with the reserved suffixes stripped.
+ static const char *ext_suffixes[] = { "KHR", "EXT" };
+ pl_str fun_name = pl_str0(fun->name);
+ char buf[64];
+
+ for (int i = 0; i < PL_ARRAY_SIZE(ext_suffixes); i++) {
+ if (!pl_str_eatend0(&fun_name, ext_suffixes[i]))
+ continue;
+
+ pl_assert(sizeof(buf) > fun_name.len);
+ snprintf(buf, sizeof(buf), "%.*s", PL_STR_FMT(fun_name));
+ if (fun->device_level) {
+ *pfn = vk->GetDeviceProcAddr(vk->dev, buf);
+ } else {
+ *pfn = vk->GetInstanceProcAddr(vk->inst, buf);
+ }
+ return;
+ }
+ }
+}
+
+// Private struct for pl_vk_inst
+struct priv {
+ VkDebugUtilsMessengerEXT debug_utils_cb;
+};
+
+void pl_vk_inst_destroy(pl_vk_inst *inst_ptr)
+{
+ pl_vk_inst inst = *inst_ptr;
+ if (!inst)
+ return;
+
+ struct priv *p = PL_PRIV(inst);
+ if (p->debug_utils_cb) {
+ PL_VK_LOAD_FUN(inst->instance, DestroyDebugUtilsMessengerEXT, inst->get_proc_addr);
+ DestroyDebugUtilsMessengerEXT(inst->instance, p->debug_utils_cb, PL_VK_ALLOC);
+ }
+
+ PL_VK_LOAD_FUN(inst->instance, DestroyInstance, inst->get_proc_addr);
+ DestroyInstance(inst->instance, PL_VK_ALLOC);
+ pl_free_ptr((void **) inst_ptr);
+}
+
+static VkBool32 VKAPI_PTR vk_dbg_utils_cb(VkDebugUtilsMessageSeverityFlagBitsEXT sev,
+ VkDebugUtilsMessageTypeFlagsEXT msgType,
+ const VkDebugUtilsMessengerCallbackDataEXT *data,
+ void *priv)
+{
+ pl_log log = priv;
+
+ // Ignore errors for messages that we consider false positives
+ switch (data->messageIdNumber) {
+ case 0x7cd0911d: // VUID-VkSwapchainCreateInfoKHR-imageExtent-01274
+ case 0x8928392f: // UNASSIGNED-BestPractices-NonSuccess-Result
+ case 0xdc18ad6b: // UNASSIGNED-BestPractices-vkAllocateMemory-small-allocation
+ case 0xb3d4346b: // UNASSIGNED-BestPractices-vkBindMemory-small-dedicated-allocation
+ case 0x6cfe18a5: // UNASSIGNED-BestPractices-SemaphoreCount
+ case 0x48a09f6c: // UNASSIGNED-BestPractices-pipeline-stage-flags
+ // profile chain expectations
+ case 0x30f4ac70: // VUID-VkImageCreateInfo-pNext-06811
+ return false;
+
+ case 0x5f379b89: // UNASSIGNED-BestPractices-Error-Result
+ if (strstr(data->pMessage, "VK_ERROR_FORMAT_NOT_SUPPORTED"))
+ return false;
+ break;
+
+ case 0xf6a37cfa: // VUID-vkGetImageSubresourceLayout-format-04461
+ // Work around https://github.com/KhronosGroup/Vulkan-Docs/issues/2109
+ return false;
+ }
+
+ enum pl_log_level lev;
+ switch (sev) {
+ case VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT: lev = PL_LOG_ERR; break;
+ case VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT: lev = PL_LOG_WARN; break;
+ case VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT: lev = PL_LOG_DEBUG; break;
+ case VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT: lev = PL_LOG_TRACE; break;
+ default: lev = PL_LOG_INFO; break;
+ }
+
+ pl_msg(log, lev, "vk %s", data->pMessage);
+
+ for (int i = 0; i < data->queueLabelCount; i++)
+ pl_msg(log, lev, " during %s", data->pQueueLabels[i].pLabelName);
+ for (int i = 0; i < data->cmdBufLabelCount; i++)
+ pl_msg(log, lev, " inside %s", data->pCmdBufLabels[i].pLabelName);
+ for (int i = 0; i < data->objectCount; i++) {
+ const VkDebugUtilsObjectNameInfoEXT *obj = &data->pObjects[i];
+ pl_msg(log, lev, " using %s: %s (0x%llx)",
+ vk_obj_type(obj->objectType),
+ obj->pObjectName ? obj->pObjectName : "anon",
+ (unsigned long long) obj->objectHandle);
+ }
+
+ // The return value of this function determines whether the call will
+ // be explicitly aborted (to prevent GPU errors) or not. In this case,
+ // we generally want this to be on for the validation errors, but nothing
+ // else (e.g. performance warnings)
+ bool is_error = (sev & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) &&
+ (msgType & VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT);
+
+ if (is_error) {
+ pl_log_stack_trace(log, lev);
+ pl_debug_abort();
+ return true;
+ }
+
+ return false;
+}
+
+static PFN_vkGetInstanceProcAddr get_proc_addr_fallback(pl_log log,
+ PFN_vkGetInstanceProcAddr get_proc_addr)
+{
+ if (get_proc_addr)
+ return get_proc_addr;
+
+#ifdef PL_HAVE_VK_PROC_ADDR
+ return vkGetInstanceProcAddr;
+#else
+ pl_fatal(log, "No `vkGetInstanceProcAddr` function provided, and "
+ "libplacebo built without linking against this function!");
+ return NULL;
+#endif
+}
+
+#define PRINTF_VER(ver) \
+ (int) VK_API_VERSION_MAJOR(ver), \
+ (int) VK_API_VERSION_MINOR(ver), \
+ (int) VK_API_VERSION_PATCH(ver)
+
+pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params)
+{
+ void *tmp = pl_tmp(NULL);
+ params = PL_DEF(params, &pl_vk_inst_default_params);
+ VkInstance inst = NULL;
+ pl_clock_t start;
+
+ PL_ARRAY(const char *) exts = {0};
+
+ PFN_vkGetInstanceProcAddr get_addr;
+ if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr)))
+ goto error;
+
+ // Query instance version support
+ uint32_t api_ver = VK_API_VERSION_1_0;
+ PL_VK_LOAD_FUN(NULL, EnumerateInstanceVersion, get_addr);
+ if (EnumerateInstanceVersion && EnumerateInstanceVersion(&api_ver) != VK_SUCCESS)
+ goto error;
+
+ pl_debug(log, "Available instance version: %d.%d.%d", PRINTF_VER(api_ver));
+
+ if (params->max_api_version) {
+ api_ver = PL_MIN(api_ver, params->max_api_version);
+ pl_info(log, "Restricting API version to %d.%d.%d... new version %d.%d.%d",
+ PRINTF_VER(params->max_api_version), PRINTF_VER(api_ver));
+ }
+
+ if (api_ver < PL_VK_MIN_VERSION) {
+ pl_fatal(log, "Instance API version %d.%d.%d is lower than the minimum "
+ "required version of %d.%d.%d, cannot proceed!",
+ PRINTF_VER(api_ver), PRINTF_VER(PL_VK_MIN_VERSION));
+ goto error;
+ }
+
+ VkInstanceCreateInfo info = {
+ .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
+ .pApplicationInfo = &(VkApplicationInfo) {
+ .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
+ .apiVersion = api_ver,
+ },
+ };
+
+ // Enumerate all supported layers
+ start = pl_clock_now();
+ PL_VK_LOAD_FUN(NULL, EnumerateInstanceLayerProperties, get_addr);
+ uint32_t num_layers_avail = 0;
+ EnumerateInstanceLayerProperties(&num_layers_avail, NULL);
+ VkLayerProperties *layers_avail = pl_calloc_ptr(tmp, num_layers_avail, layers_avail);
+ EnumerateInstanceLayerProperties(&num_layers_avail, layers_avail);
+ pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance layers");
+
+ pl_debug(log, "Available layers:");
+ for (int i = 0; i < num_layers_avail; i++) {
+ pl_debug(log, " %s (v%d.%d.%d)", layers_avail[i].layerName,
+ PRINTF_VER(layers_avail[i].specVersion));
+ }
+
+ PL_ARRAY(const char *) layers = {0};
+
+ // Sorted by priority
+ static const char *debug_layers[] = {
+ "VK_LAYER_KHRONOS_validation",
+ "VK_LAYER_LUNARG_standard_validation",
+ };
+
+ // This layer has to be initialized first, otherwise all sorts of weirdness
+ // happens (random segfaults, yum)
+ bool debug = params->debug;
+ uint32_t debug_layer = 0; // layer idx of debug layer
+ uint32_t debug_layer_version = 0;
+ if (debug) {
+ for (int i = 0; i < PL_ARRAY_SIZE(debug_layers); i++) {
+ for (int n = 0; n < num_layers_avail; n++) {
+ if (strcmp(debug_layers[i], layers_avail[n].layerName) != 0)
+ continue;
+
+ debug_layer = n;
+ debug_layer_version = layers_avail[n].specVersion;
+ pl_info(log, "Enabling debug meta layer: %s (v%d.%d.%d)",
+ debug_layers[i], PRINTF_VER(debug_layer_version));
+ PL_ARRAY_APPEND(tmp, layers, debug_layers[i]);
+ goto debug_layers_done;
+ }
+ }
+
+ // No layer found..
+ pl_warn(log, "API debugging requested but no debug meta layers present... ignoring");
+ debug = false;
+ }
+
+debug_layers_done: ;
+
+ for (int i = 0; i < params->num_layers; i++)
+ PL_ARRAY_APPEND(tmp, layers, params->layers[i]);
+
+ for (int i = 0; i < params->num_opt_layers; i++) {
+ const char *layer = params->opt_layers[i];
+ for (int n = 0; n < num_layers_avail; n++) {
+ if (strcmp(layer, layers_avail[n].layerName) == 0) {
+ PL_ARRAY_APPEND(tmp, layers, layer);
+ break;
+ }
+ }
+ }
+
+ // Enumerate all supported extensions
+ start = pl_clock_now();
+ PL_VK_LOAD_FUN(NULL, EnumerateInstanceExtensionProperties, get_addr);
+ uint32_t num_exts_avail = 0;
+ EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, NULL);
+ VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail);
+ EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, exts_avail);
+
+ struct {
+ VkExtensionProperties *exts;
+ uint32_t num_exts;
+ } *layer_exts = pl_calloc_ptr(tmp, num_layers_avail, layer_exts);
+
+ // Enumerate extensions from layers
+ for (int i = 0; i < num_layers_avail; i++) {
+ VkExtensionProperties **lexts = &layer_exts[i].exts;
+ uint32_t *num = &layer_exts[i].num_exts;
+
+ EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, NULL);
+ *lexts = pl_calloc_ptr(tmp, *num, *lexts);
+ EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, *lexts);
+
+ // Replace all extensions that are already available globally by {0}
+ for (int j = 0; j < *num; j++) {
+ for (int k = 0; k < num_exts_avail; k++) {
+ if (strcmp((*lexts)[j].extensionName, exts_avail[k].extensionName) == 0)
+ (*lexts)[j] = (VkExtensionProperties) {0};
+ }
+ }
+ }
+
+ pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance extensions");
+ pl_debug(log, "Available instance extensions:");
+ for (int i = 0; i < num_exts_avail; i++)
+ pl_debug(log, " %s", exts_avail[i].extensionName);
+ for (int i = 0; i < num_layers_avail; i++) {
+ for (int j = 0; j < layer_exts[i].num_exts; j++) {
+ if (!layer_exts[i].exts[j].extensionName[0])
+ continue;
+
+ pl_debug(log, " %s (via %s)",
+ layer_exts[i].exts[j].extensionName,
+ layers_avail[i].layerName);
+ }
+ }
+
+ // Add mandatory extensions
+ PL_ARRAY_APPEND(tmp, exts, VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
+
+ // Add optional extensions
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_instance_extensions); i++) {
+ const char *ext = vk_instance_extensions[i];
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ break;
+ }
+ }
+ }
+
+#ifdef VK_KHR_portability_enumeration
+ // Required for macOS ( MoltenVK ) compatibility
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME);
+ info.flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR;
+ break;
+ }
+ }
+#endif
+
+ // Add extra user extensions
+ for (int i = 0; i < params->num_extensions; i++) {
+ const char *ext = params->extensions[i];
+ PL_ARRAY_APPEND(tmp, exts, ext);
+
+ // Enable any additional layers that are required for this extension
+ for (int n = 0; n < num_layers_avail; n++) {
+ for (int j = 0; j < layer_exts[n].num_exts; j++) {
+ if (!layer_exts[n].exts[j].extensionName[0])
+ continue;
+ if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName);
+ goto next_user_ext;
+ }
+ }
+ }
+
+next_user_ext: ;
+ }
+
+ // Add extra optional user extensions
+ for (int i = 0; i < params->num_opt_extensions; i++) {
+ const char *ext = params->opt_extensions[i];
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ goto next_opt_user_ext;
+ }
+ }
+
+ for (int n = 0; n < num_layers_avail; n++) {
+ for (int j = 0; j < layer_exts[n].num_exts; j++) {
+ if (!layer_exts[n].exts[j].extensionName[0])
+ continue;
+ if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName);
+ goto next_opt_user_ext;
+ }
+ }
+ }
+
+next_opt_user_ext: ;
+ }
+
+ // If debugging is enabled, load the necessary debug utils extension
+ if (debug) {
+ const char * const ext = VK_EXT_DEBUG_UTILS_EXTENSION_NAME;
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ goto debug_ext_done;
+ }
+ }
+
+ for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) {
+ if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ goto debug_ext_done;
+ }
+ }
+
+ // No extension found
+ pl_warn(log, "API debug layers enabled but no debug report extension "
+ "found... ignoring. Debug messages may be spilling to "
+ "stdout/stderr!");
+ debug = false;
+ }
+
+debug_ext_done: ;
+
+ // Limit this to 1.3.250+ because of bugs in older versions.
+ if (debug && params->debug_extra &&
+ debug_layer_version >= VK_MAKE_API_VERSION(0, 1, 3, 259))
+ {
+ // Try enabling as many validation features as possible
+ static const VkValidationFeatureEnableEXT validation_features[] = {
+ VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT,
+ VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT,
+ VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT,
+ VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT,
+ };
+
+ static const VkValidationFeaturesEXT vinfo = {
+ .sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT,
+ .pEnabledValidationFeatures = validation_features,
+ .enabledValidationFeatureCount = PL_ARRAY_SIZE(validation_features),
+ };
+
+ const char * const ext = VK_EXT_VALIDATION_FEATURES_EXTENSION_NAME;
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ vk_link_struct(&info, &vinfo);
+ goto debug_extra_ext_done;
+ }
+ }
+
+ for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) {
+ if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(tmp, exts, ext);
+ vk_link_struct(&info, &vinfo);
+ goto debug_extra_ext_done;
+ }
+ }
+
+ pl_warn(log, "GPU-assisted validation enabled but not supported by "
+ "instance, disabling...");
+ }
+
+debug_extra_ext_done: ;
+
+ info.ppEnabledExtensionNames = exts.elem;
+ info.enabledExtensionCount = exts.num;
+ info.ppEnabledLayerNames = layers.elem;
+ info.enabledLayerCount = layers.num;
+
+ pl_info(log, "Creating vulkan instance%s", exts.num ? " with extensions:" : "");
+ for (int i = 0; i < exts.num; i++)
+ pl_info(log, " %s", exts.elem[i]);
+
+ if (layers.num) {
+ pl_info(log, " and layers:");
+ for (int i = 0; i < layers.num; i++)
+ pl_info(log, " %s", layers.elem[i]);
+ }
+
+ start = pl_clock_now();
+ PL_VK_LOAD_FUN(NULL, CreateInstance, get_addr);
+ VkResult res = CreateInstance(&info, PL_VK_ALLOC, &inst);
+ pl_log_cpu_time(log, start, pl_clock_now(), "creating vulkan instance");
+ if (res != VK_SUCCESS) {
+ pl_fatal(log, "Failed creating instance: %s", vk_res_str(res));
+ goto error;
+ }
+
+ struct pl_vk_inst_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct priv);
+ struct priv *p = PL_PRIV(pl_vk);
+ *pl_vk = (struct pl_vk_inst_t) {
+ .instance = inst,
+ .api_version = api_ver,
+ .get_proc_addr = get_addr,
+ .extensions = pl_steal(pl_vk, exts.elem),
+ .num_extensions = exts.num,
+ .layers = pl_steal(pl_vk, layers.elem),
+ .num_layers = layers.num,
+ };
+
+ // Set up a debug callback to catch validation messages
+ if (debug) {
+ VkDebugUtilsMessengerCreateInfoEXT dinfo = {
+ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT,
+ .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT |
+ VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT |
+ VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT |
+ VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT,
+ .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT |
+ VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT |
+ VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT,
+ .pfnUserCallback = vk_dbg_utils_cb,
+ .pUserData = (void *) log,
+ };
+
+ PL_VK_LOAD_FUN(inst, CreateDebugUtilsMessengerEXT, get_addr);
+ CreateDebugUtilsMessengerEXT(inst, &dinfo, PL_VK_ALLOC, &p->debug_utils_cb);
+ }
+
+ pl_free(tmp);
+ return pl_vk;
+
+error:
+ pl_fatal(log, "Failed initializing vulkan instance");
+ if (inst) {
+ PL_VK_LOAD_FUN(inst, DestroyInstance, get_addr);
+ DestroyInstance(inst, PL_VK_ALLOC);
+ }
+ pl_free(tmp);
+ return NULL;
+}
+
+const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS };
+
+void pl_vulkan_destroy(pl_vulkan *pl_vk)
+{
+ if (!*pl_vk)
+ return;
+
+ struct vk_ctx *vk = PL_PRIV(*pl_vk);
+ if (vk->dev) {
+ if ((*pl_vk)->gpu) {
+ PL_DEBUG(vk, "Waiting for remaining commands...");
+ pl_gpu_finish((*pl_vk)->gpu);
+ pl_assert(vk->cmds_pending.num == 0);
+
+ pl_gpu_destroy((*pl_vk)->gpu);
+ }
+ vk_malloc_destroy(&vk->ma);
+ for (int i = 0; i < vk->pools.num; i++)
+ vk_cmdpool_destroy(vk->pools.elem[i]);
+
+ if (!vk->imported)
+ vk->DestroyDevice(vk->dev, PL_VK_ALLOC);
+ }
+
+ for (int i = 0; i < vk->queue_locks.num; i++) {
+ for (int n = 0; n < vk->queue_locks.elem[i].num; n++)
+ pl_mutex_destroy(&vk->queue_locks.elem[i].elem[n]);
+ }
+
+ pl_vk_inst_destroy(&vk->internal_instance);
+ pl_mutex_destroy(&vk->lock);
+ pl_free_ptr((void **) pl_vk);
+}
+
+static bool supports_surf(pl_log log, VkInstance inst,
+ PFN_vkGetInstanceProcAddr get_addr,
+ VkPhysicalDevice physd, VkSurfaceKHR surf)
+{
+ // Hack for the VK macro's logging to work
+ struct { pl_log log; } *vk = (void *) &log;
+
+ PL_VK_LOAD_FUN(inst, GetPhysicalDeviceQueueFamilyProperties, get_addr);
+ PL_VK_LOAD_FUN(inst, GetPhysicalDeviceSurfaceSupportKHR, get_addr);
+ uint32_t qfnum = 0;
+ GetPhysicalDeviceQueueFamilyProperties(physd, &qfnum, NULL);
+
+ for (int i = 0; i < qfnum; i++) {
+ VkBool32 sup = false;
+ VK(GetPhysicalDeviceSurfaceSupportKHR(physd, i, surf, &sup));
+ if (sup)
+ return true;
+ }
+
+error:
+ return false;
+}
+
+VkPhysicalDevice pl_vulkan_choose_device(pl_log log,
+ const struct pl_vulkan_device_params *params)
+{
+ // Hack for the VK macro's logging to work
+ struct { pl_log log; } *vk = (void *) &log;
+ PL_INFO(vk, "Probing for vulkan devices:");
+
+ pl_assert(params->instance);
+ VkInstance inst = params->instance;
+ VkPhysicalDevice dev = VK_NULL_HANDLE;
+
+ PFN_vkGetInstanceProcAddr get_addr;
+ if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr)))
+ return NULL;
+
+ PL_VK_LOAD_FUN(inst, EnumeratePhysicalDevices, get_addr);
+ PL_VK_LOAD_FUN(inst, GetPhysicalDeviceProperties2, get_addr);
+ pl_assert(GetPhysicalDeviceProperties2);
+
+ pl_clock_t start = pl_clock_now();
+ VkPhysicalDevice *devices = NULL;
+ uint32_t num = 0;
+ VK(EnumeratePhysicalDevices(inst, &num, NULL));
+ devices = pl_calloc_ptr(NULL, num, devices);
+ VK(EnumeratePhysicalDevices(inst, &num, devices));
+ pl_log_cpu_time(log, start, pl_clock_now(), "enumerating physical devices");
+
+ static const struct { const char *name; int priority; } types[] = {
+ [VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU] = {"discrete", 5},
+ [VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU] = {"integrated", 4},
+ [VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU] = {"virtual", 3},
+ [VK_PHYSICAL_DEVICE_TYPE_CPU] = {"software", 2},
+ [VK_PHYSICAL_DEVICE_TYPE_OTHER] = {"other", 1},
+ };
+
+ static const uint8_t nil[VK_UUID_SIZE] = {0};
+ bool uuid_set = memcmp(params->device_uuid, nil, VK_UUID_SIZE) != 0;
+
+ int best = -1;
+ for (int i = 0; i < num; i++) {
+ VkPhysicalDeviceIDPropertiesKHR id_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+ };
+
+ VkPhysicalDeviceProperties2 prop = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+ .pNext = &id_props,
+ };
+
+ GetPhysicalDeviceProperties2(devices[i], &prop);
+ VkPhysicalDeviceType t = prop.properties.deviceType;
+ const char *dtype = t < PL_ARRAY_SIZE(types) ? types[t].name : "unknown?";
+ PL_INFO(vk, " GPU %d: %s v%d.%d.%d (%s)", i, prop.properties.deviceName,
+ PRINTF_VER(prop.properties.apiVersion), dtype);
+ PL_INFO(vk, " uuid: %s", PRINT_UUID(id_props.deviceUUID));
+
+ if (params->surface) {
+ if (!supports_surf(log, inst, get_addr, devices[i], params->surface)) {
+ PL_DEBUG(vk, " -> excluding due to lack of surface support");
+ continue;
+ }
+ }
+
+ if (uuid_set) {
+ if (memcmp(id_props.deviceUUID, params->device_uuid, VK_UUID_SIZE) == 0) {
+ dev = devices[i];
+ continue;
+ } else {
+ PL_DEBUG(vk, " -> excluding due to UUID mismatch");
+ continue;
+ }
+ } else if (params->device_name && params->device_name[0] != '\0') {
+ if (strcmp(params->device_name, prop.properties.deviceName) == 0) {
+ dev = devices[i];
+ continue;
+ } else {
+ PL_DEBUG(vk, " -> excluding due to name mismatch");
+ continue;
+ }
+ }
+
+ if (!params->allow_software && t == VK_PHYSICAL_DEVICE_TYPE_CPU) {
+ PL_DEBUG(vk, " -> excluding due to !params->allow_software");
+ continue;
+ }
+
+ if (prop.properties.apiVersion < PL_VK_MIN_VERSION) {
+ PL_DEBUG(vk, " -> excluding due to too low API version");
+ continue;
+ }
+
+ int priority = t < PL_ARRAY_SIZE(types) ? types[t].priority : 0;
+ if (priority > best) {
+ dev = devices[i];
+ best = priority;
+ }
+ }
+
+error:
+ pl_free(devices);
+ return dev;
+}
+
+static void lock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+ struct vk_ctx *vk = priv;
+ pl_mutex_lock(&vk->queue_locks.elem[qf].elem[qidx]);
+}
+
+static void unlock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+ struct vk_ctx *vk = priv;
+ pl_mutex_unlock(&vk->queue_locks.elem[qf].elem[qidx]);
+}
+
+static void init_queue_locks(struct vk_ctx *vk, uint32_t qfnum,
+ const VkQueueFamilyProperties *qfs)
+{
+ vk->queue_locks.elem = pl_calloc_ptr(vk->alloc, qfnum, vk->queue_locks.elem);
+ vk->queue_locks.num = qfnum;
+ for (int i = 0; i < qfnum; i++) {
+ const uint32_t qnum = qfs[i].queueCount;
+ vk->queue_locks.elem[i].elem = pl_calloc(vk->alloc, qnum, sizeof(pl_mutex));
+ vk->queue_locks.elem[i].num = qnum;
+ for (int n = 0; n < qnum; n++)
+ pl_mutex_init(&vk->queue_locks.elem[i].elem[n]);
+ }
+
+ vk->lock_queue = lock_queue_internal;
+ vk->unlock_queue = unlock_queue_internal;
+ vk->queue_ctx = vk;
+}
+
+// Find the most specialized queue supported a combination of flags. In cases
+// where there are multiple queue families at the same specialization level,
+// this finds the one with the most queues. Returns -1 if no queue was found.
+static int find_qf(VkQueueFamilyProperties *qfs, int qfnum, VkQueueFlags flags)
+{
+ int idx = -1;
+ for (int i = 0; i < qfnum; i++) {
+ if ((qfs[i].queueFlags & flags) != flags)
+ continue;
+
+ // QF is more specialized. Since we don't care about other bits like
+ // SPARSE_BIT, mask the ones we're interestew in
+ const VkQueueFlags mask = VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_TRANSFER_BIT |
+ VK_QUEUE_COMPUTE_BIT;
+
+ if (idx < 0 || (qfs[i].queueFlags & mask) < (qfs[idx].queueFlags & mask))
+ idx = i;
+
+ // QF has more queues (at the same specialization level)
+ if (qfs[i].queueFlags == qfs[idx].queueFlags &&
+ qfs[i].queueCount > qfs[idx].queueCount)
+ idx = i;
+ }
+
+ return idx;
+}
+
+static bool device_init(struct vk_ctx *vk, const struct pl_vulkan_params *params)
+{
+ pl_assert(vk->physd);
+ void *tmp = pl_tmp(NULL);
+
+ // Enumerate the queue families and find suitable families for each task
+ uint32_t qfnum = 0;
+ vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL);
+ VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs);
+ vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs);
+ init_queue_locks(vk, qfnum, qfs);
+
+ PL_DEBUG(vk, "Queue families supported by device:");
+ for (int i = 0; i < qfnum; i++) {
+ PL_DEBUG(vk, " %d: flags 0x%"PRIx32" num %"PRIu32, i,
+ qfs[i].queueFlags, qfs[i].queueCount);
+ }
+
+ VkQueueFlagBits gfx_flags = VK_QUEUE_GRAPHICS_BIT;
+ if (!params->async_compute)
+ gfx_flags |= VK_QUEUE_COMPUTE_BIT;
+
+ int idx_gfx = find_qf(qfs, qfnum, gfx_flags);
+ int idx_comp = find_qf(qfs, qfnum, VK_QUEUE_COMPUTE_BIT);
+ int idx_tf = find_qf(qfs, qfnum, VK_QUEUE_TRANSFER_BIT);
+ if (idx_tf < 0)
+ idx_tf = idx_comp;
+
+ if (!params->async_compute)
+ idx_comp = idx_gfx;
+ if (!params->async_transfer)
+ idx_tf = idx_gfx;
+
+ PL_DEBUG(vk, "Using graphics queue %d", idx_gfx);
+ if (idx_tf != idx_gfx)
+ PL_INFO(vk, "Using async transfer (queue %d)", idx_tf);
+ if (idx_comp != idx_gfx)
+ PL_INFO(vk, "Using async compute (queue %d)", idx_comp);
+
+ // Vulkan requires at least one GRAPHICS+COMPUTE queue, so if this fails
+ // something is horribly wrong.
+ pl_assert(idx_gfx >= 0 && idx_comp >= 0 && idx_tf >= 0);
+
+ // If needed, ensure we can actually present to the surface using this queue
+ if (params->surface) {
+ VkBool32 sup = false;
+ VK(vk->GetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx_gfx,
+ params->surface, &sup));
+ if (!sup) {
+ PL_FATAL(vk, "Queue family does not support surface presentation!");
+ goto error;
+ }
+ }
+
+ // Enumerate all supported extensions
+ pl_clock_t start = pl_clock_now();
+ uint32_t num_exts_avail = 0;
+ VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, NULL));
+ VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail);
+ VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, exts_avail));
+ pl_log_cpu_time(vk->log, start, pl_clock_now(), "enumerating device extensions");
+
+ PL_DEBUG(vk, "Available device extensions:");
+ for (int i = 0; i < num_exts_avail; i++)
+ PL_DEBUG(vk, " %s", exts_avail[i].extensionName);
+
+ // Add all extensions we need
+ if (params->surface)
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, VK_KHR_SWAPCHAIN_EXTENSION_NAME);
+
+ // Keep track of all optional function pointers associated with extensions
+ PL_ARRAY(const struct vk_fun *) ext_funs = {0};
+
+ // Add all optional device-level extensions extensions
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) {
+ const struct vk_ext *ext = &vk_device_extensions[i];
+ uint32_t core_ver = vk_ext_promoted_ver(ext->name);
+ if (core_ver && vk->api_ver >= core_ver) {
+ // Layer is already implicitly enabled by the API version
+ for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+ PL_ARRAY_APPEND(tmp, ext_funs, f);
+ continue;
+ }
+
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext->name, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, ext->name);
+ for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+ PL_ARRAY_APPEND(tmp, ext_funs, f);
+ break;
+ }
+ }
+ }
+
+ // Add extra user extensions
+ for (int i = 0; i < params->num_extensions; i++)
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, params->extensions[i]);
+
+ // Add optional extra user extensions
+ for (int i = 0; i < params->num_opt_extensions; i++) {
+ const char *ext = params->opt_extensions[i];
+ for (int n = 0; n < num_exts_avail; n++) {
+ if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+ PL_ARRAY_APPEND(vk->alloc, vk->exts, ext);
+ break;
+ }
+ }
+ }
+
+ VkPhysicalDeviceFeatures2 features = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR
+ };
+
+ vk_features_normalize(tmp, &pl_vulkan_required_features, vk->api_ver, &features);
+ vk_features_normalize(tmp, &pl_vulkan_recommended_features, vk->api_ver, &features);
+ vk_features_normalize(tmp, params->features, vk->api_ver, &features);
+
+ // Explicitly clear the features struct before querying feature support
+ // from the driver. This way, we don't mistakenly mark as supported
+ // features coming from structs the driver doesn't have support for.
+ VkPhysicalDeviceFeatures2 *features_sup = vk_chain_memdup(tmp, &features);;
+ for (VkBaseOutStructure *out = (void *) features_sup; out; out = out->pNext) {
+ const size_t size = vk_struct_size(out->sType);
+ memset(&out[1], 0, size - sizeof(out[0]));
+ }
+
+ vk->GetPhysicalDeviceFeatures2KHR(vk->physd, features_sup);
+
+ // Filter out unsupported features
+ for (VkBaseOutStructure *f = (VkBaseOutStructure *) &features; f; f = f->pNext) {
+ const VkBaseInStructure *sup = vk_find_struct(features_sup, f->sType);
+ VkBool32 *flags = (VkBool32 *) &f[1];
+ const VkBool32 *flags_sup = (const VkBool32 *) &sup[1];
+ const size_t size = vk_struct_size(f->sType) - sizeof(VkBaseOutStructure);
+ for (int i = 0; i < size / sizeof(VkBool32); i++)
+ flags[i] &= flags_sup[i];
+ }
+
+ // Construct normalized output chain
+ vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+ vk_features_normalize(vk->alloc, &features, 0, &vk->features);
+ if (!check_required_features(vk)) {
+ PL_FATAL(vk, "Vulkan device does not support all required features!");
+ goto error;
+ }
+
+ // Enable all queues at device creation time, to maximize compatibility
+ // with other API users (e.g. FFmpeg)
+ PL_ARRAY(VkDeviceQueueCreateInfo) qinfos = {0};
+ for (int i = 0; i < qfnum; i++) {
+ bool use_qf = i == idx_gfx || i == idx_comp || i == idx_tf;
+ use_qf |= qfs[i].queueFlags & params->extra_queues;
+ if (!use_qf)
+ continue;
+ PL_ARRAY_APPEND(tmp, qinfos, (VkDeviceQueueCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+ .queueFamilyIndex = i,
+ .queueCount = qfs[i].queueCount,
+ .pQueuePriorities = pl_calloc(tmp, qfs[i].queueCount, sizeof(float)),
+ });
+ }
+
+ VkDeviceCreateInfo dinfo = {
+ .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
+ .pNext = &features,
+ .pQueueCreateInfos = qinfos.elem,
+ .queueCreateInfoCount = qinfos.num,
+ .ppEnabledExtensionNames = vk->exts.elem,
+ .enabledExtensionCount = vk->exts.num,
+ };
+
+ PL_INFO(vk, "Creating vulkan device%s", vk->exts.num ? " with extensions:" : "");
+ for (int i = 0; i < vk->exts.num; i++)
+ PL_INFO(vk, " %s", vk->exts.elem[i]);
+
+ start = pl_clock_now();
+ VK(vk->CreateDevice(vk->physd, &dinfo, PL_VK_ALLOC, &vk->dev));
+ pl_log_cpu_time(vk->log, start, pl_clock_now(), "creating vulkan device");
+
+ // Load all mandatory device-level functions
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++)
+ load_vk_fun(vk, &vk_dev_funs[i]);
+
+ // Load all of the optional functions from the extensions we enabled
+ for (int i = 0; i < ext_funs.num; i++)
+ load_vk_fun(vk, ext_funs.elem[i]);
+
+ // Create the command pools for the queues we care about
+ const uint32_t qmax = PL_DEF(params->queue_count, UINT32_MAX);
+ for (int i = 0; i < qfnum; i++) {
+ if (i != idx_gfx && i != idx_tf && i != idx_comp)
+ continue; // ignore QFs not used internally
+
+ int qnum = qfs[i].queueCount;
+ if (qmax < qnum) {
+ PL_DEBUG(vk, "Restricting QF %d from %d queues to %d", i, qnum, qmax);
+ qnum = qmax;
+ }
+
+ struct vk_cmdpool *pool = vk_cmdpool_create(vk, i, qnum, qfs[i]);
+ if (!pool)
+ goto error;
+ PL_ARRAY_APPEND(vk->alloc, vk->pools, pool);
+
+ // Update the pool_* pointers based on the corresponding index
+ const char *qf_name = NULL;
+ if (i == idx_tf) {
+ vk->pool_transfer = pool;
+ qf_name = "transfer";
+ }
+ if (i == idx_comp) {
+ vk->pool_compute = pool;
+ qf_name = "compute";
+ }
+ if (i == idx_gfx) {
+ vk->pool_graphics = pool;
+ qf_name = "graphics";
+ }
+
+ for (int n = 0; n < pool->num_queues; n++)
+ PL_VK_NAME_HANDLE(QUEUE, pool->queues[n], qf_name);
+ }
+
+ pl_free(tmp);
+ return true;
+
+error:
+ PL_FATAL(vk, "Failed creating logical device!");
+ pl_free(tmp);
+ vk->failed = true;
+ return false;
+}
+
+static void lock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx)
+{
+ struct vk_ctx *vk = PL_PRIV(pl_vk);
+ vk->lock_queue(vk->queue_ctx, qf, qidx);
+}
+
+static void unlock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx)
+{
+ struct vk_ctx *vk = PL_PRIV(pl_vk);
+ vk->unlock_queue(vk->queue_ctx, qf, qidx);
+}
+
+static bool finalize_context(struct pl_vulkan_t *pl_vk, int max_glsl_version)
+{
+ struct vk_ctx *vk = PL_PRIV(pl_vk);
+
+ pl_assert(vk->pool_graphics);
+ pl_assert(vk->pool_compute);
+ pl_assert(vk->pool_transfer);
+
+ vk->ma = vk_malloc_create(vk);
+ if (!vk->ma)
+ return false;
+
+ pl_vk->gpu = pl_gpu_create_vk(vk);
+ if (!pl_vk->gpu)
+ return false;
+
+ // Blacklist / restrict features
+ if (max_glsl_version) {
+ struct pl_glsl_version *glsl = (struct pl_glsl_version *) &pl_vk->gpu->glsl;
+ glsl->version = PL_MIN(glsl->version, max_glsl_version);
+ glsl->version = PL_MAX(glsl->version, 140); // required for GL_KHR_vulkan_glsl
+ PL_INFO(vk, "Restricting GLSL version to %d... new version is %d",
+ max_glsl_version, glsl->version);
+ }
+
+ // Expose the resulting vulkan objects
+ pl_vk->instance = vk->inst;
+ pl_vk->phys_device = vk->physd;
+ pl_vk->device = vk->dev;
+ pl_vk->get_proc_addr = vk->GetInstanceProcAddr;
+ pl_vk->api_version = vk->api_ver;
+ pl_vk->extensions = vk->exts.elem;
+ pl_vk->num_extensions = vk->exts.num;
+ pl_vk->features = &vk->features;
+ pl_vk->num_queues = vk->pools.num;
+ pl_vk->queues = pl_calloc_ptr(vk->alloc, vk->pools.num, pl_vk->queues);
+ pl_vk->lock_queue = lock_queue;
+ pl_vk->unlock_queue = unlock_queue;
+
+ for (int i = 0; i < vk->pools.num; i++) {
+ struct pl_vulkan_queue *queues = (struct pl_vulkan_queue *) pl_vk->queues;
+ queues[i] = (struct pl_vulkan_queue) {
+ .index = vk->pools.elem[i]->qf,
+ .count = vk->pools.elem[i]->num_queues,
+ };
+
+ if (vk->pools.elem[i] == vk->pool_graphics)
+ pl_vk->queue_graphics = queues[i];
+ if (vk->pools.elem[i] == vk->pool_compute)
+ pl_vk->queue_compute = queues[i];
+ if (vk->pools.elem[i] == vk->pool_transfer)
+ pl_vk->queue_transfer = queues[i];
+ }
+
+ pl_assert(vk->lock_queue);
+ pl_assert(vk->unlock_queue);
+ return true;
+}
+
+pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params)
+{
+ params = PL_DEF(params, &pl_vulkan_default_params);
+ struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx);
+ struct vk_ctx *vk = PL_PRIV(pl_vk);
+ *vk = (struct vk_ctx) {
+ .vulkan = pl_vk,
+ .alloc = pl_vk,
+ .log = log,
+ .inst = params->instance,
+ .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr),
+ };
+
+ pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE);
+ if (!vk->GetInstanceProcAddr)
+ goto error;
+
+ if (!vk->inst) {
+ pl_assert(!params->surface);
+ pl_assert(!params->device);
+ PL_DEBUG(vk, "No VkInstance provided, creating one...");
+
+ // Mirror the instance params here to set `get_proc_addr` correctly
+ struct pl_vk_inst_params iparams;
+ iparams = *PL_DEF(params->instance_params, &pl_vk_inst_default_params);
+ iparams.get_proc_addr = params->get_proc_addr;
+ vk->internal_instance = pl_vk_inst_create(log, &iparams);
+ if (!vk->internal_instance)
+ goto error;
+ vk->inst = vk->internal_instance->instance;
+ }
+
+ // Directly load all mandatory instance-level function pointers, since
+ // these will be required for all further device creation logic
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++)
+ load_vk_fun(vk, &vk_inst_funs[i]);
+
+ // Choose the physical device
+ if (params->device) {
+ PL_DEBUG(vk, "Using specified VkPhysicalDevice");
+ vk->physd = params->device;
+ } else {
+ struct pl_vulkan_device_params dparams = {
+ .instance = vk->inst,
+ .get_proc_addr = params->get_proc_addr,
+ .surface = params->surface,
+ .device_name = params->device_name,
+ .allow_software = params->allow_software,
+ };
+ memcpy(dparams.device_uuid, params->device_uuid, VK_UUID_SIZE);
+
+ vk->physd = pl_vulkan_choose_device(log, &dparams);
+ if (!vk->physd) {
+ PL_FATAL(vk, "Found no suitable device, giving up.");
+ goto error;
+ }
+ }
+
+ VkPhysicalDeviceIDPropertiesKHR id_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+ };
+
+ VkPhysicalDeviceProperties2KHR prop = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+ .pNext = &id_props,
+ };
+
+ vk->GetPhysicalDeviceProperties2(vk->physd, &prop);
+ vk->props = prop.properties;
+
+ PL_INFO(vk, "Vulkan device properties:");
+ PL_INFO(vk, " Device Name: %s", prop.properties.deviceName);
+ PL_INFO(vk, " Device ID: %"PRIx32":%"PRIx32, prop.properties.vendorID,
+ prop.properties.deviceID);
+ PL_INFO(vk, " Device UUID: %s", PRINT_UUID(id_props.deviceUUID));
+ PL_INFO(vk, " Driver version: %"PRIx32, prop.properties.driverVersion);
+ PL_INFO(vk, " API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion));
+
+ // Needed by device_init
+ vk->api_ver = prop.properties.apiVersion;
+ if (params->max_api_version) {
+ vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version);
+ PL_INFO(vk, "Restricting API version to %d.%d.%d... new version %d.%d.%d",
+ PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver));
+ }
+
+ if (vk->api_ver < PL_VK_MIN_VERSION) {
+ PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum "
+ "required version of %d.%d.%d, cannot proceed!",
+ PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION));
+ goto error;
+ }
+
+ // Finally, initialize the logical device and the rest of the vk_ctx
+ if (!device_init(vk, params))
+ goto error;
+
+ if (!finalize_context(pl_vk, params->max_glsl_version))
+ goto error;
+
+ return pl_vk;
+
+error:
+ PL_FATAL(vk, "Failed initializing vulkan device");
+ pl_vulkan_destroy((pl_vulkan *) &pl_vk);
+ return NULL;
+}
+
+pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params)
+{
+ void *tmp = pl_tmp(NULL);
+
+ struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx);
+ struct vk_ctx *vk = PL_PRIV(pl_vk);
+ *vk = (struct vk_ctx) {
+ .vulkan = pl_vk,
+ .alloc = pl_vk,
+ .log = log,
+ .imported = true,
+ .inst = params->instance,
+ .physd = params->phys_device,
+ .dev = params->device,
+ .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr),
+ .lock_queue = params->lock_queue,
+ .unlock_queue = params->unlock_queue,
+ .queue_ctx = params->queue_ctx,
+ };
+
+ pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE);
+ if (!vk->GetInstanceProcAddr)
+ goto error;
+
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++)
+ load_vk_fun(vk, &vk_inst_funs[i]);
+
+ VkPhysicalDeviceIDPropertiesKHR id_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+ };
+
+ VkPhysicalDeviceProperties2KHR prop = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+ .pNext = &id_props,
+ };
+
+ pl_assert(vk->GetPhysicalDeviceProperties2);
+ vk->GetPhysicalDeviceProperties2(vk->physd, &prop);
+ vk->props = prop.properties;
+
+ PL_INFO(vk, "Imported vulkan device properties:");
+ PL_INFO(vk, " Device Name: %s", prop.properties.deviceName);
+ PL_INFO(vk, " Device ID: %"PRIx32":%"PRIx32, prop.properties.vendorID,
+ prop.properties.deviceID);
+ PL_INFO(vk, " Device UUID: %s", PRINT_UUID(id_props.deviceUUID));
+ PL_INFO(vk, " Driver version: %"PRIx32, prop.properties.driverVersion);
+ PL_INFO(vk, " API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion));
+
+ vk->api_ver = prop.properties.apiVersion;
+ if (params->max_api_version) {
+ vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version);
+ PL_INFO(vk, "Restricting API version to %d.%d.%d... new version %d.%d.%d",
+ PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver));
+ }
+
+ if (vk->api_ver < PL_VK_MIN_VERSION) {
+ PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum "
+ "required version of %d.%d.%d, cannot proceed!",
+ PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION));
+ goto error;
+ }
+
+ vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+ vk_features_normalize(vk->alloc, params->features, 0, &vk->features);
+ if (!check_required_features(vk)) {
+ PL_FATAL(vk, "Imported Vulkan device was not created with all required "
+ "features!");
+ goto error;
+ }
+
+ // Load all mandatory device-level functions
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++)
+ load_vk_fun(vk, &vk_dev_funs[i]);
+
+ // Load all of the optional functions from the extensions enabled
+ for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) {
+ const struct vk_ext *ext = &vk_device_extensions[i];
+ uint32_t core_ver = vk_ext_promoted_ver(ext->name);
+ if (core_ver && vk->api_ver >= core_ver) {
+ for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+ load_vk_fun(vk, f);
+ continue;
+ }
+ for (int n = 0; n < params->num_extensions; n++) {
+ if (strcmp(ext->name, params->extensions[n]) == 0) {
+ for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+ load_vk_fun(vk, f);
+ break;
+ }
+ }
+ }
+
+ uint32_t qfnum = 0;
+ vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL);
+ VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs);
+ vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs);
+ if (!params->lock_queue)
+ init_queue_locks(vk, qfnum, qfs);
+
+ // Create the command pools for each unique qf that exists
+ struct {
+ const struct pl_vulkan_queue *info;
+ struct vk_cmdpool **pool;
+ VkQueueFlagBits flags; // *any* of these flags provide the cap
+ } qinfos[] = {
+ {
+ .info = &params->queue_graphics,
+ .pool = &vk->pool_graphics,
+ .flags = VK_QUEUE_GRAPHICS_BIT,
+ }, {
+ .info = &params->queue_compute,
+ .pool = &vk->pool_compute,
+ .flags = VK_QUEUE_COMPUTE_BIT,
+ }, {
+ .info = &params->queue_transfer,
+ .pool = &vk->pool_transfer,
+ .flags = VK_QUEUE_TRANSFER_BIT |
+ VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_COMPUTE_BIT,
+ }
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(qinfos); i++) {
+ int qf = qinfos[i].info->index;
+ struct vk_cmdpool **pool = qinfos[i].pool;
+ if (!qinfos[i].info->count)
+ continue;
+
+ // API sanity check
+ pl_assert(qfs[qf].queueFlags & qinfos[i].flags);
+
+ // See if we already created a pool for this queue family
+ for (int j = 0; j < i; j++) {
+ if (qinfos[j].info->count && qinfos[j].info->index == qf) {
+ *pool = *qinfos[j].pool;
+ goto next_qf;
+ }
+ }
+
+ *pool = vk_cmdpool_create(vk, qf, qinfos[i].info->count, qfs[qf]);
+ if (!*pool)
+ goto error;
+ PL_ARRAY_APPEND(vk->alloc, vk->pools, *pool);
+
+ // Pre-emptively set "lower priority" pools as well
+ for (int j = i+1; j < PL_ARRAY_SIZE(qinfos); j++) {
+ if (qfs[qf].queueFlags & qinfos[j].flags)
+ *qinfos[j].pool = *pool;
+ }
+
+next_qf: ;
+ }
+
+ if (!vk->pool_graphics) {
+ PL_ERR(vk, "No valid queues provided?");
+ goto error;
+ }
+
+ if (!finalize_context(pl_vk, params->max_glsl_version))
+ goto error;
+
+ pl_free(tmp);
+ return pl_vk;
+
+error:
+ PL_FATAL(vk, "Failed importing vulkan device");
+ pl_vulkan_destroy((pl_vulkan *) &pl_vk);
+ pl_free(tmp);
+ return NULL;
+}
diff --git a/src/vulkan/formats.c b/src/vulkan/formats.c
new file mode 100644
index 0000000..f0eb0fb
--- /dev/null
+++ b/src/vulkan/formats.c
@@ -0,0 +1,616 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "formats.h"
+
+#define FMT(_name, num, size, ftype, bits, idx) \
+ (struct pl_fmt_t) { \
+ .name = _name, \
+ .type = PL_FMT_##ftype, \
+ .num_components = num, \
+ .component_depth = bits, \
+ .internal_size = size, \
+ .opaque = false, \
+ .texel_size = size, \
+ .texel_align = size, \
+ .host_bits = bits, \
+ .sample_order = idx, \
+ }
+
+#define IDX(...) {__VA_ARGS__}
+#define BITS(...) {__VA_ARGS__}
+
+#define REGFMT(name, num, bits, type) \
+ FMT(name, num, (num) * (bits) / 8, type, \
+ BITS(bits, bits, bits, bits), \
+ IDX(0, 1, 2, 3))
+
+#define EMUFMT(_name, in, en, ib, eb, ftype) \
+ (struct pl_fmt_t) { \
+ .name = _name, \
+ .type = PL_FMT_##ftype, \
+ .num_components = en, \
+ .component_depth = BITS(ib, ib, ib, ib),\
+ .internal_size = (in) * (ib) / 8, \
+ .opaque = false, \
+ .emulated = true, \
+ .texel_size = (en) * (eb) / 8, \
+ .texel_align = (eb) / 8, \
+ .host_bits = BITS(eb, eb, eb, eb),\
+ .sample_order = IDX(0, 1, 2, 3), \
+ }
+
+#define PACKED16FMT(_name, num, b) \
+ (struct pl_fmt_t) { \
+ .name = _name, \
+ .type = PL_FMT_UNORM, \
+ .num_components = num, \
+ .component_depth = BITS(b, b, b, b), \
+ .internal_size = (num) * 2, \
+ .texel_size = (num) * 2, \
+ .texel_align = (num) * 2, \
+ .host_bits = BITS(16, 16, 16, 16),\
+ .sample_order = IDX(0, 1, 2, 3), \
+ }
+
+#define PLANARFMT(_name, planes, size, bits) \
+ (struct pl_fmt_t) { \
+ .name = _name, \
+ .type = PL_FMT_UNORM, \
+ .num_planes = planes, \
+ .num_components = 3, \
+ .component_depth = {bits, bits, bits}, \
+ .internal_size = size, \
+ .opaque = true, \
+ }
+
+static const struct vk_format rgb8e = {
+ .tfmt = VK_FORMAT_R8G8B8A8_UNORM,
+ .bfmt = VK_FORMAT_R8G8B8_UNORM,
+ .icomps = 4,
+ .fmt = EMUFMT("rgb8", 4, 3, 8, 8, UNORM),
+};
+
+static const struct vk_format rgb16e = {
+ .tfmt = VK_FORMAT_R16G16B16A16_UNORM,
+ .bfmt = VK_FORMAT_R16G16B16_UNORM,
+ .icomps = 4,
+ .fmt = EMUFMT("rgb16", 4, 3, 16, 16, UNORM),
+};
+
+static const struct vk_format vk_formats[] = {
+ // Regular, byte-aligned integer formats
+ {VK_FORMAT_R8_UNORM, REGFMT("r8", 1, 8, UNORM)},
+ {VK_FORMAT_R8G8_UNORM, REGFMT("rg8", 2, 8, UNORM)},
+ {VK_FORMAT_R8G8B8_UNORM, REGFMT("rgb8", 3, 8, UNORM), .emufmt = &rgb8e},
+ {VK_FORMAT_R8G8B8A8_UNORM, REGFMT("rgba8", 4, 8, UNORM)},
+ {VK_FORMAT_R16_UNORM, REGFMT("r16", 1, 16, UNORM)},
+ {VK_FORMAT_R16G16_UNORM, REGFMT("rg16", 2, 16, UNORM)},
+ {VK_FORMAT_R16G16B16_UNORM, REGFMT("rgb16", 3, 16, UNORM), .emufmt = &rgb16e},
+ {VK_FORMAT_R16G16B16A16_UNORM, REGFMT("rgba16", 4, 16, UNORM)},
+
+ {VK_FORMAT_R8_SNORM, REGFMT("r8s", 1, 8, SNORM)},
+ {VK_FORMAT_R8G8_SNORM, REGFMT("rg8s", 2, 8, SNORM)},
+ {VK_FORMAT_R8G8B8_SNORM, REGFMT("rgb8s", 3, 8, SNORM)},
+ {VK_FORMAT_R8G8B8A8_SNORM, REGFMT("rgba8s", 4, 8, SNORM)},
+ {VK_FORMAT_R16_SNORM, REGFMT("r16s", 1, 16, SNORM)},
+ {VK_FORMAT_R16G16_SNORM, REGFMT("rg16s", 2, 16, SNORM)},
+ {VK_FORMAT_R16G16B16_SNORM, REGFMT("rgb16s", 3, 16, SNORM)},
+ {VK_FORMAT_R16G16B16A16_SNORM, REGFMT("rgba16s", 4, 16, SNORM)},
+
+ // Float formats (native formats: hf = half float, df = double float)
+ {VK_FORMAT_R16_SFLOAT, REGFMT("r16hf", 1, 16, FLOAT)},
+ {VK_FORMAT_R16G16_SFLOAT, REGFMT("rg16hf", 2, 16, FLOAT)},
+ {VK_FORMAT_R16G16B16_SFLOAT, REGFMT("rgb16hf", 3, 16, FLOAT)},
+ {VK_FORMAT_R16G16B16A16_SFLOAT, REGFMT("rgba16hf", 4, 16, FLOAT)},
+ {VK_FORMAT_R32_SFLOAT, REGFMT("r32f", 1, 32, FLOAT)},
+ {VK_FORMAT_R32G32_SFLOAT, REGFMT("rg32f", 2, 32, FLOAT)},
+ {VK_FORMAT_R32G32B32_SFLOAT, REGFMT("rgb32f", 3, 32, FLOAT)},
+ {VK_FORMAT_R32G32B32A32_SFLOAT, REGFMT("rgba32f", 4, 32, FLOAT)},
+
+ // Float formats (emulated upload/download)
+ {VK_FORMAT_R16_SFLOAT, EMUFMT("r16f", 1, 1, 16, 32, FLOAT)},
+ {VK_FORMAT_R16G16_SFLOAT, EMUFMT("rg16f", 2, 2, 16, 32, FLOAT)},
+ {VK_FORMAT_R16G16B16_SFLOAT, EMUFMT("rgb16f", 3, 3, 16, 32, FLOAT)},
+ {VK_FORMAT_R16G16B16A16_SFLOAT, EMUFMT("rgba16f", 4, 4, 16, 32, FLOAT)},
+
+ // Integer-sampled formats
+ {VK_FORMAT_R8_UINT, REGFMT("r8u", 1, 8, UINT)},
+ {VK_FORMAT_R8G8_UINT, REGFMT("rg8u", 2, 8, UINT)},
+ {VK_FORMAT_R8G8B8_UINT, REGFMT("rgb8u", 3, 8, UINT)},
+ {VK_FORMAT_R8G8B8A8_UINT, REGFMT("rgba8u", 4, 8, UINT)},
+ {VK_FORMAT_R16_UINT, REGFMT("r16u", 1, 16, UINT)},
+ {VK_FORMAT_R16G16_UINT, REGFMT("rg16u", 2, 16, UINT)},
+ {VK_FORMAT_R16G16B16_UINT, REGFMT("rgb16u", 3, 16, UINT)},
+ {VK_FORMAT_R16G16B16A16_UINT, REGFMT("rgba16u", 4, 16, UINT)},
+ {VK_FORMAT_R32_UINT, REGFMT("r32u", 1, 32, UINT)},
+ {VK_FORMAT_R32G32_UINT, REGFMT("rg32u", 2, 32, UINT)},
+ {VK_FORMAT_R32G32B32_UINT, REGFMT("rgb32u", 3, 32, UINT)},
+ {VK_FORMAT_R32G32B32A32_UINT, REGFMT("rgba32u", 4, 32, UINT)},
+
+ {VK_FORMAT_R8_SINT, REGFMT("r8i", 1, 8, SINT)},
+ {VK_FORMAT_R8G8_SINT, REGFMT("rg8i", 2, 8, SINT)},
+ {VK_FORMAT_R8G8B8_SINT, REGFMT("rgb8i", 3, 8, SINT)},
+ {VK_FORMAT_R8G8B8A8_SINT, REGFMT("rgba8i", 4, 8, SINT)},
+ {VK_FORMAT_R16_SINT, REGFMT("r16i", 1, 16, SINT)},
+ {VK_FORMAT_R16G16_SINT, REGFMT("rg16i", 2, 16, SINT)},
+ {VK_FORMAT_R16G16B16_SINT, REGFMT("rgb16i", 3, 16, SINT)},
+ {VK_FORMAT_R16G16B16A16_SINT, REGFMT("rgba16i", 4, 16, SINT)},
+ {VK_FORMAT_R32_SINT, REGFMT("r32i", 1, 32, SINT)},
+ {VK_FORMAT_R32G32_SINT, REGFMT("rg32i", 2, 32, SINT)},
+ {VK_FORMAT_R32G32B32_SINT, REGFMT("rgb32i", 3, 32, SINT)},
+ {VK_FORMAT_R32G32B32A32_SINT, REGFMT("rgba32i", 4, 32, SINT)},
+
+ // "Swapped" component order formats
+ {VK_FORMAT_B8G8R8_UNORM, FMT("bgr8", 3, 3, UNORM, BITS(8, 8, 8), IDX(2, 1, 0))},
+ {VK_FORMAT_B8G8R8A8_UNORM, FMT("bgra8", 4, 4, UNORM, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))},
+
+ {VK_FORMAT_B8G8R8_UINT, FMT("bgr8u", 3, 3, UINT, BITS(8, 8, 8), IDX(2, 1, 0))},
+ {VK_FORMAT_B8G8R8A8_UINT, FMT("bgra8u", 4, 4, UINT, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))},
+
+ {VK_FORMAT_B8G8R8_SINT, FMT("bgr8i", 3, 3, SINT, BITS(8, 8, 8), IDX(2, 1, 0))},
+ {VK_FORMAT_B8G8R8A8_SINT, FMT("bgra8i", 4, 4, SINT, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))},
+
+ // "Packed" integer formats
+ //
+ // Note: These have the component order reversed from what the vulkan name
+ // implies, because we order our IDX from LSB to MSB (consistent with the
+ // usual ordering from lowest byte to highest byte, on little endian
+ // platforms), but Vulkan names them from MSB to LSB.
+ {VK_FORMAT_R4G4_UNORM_PACK8, FMT("gr4", 2, 1, UNORM, BITS(4, 4), IDX(1, 0))},
+ {VK_FORMAT_B4G4R4A4_UNORM_PACK16, FMT("argb4", 4, 2, UNORM, BITS(4, 4, 4, 4), IDX(3, 0, 1, 2))},
+ {VK_FORMAT_R4G4B4A4_UNORM_PACK16, FMT("abgr4", 4, 2, UNORM, BITS(4, 4, 4, 4), IDX(3, 2, 1, 0))},
+
+ {VK_FORMAT_R5G6B5_UNORM_PACK16, FMT("bgr565", 3, 2, UNORM, BITS(5, 6, 5), IDX(2, 1, 0))},
+ {VK_FORMAT_B5G6R5_UNORM_PACK16, FMT("rgb565", 3, 2, UNORM, BITS(5, 6, 5), IDX(0, 1, 2))},
+
+ {VK_FORMAT_R5G5B5A1_UNORM_PACK16, FMT("a1bgr5", 4, 2, UNORM, BITS(1, 5, 5, 5), IDX(3, 2, 1, 0))},
+ {VK_FORMAT_B5G5R5A1_UNORM_PACK16, FMT("a1rgb5", 4, 2, UNORM, BITS(1, 5, 5, 5), IDX(3, 0, 1, 2))},
+ {VK_FORMAT_A1R5G5B5_UNORM_PACK16, FMT("bgr5a1", 4, 2, UNORM, BITS(5, 5, 5, 1), IDX(2, 1, 0, 3))},
+
+ {VK_FORMAT_A2B10G10R10_UNORM_PACK32, FMT("rgb10a2", 4, 4, UNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+ {VK_FORMAT_A2R10G10B10_UNORM_PACK32, FMT("bgr10a2", 4, 4, UNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+ {VK_FORMAT_A2B10G10R10_SNORM_PACK32, FMT("rgb10a2s", 4, 4, SNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+ {VK_FORMAT_A2R10G10B10_SNORM_PACK32, FMT("bgr10a2s", 4, 4, SNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+ {VK_FORMAT_A2B10G10R10_UINT_PACK32, FMT("rgb10a2u", 4, 4, UINT, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+ {VK_FORMAT_A2R10G10B10_UINT_PACK32, FMT("bgr10a2u", 4, 4, UINT, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+ {VK_FORMAT_A2B10G10R10_SINT_PACK32, FMT("rgb10a2i", 4, 4, SINT, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))},
+ {VK_FORMAT_A2R10G10B10_SINT_PACK32, FMT("bgr10a2i", 4, 4, SINT, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))},
+
+
+ // Packed 16 bit formats
+ {VK_FORMAT_R10X6_UNORM_PACK16, PACKED16FMT("rx10", 1, 10)},
+ {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, PACKED16FMT("rxgx10", 2, 10)},
+ {VK_FORMAT_R12X4_UNORM_PACK16, PACKED16FMT("rx12", 1, 12)},
+ {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, PACKED16FMT("rxgx12", 2, 12)},
+
+ // FIXME: enabling these requires VK_EXT_rgba10x6_formats or equivalent
+ // {VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16, PACKED16FMT("rxgxbxax10", 4, 10)},
+ // {VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16, PACKED16FMT("rxgxbxax12", 4, 12)},
+
+ // Planar formats
+ {VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, PLANARFMT("g8_b8_r8_420", 3, 12, 8),
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1},
+ {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, PLANARFMT("g8_b8_r8_422", 3, 16, 8),
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8_UNORM, .sx = 1},
+ {VK_FORMAT_R8_UNORM, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, PLANARFMT("g8_b8_r8_444", 3, 24, 8),
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8_UNORM},
+ },
+ },
+
+ {VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, PLANARFMT("g16_b16_r16_420", 3, 24, 16),
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1},
+ {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, PLANARFMT("g16_b16_r16_422", 3, 32, 16),
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16_UNORM, .sx = 1},
+ {VK_FORMAT_R16_UNORM, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, PLANARFMT("g16_b16_r16_444", 3, 48, 16),
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16_UNORM},
+ },
+ },
+
+ {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_420", 3, 24, 10),
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 1},
+ {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_422", 3, 32, 10),
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1},
+ {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_444", 3, 48, 10),
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ },
+ },
+
+ {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_420", 3, 24, 12),
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1},
+ {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_422", 3, 32, 12),
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1},
+ {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_444", 3, 48, 12),
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ },
+ },
+
+ {VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, PLANARFMT("g8_br8_420", 2, 12, 8),
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8G8_UNORM, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, PLANARFMT("g8_br8_422", 2, 16, 8),
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8G8_UNORM, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G8_B8R8_2PLANE_444_UNORM, PLANARFMT("g8_br8_444", 2, 24, 8),
+ .min_ver = VK_API_VERSION_1_3,
+ .pfmt = {
+ {VK_FORMAT_R8_UNORM},
+ {VK_FORMAT_R8G8_UNORM},
+ },
+ },
+
+ {VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, PLANARFMT("g16_br16_420", 2, 24, 16),
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16G16_UNORM, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, PLANARFMT("g16_br16_422", 2, 32, 16),
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16G16_UNORM, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G16_B16R16_2PLANE_444_UNORM, PLANARFMT("g16_br16_444", 2, 48, 16),
+ .min_ver = VK_API_VERSION_1_3,
+ .pfmt = {
+ {VK_FORMAT_R16_UNORM},
+ {VK_FORMAT_R16G16_UNORM},
+ },
+ },
+
+ {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_420", 2, 24, 10),
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_422", 2, 32, 10),
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_444", 2, 48, 10),
+ .min_ver = VK_API_VERSION_1_3,
+ .pfmt = {
+ {VK_FORMAT_R10X6_UNORM_PACK16},
+ {VK_FORMAT_R10X6G10X6_UNORM_2PACK16},
+ },
+ },
+
+ {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_420", 2, 24, 12),
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1, .sy = 1},
+ },
+ },
+ {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_422", 2, 32, 12),
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1},
+ },
+ },
+ {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_444", 2, 48, 12),
+ .min_ver = VK_API_VERSION_1_3,
+ .pfmt = {
+ {VK_FORMAT_R12X4_UNORM_PACK16},
+ {VK_FORMAT_R12X4G12X4_UNORM_2PACK16},
+ },
+ },
+
+ {0}
+};
+
+#undef BITS
+#undef IDX
+#undef REGFMT
+#undef FMT
+
+void vk_setup_formats(struct pl_gpu_t *gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ PL_ARRAY(pl_fmt) formats = {0};
+
+ // Texture format emulation requires at least support for texel buffers
+ bool has_emu = gpu->glsl.compute && gpu->limits.max_buffer_texels;
+
+ for (const struct vk_format *pvk_fmt = vk_formats; pvk_fmt->tfmt; pvk_fmt++) {
+ const struct vk_format *vk_fmt = pvk_fmt;
+
+ // Skip formats that require a too new version of Vulkan
+ if (vk_fmt->min_ver > vk->api_ver)
+ continue;
+
+ // Skip formats with innately emulated representation if unsupported
+ if (vk_fmt->fmt.emulated && !has_emu)
+ continue;
+
+ // Suppress some errors/warnings spit out by the format probing code
+ pl_log_level_cap(vk->log, PL_LOG_INFO);
+
+ bool has_drm_mods = vk->GetImageDrmFormatModifierPropertiesEXT;
+ VkDrmFormatModifierPropertiesEXT modifiers[16] = {0};
+ VkDrmFormatModifierPropertiesListEXT drm_props = {
+ .sType = VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT,
+ .drmFormatModifierCount = PL_ARRAY_SIZE(modifiers),
+ .pDrmFormatModifierProperties = modifiers,
+ };
+
+ VkFormatProperties2KHR prop2 = {
+ .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2,
+ .pNext = has_drm_mods ? &drm_props : NULL,
+ };
+
+ vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2);
+
+ // If wholly unsupported, try falling back to the emulation formats
+ // for texture operations
+ VkFormatProperties *prop = &prop2.formatProperties;
+ while (has_emu && !prop->optimalTilingFeatures && vk_fmt->emufmt) {
+ vk_fmt = vk_fmt->emufmt;
+ vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2);
+ }
+
+ VkFormatFeatureFlags texflags = prop->optimalTilingFeatures;
+ VkFormatFeatureFlags bufflags = prop->bufferFeatures;
+ if (vk_fmt->fmt.emulated) {
+ // Emulated formats might have a different buffer representation
+ // than their texture representation. If they don't, assume their
+ // buffer representation is nonsensical (e.g. r16f)
+ if (vk_fmt->bfmt) {
+ vk->GetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->bfmt, prop);
+ bufflags = prop->bufferFeatures;
+ } else {
+ bufflags = 0;
+ }
+ } else if (vk_fmt->fmt.num_planes) {
+ // Planar textures cannot be used directly
+ texflags = bufflags = 0;
+ }
+
+ pl_log_level_cap(vk->log, PL_LOG_NONE);
+
+ struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, struct pl_fmt_vk);
+ struct pl_fmt_vk *fmtp = PL_PRIV(fmt);
+ *fmt = vk_fmt->fmt;
+ *fmtp = (struct pl_fmt_vk) {
+ .vk_fmt = vk_fmt
+ };
+
+ // Always set the signature to the actual texture format, so we can use
+ // it to guarantee renderpass compatibility.
+ fmt->signature = (uint64_t) vk_fmt->tfmt;
+
+ // For sanity, clear the superfluous fields
+ for (int i = fmt->num_components; i < 4; i++) {
+ fmt->component_depth[i] = 0;
+ fmt->sample_order[i] = 0;
+ fmt->host_bits[i] = 0;
+ }
+
+ // We can set this universally
+ fmt->fourcc = pl_fmt_fourcc(fmt);
+
+ if (has_drm_mods) {
+
+ if (drm_props.drmFormatModifierCount == PL_ARRAY_SIZE(modifiers)) {
+ PL_WARN(gpu, "DRM modifier list for format %s possibly truncated",
+ fmt->name);
+ }
+
+ // Query the list of supported DRM modifiers from the driver
+ PL_ARRAY(uint64_t) modlist = {0};
+ for (int i = 0; i < drm_props.drmFormatModifierCount; i++) {
+ if (modifiers[i].drmFormatModifierPlaneCount > 1) {
+ PL_TRACE(gpu, "Ignoring format modifier %s of "
+ "format %s because its plane count %d > 1",
+ PRINT_DRM_MOD(modifiers[i].drmFormatModifier),
+ fmt->name, modifiers[i].drmFormatModifierPlaneCount);
+ continue;
+ }
+
+ // Only warn about texture format features relevant to us
+ const VkFormatFeatureFlags flag_mask =
+ VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT |
+ VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
+ VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
+ VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT |
+ VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_BLIT_SRC_BIT |
+ VK_FORMAT_FEATURE_BLIT_DST_BIT;
+
+
+ VkFormatFeatureFlags flags = modifiers[i].drmFormatModifierTilingFeatures;
+ if ((flags & flag_mask) != (texflags & flag_mask)) {
+ PL_DEBUG(gpu, "DRM format modifier %s of format %s "
+ "supports fewer caps (0x%"PRIx32") than optimal tiling "
+ "(0x%"PRIx32"), may result in limited capability!",
+ PRINT_DRM_MOD(modifiers[i].drmFormatModifier),
+ fmt->name, flags, texflags);
+ }
+
+ PL_ARRAY_APPEND(fmt, modlist, modifiers[i].drmFormatModifier);
+ }
+
+ fmt->num_modifiers = modlist.num;
+ fmt->modifiers = modlist.elem;
+
+ } else if (gpu->export_caps.tex & PL_HANDLE_DMA_BUF) {
+
+ // Hard-code a list of static mods that we're likely to support
+ static const uint64_t static_mods[2] = {
+ DRM_FORMAT_MOD_INVALID,
+ DRM_FORMAT_MOD_LINEAR,
+ };
+
+ fmt->num_modifiers = PL_ARRAY_SIZE(static_mods);
+ fmt->modifiers = static_mods;
+
+ }
+
+ struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bufbits[] = {
+ {VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT, PL_FMT_CAP_VERTEX},
+ {VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_UNIFORM},
+ {VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_STORAGE},
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(bufbits); i++) {
+ if ((bufflags & bufbits[i].flags) == bufbits[i].flags)
+ fmt->caps |= bufbits[i].caps;
+ }
+
+ if (fmt->caps) {
+ fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, ""));
+ pl_assert(fmt->glsl_type);
+ }
+
+ struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bits[] = {
+ {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT, PL_FMT_CAP_BLENDABLE},
+ {VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, PL_FMT_CAP_LINEAR},
+ {VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT, PL_FMT_CAP_SAMPLEABLE},
+ {VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT, PL_FMT_CAP_STORABLE},
+ {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT, PL_FMT_CAP_RENDERABLE},
+
+ // We don't distinguish between the two blit modes for pl_fmt_caps
+ {VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT,
+ PL_FMT_CAP_BLITTABLE},
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(bits); i++) {
+ if ((texflags & bits[i].flags) == bits[i].flags)
+ fmt->caps |= bits[i].caps;
+ }
+
+ // For blit emulation via compute shaders
+ if (!(fmt->caps & PL_FMT_CAP_BLITTABLE) && (fmt->caps & PL_FMT_CAP_STORABLE)) {
+ fmt->caps |= PL_FMT_CAP_BLITTABLE;
+ fmtp->blit_emulated = true;
+ }
+
+ // This is technically supported for all textures, but the semantics
+ // of pl_gpu require it only be listed for non-opaque ones
+ if (!fmt->opaque)
+ fmt->caps |= PL_FMT_CAP_HOST_READABLE;
+
+ // Vulkan requires a minimum GLSL version that supports textureGather()
+ if (fmt->caps & PL_FMT_CAP_SAMPLEABLE)
+ fmt->gatherable = true;
+
+ // Disable implied capabilities where the dependencies are unavailable
+ enum pl_fmt_caps storable = PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE;
+ if (!(fmt->caps & PL_FMT_CAP_SAMPLEABLE))
+ fmt->caps &= ~PL_FMT_CAP_LINEAR;
+ if (!gpu->glsl.compute)
+ fmt->caps &= ~storable;
+
+ bool has_nofmt = vk->features.features.shaderStorageImageReadWithoutFormat &&
+ vk->features.features.shaderStorageImageWriteWithoutFormat;
+
+ if (fmt->caps & storable) {
+ int real_comps = PL_DEF(vk_fmt->icomps, fmt->num_components);
+ fmt->glsl_format = pl_fmt_glsl_format(fmt, real_comps);
+ if (!fmt->glsl_format && !has_nofmt) {
+ PL_DEBUG(gpu, "Storable format '%s' has no matching GLSL "
+ "format qualifier but read/write without format "
+ "is not supported.. disabling", fmt->name);
+ fmt->caps &= ~storable;
+ }
+ }
+
+ if (fmt->caps & storable)
+ fmt->caps |= PL_FMT_CAP_READWRITE;
+
+ // Pick sub-plane formats for planar formats
+ for (int n = 0; n < fmt->num_planes; n++) {
+ for (int i = 0; i < formats.num; i++) {
+ if (formats.elem[i]->signature == vk_fmt->pfmt[n].fmt) {
+ fmt->planes[n].format = formats.elem[i];
+ fmt->planes[n].shift_x = vk_fmt->pfmt[n].sx;
+ fmt->planes[n].shift_y = vk_fmt->pfmt[n].sy;
+ break;
+ }
+ }
+
+ pl_assert(fmt->planes[n].format);
+ }
+
+ PL_ARRAY_APPEND(gpu, formats, fmt);
+ }
+
+ gpu->formats = formats.elem;
+ gpu->num_formats = formats.num;
+}
diff --git a/src/vulkan/formats.h b/src/vulkan/formats.h
new file mode 100644
index 0000000..b1408fd
--- /dev/null
+++ b/src/vulkan/formats.h
@@ -0,0 +1,34 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+#include "gpu.h"
+
+struct vk_format {
+ VkFormat tfmt; // internal vulkan format enum (textures)
+ struct pl_fmt_t fmt;// pl_fmt template (features will be auto-detected)
+ int icomps; // internal component count (or 0 to infer from `fmt`)
+ VkFormat bfmt; // vulkan format for use as buffers (or 0 to use `tfmt`)
+ const struct vk_format *emufmt; // alternate format for emulation
+ uint32_t min_ver; // minimum vulkan API version for this format to exist
+ struct { VkFormat fmt; int sx, sy; } pfmt[4]; // plane formats (for planar textures)
+};
+
+// Add all supported formats to the `pl_gpu` format list
+void vk_setup_formats(struct pl_gpu_t *gpu);
diff --git a/src/vulkan/gpu.c b/src/vulkan/gpu.c
new file mode 100644
index 0000000..69aca67
--- /dev/null
+++ b/src/vulkan/gpu.c
@@ -0,0 +1,924 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "formats.h"
+#include "glsl/spirv.h"
+
+#ifdef PL_HAVE_UNIX
+#include <unistd.h>
+#endif
+
+// Gives us enough queries for 8 results
+#define QUERY_POOL_SIZE 16
+
+struct pl_timer_t {
+ VkQueryPool qpool; // even=start, odd=stop
+ int index_write; // next index to write to
+ int index_read; // next index to read from
+ uint64_t pending; // bitmask of queries that are still running
+};
+
+static inline uint64_t timer_bit(int index)
+{
+ return 1llu << (index / 2);
+}
+
+static void timer_destroy_cb(pl_gpu gpu, pl_timer timer)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ pl_assert(!timer->pending);
+ vk->DestroyQueryPool(vk->dev, timer->qpool, PL_VK_ALLOC);
+ pl_free(timer);
+}
+
+static pl_timer vk_timer_create(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ pl_timer timer = pl_alloc_ptr(NULL, timer);
+ *timer = (struct pl_timer_t) {0};
+
+ struct VkQueryPoolCreateInfo qinfo = {
+ .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
+ .queryType = VK_QUERY_TYPE_TIMESTAMP,
+ .queryCount = QUERY_POOL_SIZE,
+ };
+
+ VK(vk->CreateQueryPool(vk->dev, &qinfo, PL_VK_ALLOC, &timer->qpool));
+ return timer;
+
+error:
+ timer_destroy_cb(gpu, timer);
+ return NULL;
+}
+
+static void vk_timer_destroy(pl_gpu gpu, pl_timer timer)
+{
+ vk_gpu_idle_callback(gpu, (vk_cb) timer_destroy_cb, gpu, timer);
+}
+
+static uint64_t vk_timer_query(pl_gpu gpu, pl_timer timer)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ if (timer->index_read == timer->index_write)
+ return 0; // no more unprocessed results
+
+ vk_poll_commands(vk, 0);
+ if (timer->pending & timer_bit(timer->index_read))
+ return 0; // still waiting for results
+
+ VkResult res;
+ uint64_t ts[2] = {0};
+ res = vk->GetQueryPoolResults(vk->dev, timer->qpool, timer->index_read, 2,
+ sizeof(ts), &ts[0], sizeof(uint64_t),
+ VK_QUERY_RESULT_64_BIT);
+
+ switch (res) {
+ case VK_SUCCESS:
+ timer->index_read = (timer->index_read + 2) % QUERY_POOL_SIZE;
+ return (ts[1] - ts[0]) * vk->props.limits.timestampPeriod;
+ case VK_NOT_READY:
+ return 0;
+ default:
+ PL_VK_ASSERT(res, "Retrieving query pool results");
+ }
+
+error:
+ return 0;
+}
+
+static void timer_begin(pl_gpu gpu, struct vk_cmd *cmd, pl_timer timer)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ if (!timer)
+ return;
+
+ if (!cmd->pool->props.timestampValidBits) {
+ PL_TRACE(gpu, "QF %d does not support timestamp queries", cmd->pool->qf);
+ return;
+ }
+
+ vk_poll_commands(vk, 0);
+ if (timer->pending & timer_bit(timer->index_write))
+ return; // next query is still running, skip this timer
+
+ VkQueueFlags reset_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
+ if (cmd->pool->props.queueFlags & reset_flags) {
+ // Use direct command buffer resets
+ vk->CmdResetQueryPool(cmd->buf, timer->qpool, timer->index_write, 2);
+ } else {
+ // Use host query reset
+ vk->ResetQueryPool(vk->dev, timer->qpool, timer->index_write, 2);
+ }
+
+ vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+ timer->qpool, timer->index_write);
+
+ p->cmd_timer = timer;
+}
+
+static inline bool supports_marks(struct vk_cmd *cmd) {
+ // Spec says debug markers are only available on graphics/compute queues
+ VkQueueFlags flags = cmd->pool->props.queueFlags;
+ return flags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT);
+}
+
+struct vk_cmd *_begin_cmd(pl_gpu gpu, enum queue_type type, const char *label,
+ pl_timer timer)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ pl_mutex_lock(&p->recording);
+
+ struct vk_cmdpool *pool;
+ switch (type) {
+ case ANY: pool = p->cmd ? p->cmd->pool : vk->pool_graphics; break;
+ case GRAPHICS: pool = vk->pool_graphics; break;
+ case COMPUTE: pool = vk->pool_compute; break;
+ case TRANSFER: pool = vk->pool_transfer; break;
+ default: pl_unreachable();
+ }
+
+ if (!p->cmd || p->cmd->pool != pool) {
+ vk_cmd_submit(&p->cmd);
+ p->cmd = vk_cmd_begin(pool, label);
+ if (!p->cmd) {
+ pl_mutex_unlock(&p->recording);
+ return NULL;
+ }
+ }
+
+ if (vk->CmdBeginDebugUtilsLabelEXT && supports_marks(p->cmd)) {
+ vk->CmdBeginDebugUtilsLabelEXT(p->cmd->buf, &(VkDebugUtilsLabelEXT) {
+ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,
+ .pLabelName = label,
+ });
+ }
+
+ timer_begin(gpu, p->cmd, timer);
+ return p->cmd;
+}
+
+static void timer_end_cb(void *ptimer, void *pindex)
+{
+ pl_timer timer = ptimer;
+ int index = (uintptr_t) pindex;
+ timer->pending &= ~timer_bit(index);
+}
+
+bool _end_cmd(pl_gpu gpu, struct vk_cmd **pcmd, bool submit)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ bool ret = true;
+ if (!pcmd) {
+ if (submit) {
+ pl_mutex_lock(&p->recording);
+ ret = vk_cmd_submit(&p->cmd);
+ pl_mutex_unlock(&p->recording);
+ }
+ return ret;
+ }
+
+ struct vk_cmd *cmd = *pcmd;
+ pl_assert(p->cmd == cmd);
+
+ if (p->cmd_timer) {
+ pl_timer timer = p->cmd_timer;
+ vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+ timer->qpool, timer->index_write + 1);
+
+ timer->pending |= timer_bit(timer->index_write);
+ vk_cmd_callback(cmd, (vk_cb) timer_end_cb, timer,
+ (void *) (uintptr_t) timer->index_write);
+
+ timer->index_write = (timer->index_write + 2) % QUERY_POOL_SIZE;
+ if (timer->index_write == timer->index_read) {
+ // forcibly drop the least recent result to make space
+ timer->index_read = (timer->index_read + 2) % QUERY_POOL_SIZE;
+ }
+
+ p->cmd_timer = NULL;
+ }
+
+ if (vk->CmdEndDebugUtilsLabelEXT && supports_marks(cmd))
+ vk->CmdEndDebugUtilsLabelEXT(cmd->buf);
+
+ if (submit)
+ ret = vk_cmd_submit(&p->cmd);
+
+ pl_mutex_unlock(&p->recording);
+ return ret;
+}
+
+void vk_gpu_idle_callback(pl_gpu gpu, vk_cb cb, const void *priv, const void *arg)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ pl_mutex_lock(&p->recording);
+ if (p->cmd) {
+ vk_cmd_callback(p->cmd, cb, priv, arg);
+ } else {
+ vk_dev_callback(vk, cb, priv, arg);
+ }
+ pl_mutex_unlock(&p->recording);
+}
+
+static void vk_gpu_destroy(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ vk_cmd_submit(&p->cmd);
+ vk_wait_idle(vk);
+
+ for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) {
+ for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++)
+ vk->DestroySampler(vk->dev, p->samplers[s][a], PL_VK_ALLOC);
+ }
+
+ pl_spirv_destroy(&p->spirv);
+ pl_mutex_destroy(&p->recording);
+ pl_free((void *) gpu);
+}
+
+pl_vulkan pl_vulkan_get(pl_gpu gpu)
+{
+ const struct pl_gpu_fns *impl = PL_PRIV(gpu);
+ if (impl->destroy == vk_gpu_destroy) {
+ struct pl_vk *p = (struct pl_vk *) impl;
+ return p->vk->vulkan;
+ }
+
+ return NULL;
+}
+
+static pl_handle_caps vk_sync_handle_caps(struct vk_ctx *vk)
+{
+ pl_handle_caps caps = 0;
+
+ for (int i = 0; vk_sync_handle_list[i]; i++) {
+ enum pl_handle_type type = vk_sync_handle_list[i];
+
+ VkPhysicalDeviceExternalSemaphoreInfo info = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR,
+ .handleType = vk_sync_handle_type(type),
+ };
+
+ VkExternalSemaphoreProperties props = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR,
+ };
+
+ vk->GetPhysicalDeviceExternalSemaphoreProperties(vk->physd, &info, &props);
+ VkExternalSemaphoreFeatureFlags flags = props.externalSemaphoreFeatures;
+ if ((props.compatibleHandleTypes & info.handleType) &&
+ (flags & VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR))
+ {
+ caps |= type;
+ }
+ }
+
+ return caps;
+}
+
+static pl_handle_caps vk_tex_handle_caps(struct vk_ctx *vk, bool import)
+{
+ pl_handle_caps caps = 0;
+
+ for (int i = 0; vk_mem_handle_list[i]; i++) {
+ enum pl_handle_type handle_type = vk_mem_handle_list[i];
+ if (handle_type == PL_HANDLE_DMA_BUF && !vk->GetImageDrmFormatModifierPropertiesEXT) {
+ PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: no DRM modifiers",
+ vk_handle_name(vk_mem_handle_type(PL_HANDLE_DMA_BUF)),
+ (unsigned int) PL_HANDLE_DMA_BUF);
+ continue;
+ }
+
+ // Query whether creation of a "basic" dummy texture would work
+ VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
+ .drmFormatModifier = DRM_FORMAT_MOD_LINEAR,
+ .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ };
+
+ VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR,
+ .handleType = vk_mem_handle_type(handle_type),
+ };
+
+ VkPhysicalDeviceImageFormatInfo2KHR pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR,
+ .pNext = &ext_pinfo,
+ .format = VK_FORMAT_R8_UNORM,
+ .type = VK_IMAGE_TYPE_2D,
+ .tiling = VK_IMAGE_TILING_OPTIMAL,
+ .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+ };
+
+ if (handle_type == PL_HANDLE_DMA_BUF) {
+ vk_link_struct(&pinfo, &drm_pinfo);
+ pinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+ }
+
+ VkExternalImageFormatPropertiesKHR ext_props = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
+ };
+
+ VkImageFormatProperties2KHR props = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR,
+ .pNext = &ext_props,
+ };
+
+ VkResult res;
+ res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props);
+ if (res != VK_SUCCESS) {
+ PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: %s",
+ vk_handle_name(ext_pinfo.handleType),
+ (unsigned int) handle_type,
+ vk_res_str(res));
+ continue;
+ }
+
+ if (vk_external_mem_check(vk, &ext_props.externalMemoryProperties,
+ handle_type, import))
+ {
+ caps |= handle_type;
+ }
+ }
+
+#ifdef VK_EXT_metal_objects
+ if (vk->ExportMetalObjectsEXT && import)
+ caps |= PL_HANDLE_MTL_TEX | PL_HANDLE_IOSURFACE;
+#endif
+
+ return caps;
+}
+
+static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = {
+ [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST,
+ [PL_TEX_SAMPLE_LINEAR] = VK_FILTER_LINEAR,
+};
+
+static inline struct pl_spirv_version get_spirv_version(const struct vk_ctx *vk)
+{
+ if (vk->api_ver >= VK_API_VERSION_1_3) {
+ const VkPhysicalDeviceMaintenance4Features *device_maintenance4;
+ device_maintenance4 = vk_find_struct(&vk->features,
+ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES);
+
+ if (device_maintenance4 && device_maintenance4->maintenance4) {
+ return (struct pl_spirv_version) {
+ .env_version = VK_API_VERSION_1_3,
+ .spv_version = PL_SPV_VERSION(1, 6),
+ };
+ }
+ }
+
+ pl_assert(vk->api_ver >= VK_API_VERSION_1_2);
+ return (struct pl_spirv_version) {
+ .env_version = VK_API_VERSION_1_2,
+ .spv_version = PL_SPV_VERSION(1, 5),
+ };
+}
+
+static const struct pl_gpu_fns pl_fns_vk;
+
+pl_gpu pl_gpu_create_vk(struct vk_ctx *vk)
+{
+ pl_assert(vk->dev);
+
+ struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_vk);
+ gpu->log = vk->log;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ pl_mutex_init(&p->recording);
+ p->vk = vk;
+ p->impl = pl_fns_vk;
+ p->spirv = pl_spirv_create(vk->log, get_spirv_version(vk));
+ if (!p->spirv)
+ goto error;
+
+ // Query all device properties
+ VkPhysicalDevicePCIBusInfoPropertiesEXT pci_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT,
+ };
+
+ VkPhysicalDeviceIDPropertiesKHR id_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR,
+ .pNext = &pci_props,
+ };
+
+ VkPhysicalDevicePushDescriptorPropertiesKHR pushd_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR,
+ .pNext = &id_props,
+ };
+
+ VkPhysicalDeviceSubgroupProperties group_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES,
+ .pNext = &pushd_props,
+ };
+
+ VkPhysicalDeviceExternalMemoryHostPropertiesEXT host_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT,
+ .pNext = &group_props,
+ };
+
+ VkPhysicalDeviceProperties2KHR props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
+ .pNext = &host_props,
+ };
+
+ bool is_portability = false;
+
+#ifdef VK_KHR_portability_subset
+ VkPhysicalDevicePortabilitySubsetPropertiesKHR port_props = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PORTABILITY_SUBSET_PROPERTIES_KHR,
+ .minVertexInputBindingStrideAlignment = 1,
+ };
+
+ for (int i = 0; i < vk->exts.num; i++) {
+ if (!strcmp(vk->exts.elem[i], VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME)) {
+ vk_link_struct(&props, &port_props);
+ is_portability = true;
+ break;
+ }
+ }
+#endif
+
+ vk->GetPhysicalDeviceProperties2(vk->physd, &props);
+ VkPhysicalDeviceLimits limits = props.properties.limits;
+
+ // Determine GLSL features and limits
+ gpu->glsl = (struct pl_glsl_version) {
+ .version = 450,
+ .vulkan = true,
+ .compute = true,
+ .max_shmem_size = limits.maxComputeSharedMemorySize,
+ .max_group_threads = limits.maxComputeWorkGroupInvocations,
+ .max_group_size = {
+ limits.maxComputeWorkGroupSize[0],
+ limits.maxComputeWorkGroupSize[1],
+ limits.maxComputeWorkGroupSize[2],
+ },
+ };
+
+ VkShaderStageFlags req_stages = VK_SHADER_STAGE_FRAGMENT_BIT |
+ VK_SHADER_STAGE_COMPUTE_BIT;
+ VkSubgroupFeatureFlags req_flags = VK_SUBGROUP_FEATURE_BASIC_BIT |
+ VK_SUBGROUP_FEATURE_VOTE_BIT |
+ VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
+ VK_SUBGROUP_FEATURE_BALLOT_BIT |
+ VK_SUBGROUP_FEATURE_SHUFFLE_BIT;
+
+ if ((group_props.supportedStages & req_stages) == req_stages &&
+ (group_props.supportedOperations & req_flags) == req_flags)
+ {
+ gpu->glsl.subgroup_size = group_props.subgroupSize;
+ }
+
+ if (vk->features.features.shaderImageGatherExtended) {
+ gpu->glsl.min_gather_offset = limits.minTexelGatherOffset;
+ gpu->glsl.max_gather_offset = limits.maxTexelGatherOffset;
+ }
+
+ const size_t max_size = vk_malloc_avail(vk->ma, 0);
+ gpu->limits = (struct pl_gpu_limits) {
+ // pl_gpu
+ .thread_safe = true,
+ .callbacks = true,
+ // pl_buf
+ .max_buf_size = max_size,
+ .max_ubo_size = PL_MIN(limits.maxUniformBufferRange, max_size),
+ .max_ssbo_size = PL_MIN(limits.maxStorageBufferRange, max_size),
+ .max_vbo_size = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT),
+ .max_mapped_size = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT),
+ .max_buffer_texels = PL_MIN(limits.maxTexelBufferElements, max_size),
+ .align_host_ptr = host_props.minImportedHostPointerAlignment,
+ .host_cached = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_CACHED_BIT),
+ // pl_tex
+ .max_tex_1d_dim = limits.maxImageDimension1D,
+ .max_tex_2d_dim = limits.maxImageDimension2D,
+ .max_tex_3d_dim = limits.maxImageDimension3D,
+ .blittable_1d_3d = true,
+ .buf_transfer = true,
+ .align_tex_xfer_pitch = limits.optimalBufferCopyRowPitchAlignment,
+ .align_tex_xfer_offset = pl_lcm(limits.optimalBufferCopyOffsetAlignment, 4),
+ // pl_pass
+ .max_variable_comps = 0, // vulkan doesn't support these at all
+ .max_constants = SIZE_MAX,
+ .array_size_constants = !is_portability,
+ .max_pushc_size = limits.maxPushConstantsSize,
+#ifdef VK_KHR_portability_subset
+ .align_vertex_stride = port_props.minVertexInputBindingStrideAlignment,
+#else
+ .align_vertex_stride = 1,
+#endif
+ .max_dispatch = {
+ limits.maxComputeWorkGroupCount[0],
+ limits.maxComputeWorkGroupCount[1],
+ limits.maxComputeWorkGroupCount[2],
+ },
+ .fragment_queues = vk->pool_graphics->num_queues,
+ .compute_queues = vk->pool_compute->num_queues,
+ };
+
+ gpu->export_caps.buf = vk_malloc_handle_caps(vk->ma, false);
+ gpu->import_caps.buf = vk_malloc_handle_caps(vk->ma, true);
+ gpu->export_caps.tex = vk_tex_handle_caps(vk, false);
+ gpu->import_caps.tex = vk_tex_handle_caps(vk, true);
+ gpu->export_caps.sync = vk_sync_handle_caps(vk);
+ gpu->import_caps.sync = 0; // Not supported yet
+
+ if (pl_gpu_supports_interop(gpu)) {
+ pl_static_assert(sizeof(gpu->uuid) == VK_UUID_SIZE);
+ memcpy(gpu->uuid, id_props.deviceUUID, sizeof(gpu->uuid));
+
+ gpu->pci.domain = pci_props.pciDomain;
+ gpu->pci.bus = pci_props.pciBus;
+ gpu->pci.device = pci_props.pciDevice;
+ gpu->pci.function = pci_props.pciFunction;
+ }
+
+ if (vk->CmdPushDescriptorSetKHR)
+ p->max_push_descriptors = pushd_props.maxPushDescriptors;
+
+ vk_setup_formats(gpu);
+
+ // Compute the correct minimum texture alignment
+ p->min_texel_alignment = 1;
+ for (int i = 0; i < gpu->num_formats; i++) {
+ if (gpu->formats[i]->emulated || gpu->formats[i]->opaque)
+ continue;
+ size_t texel_size = gpu->formats[i]->texel_size;
+ p->min_texel_alignment = pl_lcm(p->min_texel_alignment, texel_size);
+ }
+ PL_DEBUG(gpu, "Minimum texel alignment: %zu", p->min_texel_alignment);
+
+ // Initialize the samplers
+ for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) {
+ for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++) {
+ static const VkSamplerAddressMode modes[PL_TEX_ADDRESS_MODE_COUNT] = {
+ [PL_TEX_ADDRESS_CLAMP] = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
+ [PL_TEX_ADDRESS_REPEAT] = VK_SAMPLER_ADDRESS_MODE_REPEAT,
+ [PL_TEX_ADDRESS_MIRROR] = VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT,
+ };
+
+ VkSamplerCreateInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
+ .magFilter = filters[s],
+ .minFilter = filters[s],
+ .addressModeU = modes[a],
+ .addressModeV = modes[a],
+ .addressModeW = modes[a],
+ .maxAnisotropy = 1.0,
+ };
+
+ VK(vk->CreateSampler(vk->dev, &sinfo, PL_VK_ALLOC, &p->samplers[s][a]));
+ }
+ }
+
+ return pl_gpu_finalize(gpu);
+
+error:
+ vk_gpu_destroy(gpu);
+ return NULL;
+}
+
+static void vk_sync_destroy(pl_gpu gpu, pl_sync sync)
+{
+ if (!sync)
+ return;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+
+#ifdef PL_HAVE_UNIX
+ if (sync->handle_type == PL_HANDLE_FD) {
+ if (sync->wait_handle.fd > -1)
+ close(sync->wait_handle.fd);
+ if (sync->signal_handle.fd > -1)
+ close(sync->signal_handle.fd);
+ }
+#endif
+#ifdef PL_HAVE_WIN32
+ if (sync->handle_type == PL_HANDLE_WIN32) {
+ if (sync->wait_handle.handle != NULL)
+ CloseHandle(sync->wait_handle.handle);
+ if (sync->signal_handle.handle != NULL)
+ CloseHandle(sync->signal_handle.handle);
+ }
+ // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
+#endif
+
+ vk->DestroySemaphore(vk->dev, sync_vk->wait, PL_VK_ALLOC);
+ vk->DestroySemaphore(vk->dev, sync_vk->signal, PL_VK_ALLOC);
+
+ pl_free((void *) sync);
+}
+
+void vk_sync_deref(pl_gpu gpu, pl_sync sync)
+{
+ if (!sync)
+ return;
+
+ struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+ if (pl_rc_deref(&sync_vk->rc))
+ vk_sync_destroy(gpu, sync);
+}
+
+static pl_sync vk_sync_create(pl_gpu gpu, enum pl_handle_type handle_type)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ struct pl_sync_t *sync = pl_zalloc_obj(NULL, sync, struct pl_sync_vk);
+ sync->handle_type = handle_type;
+
+ struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+ pl_rc_init(&sync_vk->rc);
+
+ VkExportSemaphoreCreateInfoKHR einfo = {
+ .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR,
+ .handleTypes = vk_sync_handle_type(handle_type),
+ };
+
+ switch (handle_type) {
+ case PL_HANDLE_FD:
+ sync->wait_handle.fd = -1;
+ sync->signal_handle.fd = -1;
+ break;
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ sync->wait_handle.handle = NULL;
+ sync->signal_handle.handle = NULL;
+ break;
+ case PL_HANDLE_DMA_BUF:
+ case PL_HANDLE_HOST_PTR:
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ pl_unreachable();
+ }
+
+ const VkSemaphoreCreateInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+ .pNext = &einfo,
+ };
+
+ VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->wait));
+ VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->signal));
+ PL_VK_NAME(SEMAPHORE, sync_vk->wait, "sync wait");
+ PL_VK_NAME(SEMAPHORE, sync_vk->signal, "sync signal");
+
+#ifdef PL_HAVE_UNIX
+ if (handle_type == PL_HANDLE_FD) {
+ VkSemaphoreGetFdInfoKHR finfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
+ .semaphore = sync_vk->wait,
+ .handleType = einfo.handleTypes,
+ };
+
+ VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->wait_handle.fd));
+
+ finfo.semaphore = sync_vk->signal;
+ VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->signal_handle.fd));
+ }
+#endif
+
+#ifdef PL_HAVE_WIN32
+ if (handle_type == PL_HANDLE_WIN32 ||
+ handle_type == PL_HANDLE_WIN32_KMT)
+ {
+ VkSemaphoreGetWin32HandleInfoKHR handle_info = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR,
+ .semaphore = sync_vk->wait,
+ .handleType = einfo.handleTypes,
+ };
+
+ VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info,
+ &sync->wait_handle.handle));
+
+ handle_info.semaphore = sync_vk->signal;
+ VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info,
+ &sync->signal_handle.handle));
+ }
+#endif
+
+ return sync;
+
+error:
+ vk_sync_destroy(gpu, sync);
+ return NULL;
+}
+
+void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore)
+{
+ VkSemaphore sem = *semaphore;
+ if (!sem)
+ return;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ vk->DestroySemaphore(vk->dev, sem, PL_VK_ALLOC);
+ *semaphore = VK_NULL_HANDLE;
+}
+
+VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ pl_assert(PL_ISPOT(params->export_handle));
+ if ((params->export_handle & gpu->export_caps.sync) != params->export_handle) {
+ PL_ERR(gpu, "Invalid handle type 0x%"PRIx64" specified for "
+ "`pl_vulkan_sem_create`!", (uint64_t) params->export_handle);
+ return VK_NULL_HANDLE;
+ }
+
+ switch (params->export_handle) {
+ case PL_HANDLE_FD:
+ params->out_handle->fd = -1;
+ break;
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ params->out_handle->handle = NULL;
+ break;
+ case PL_HANDLE_DMA_BUF:
+ case PL_HANDLE_HOST_PTR:
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ pl_unreachable();
+ }
+
+ const VkExportSemaphoreCreateInfoKHR einfo = {
+ .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR,
+ .handleTypes = vk_sync_handle_type(params->export_handle),
+ };
+
+ const VkSemaphoreTypeCreateInfo stinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+ .pNext = params->export_handle ? &einfo : NULL,
+ .semaphoreType = params->type,
+ .initialValue = params->initial_value,
+ };
+
+ const VkSemaphoreCreateInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+ .pNext = &stinfo,
+ };
+
+ VkSemaphore sem = VK_NULL_HANDLE;
+ VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sem));
+ PL_VK_NAME(SEMAPHORE, sem, PL_DEF(params->debug_tag, "pl_vulkan_sem"));
+
+#ifdef PL_HAVE_UNIX
+ if (params->export_handle == PL_HANDLE_FD) {
+ VkSemaphoreGetFdInfoKHR finfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
+ .handleType = einfo.handleTypes,
+ .semaphore = sem,
+ };
+
+ VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &params->out_handle->fd));
+ }
+#endif
+
+#ifdef PL_HAVE_WIN32
+ if (params->export_handle == PL_HANDLE_WIN32 ||
+ params->export_handle == PL_HANDLE_WIN32_KMT)
+ {
+ VkSemaphoreGetWin32HandleInfoKHR handle_info = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR,
+ .handleType = einfo.handleTypes,
+ .semaphore = sem,
+ };
+
+ VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info,
+ &params->out_handle->handle));
+ }
+#endif
+
+ return sem;
+
+error:
+#ifdef PL_HAVE_UNIX
+ if (params->export_handle == PL_HANDLE_FD) {
+ if (params->out_handle->fd > -1)
+ close(params->out_handle->fd);
+ }
+#endif
+#ifdef PL_HAVE_WIN32
+ if (params->export_handle == PL_HANDLE_WIN32) {
+ if (params->out_handle->handle != NULL)
+ CloseHandle(params->out_handle->handle);
+ }
+ // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
+#endif
+ vk->DestroySemaphore(vk->dev, sem, PL_VK_ALLOC);
+ return VK_NULL_HANDLE;
+}
+
+static void vk_gpu_flush(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ CMD_SUBMIT(NULL);
+ vk_rotate_queues(vk);
+ vk_malloc_garbage_collect(vk->ma);
+}
+
+static void vk_gpu_finish(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ CMD_SUBMIT(NULL);
+ vk_wait_idle(vk);
+}
+
+static bool vk_gpu_is_failed(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ return vk->failed;
+}
+
+struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ pl_mutex_lock(&p->recording);
+ struct vk_cmd *cmd = p->cmd;
+ p->cmd = NULL;
+ pl_mutex_unlock(&p->recording);
+
+ struct vk_cmdpool *pool = vk->pool_graphics;
+ if (!cmd || cmd->pool != pool) {
+ vk_cmd_submit(&cmd);
+ cmd = vk_cmd_begin(pool, NULL);
+ }
+
+ return cmd;
+}
+
+void pl_vk_print_heap(pl_gpu gpu, enum pl_log_level lev)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ vk_malloc_print_stats(vk->ma, lev);
+}
+
+static const struct pl_gpu_fns pl_fns_vk = {
+ .destroy = vk_gpu_destroy,
+ .tex_create = vk_tex_create,
+ .tex_destroy = vk_tex_deref,
+ .tex_invalidate = vk_tex_invalidate,
+ .tex_clear_ex = vk_tex_clear_ex,
+ .tex_blit = vk_tex_blit,
+ .tex_upload = vk_tex_upload,
+ .tex_download = vk_tex_download,
+ .tex_poll = vk_tex_poll,
+ .tex_export = vk_tex_export,
+ .buf_create = vk_buf_create,
+ .buf_destroy = vk_buf_deref,
+ .buf_write = vk_buf_write,
+ .buf_read = vk_buf_read,
+ .buf_copy = vk_buf_copy,
+ .buf_export = vk_buf_export,
+ .buf_poll = vk_buf_poll,
+ .desc_namespace = vk_desc_namespace,
+ .pass_create = vk_pass_create,
+ .pass_destroy = vk_pass_destroy,
+ .pass_run = vk_pass_run,
+ .sync_create = vk_sync_create,
+ .sync_destroy = vk_sync_deref,
+ .timer_create = vk_timer_create,
+ .timer_destroy = vk_timer_destroy,
+ .timer_query = vk_timer_query,
+ .gpu_flush = vk_gpu_flush,
+ .gpu_finish = vk_gpu_finish,
+ .gpu_is_failed = vk_gpu_is_failed,
+};
diff --git a/src/vulkan/gpu.h b/src/vulkan/gpu.h
new file mode 100644
index 0000000..041de13
--- /dev/null
+++ b/src/vulkan/gpu.h
@@ -0,0 +1,175 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+#include "command.h"
+#include "formats.h"
+#include "malloc.h"
+#include "utils.h"
+
+#include "../gpu.h"
+#include "../glsl/spirv.h"
+#include "../pl_thread.h"
+
+pl_gpu pl_gpu_create_vk(struct vk_ctx *vk);
+
+// This function takes the current graphics command and steals it from the
+// GPU, so the caller can do custom vk_cmd_ calls on it. The caller should
+// submit it as well.
+struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu);
+
+// Print memory usage statistics
+void pl_vk_print_heap(pl_gpu, enum pl_log_level);
+
+// --- pl_gpu internal structs and helpers
+
+struct pl_fmt_vk {
+ const struct vk_format *vk_fmt;
+ bool blit_emulated;
+};
+
+enum queue_type {
+ GRAPHICS,
+ COMPUTE,
+ TRANSFER,
+ ANY,
+};
+
+struct pl_vk {
+ struct pl_gpu_fns impl;
+ struct vk_ctx *vk;
+ pl_spirv spirv;
+
+ // Some additional cached device limits and features checks
+ uint32_t max_push_descriptors;
+ size_t min_texel_alignment;
+
+ // The "currently recording" command. This will be queued and replaced by
+ // a new command every time we need to "switch" between queue families.
+ pl_mutex recording;
+ struct vk_cmd *cmd;
+ pl_timer cmd_timer;
+
+ // Array of VkSamplers for every combination of sample/address modes
+ VkSampler samplers[PL_TEX_SAMPLE_MODE_COUNT][PL_TEX_ADDRESS_MODE_COUNT];
+
+ // To avoid spamming warnings
+ bool warned_modless;
+};
+
+struct vk_cmd *_begin_cmd(pl_gpu, enum queue_type, const char *label, pl_timer);
+bool _end_cmd(pl_gpu, struct vk_cmd **, bool submit);
+
+#define CMD_BEGIN(type) _begin_cmd(gpu, type, __func__, NULL)
+#define CMD_BEGIN_TIMED(type, timer) _begin_cmd(gpu, type, __func__, timer)
+#define CMD_FINISH(cmd) _end_cmd(gpu, cmd, false)
+#define CMD_SUBMIT(cmd) _end_cmd(gpu, cmd, true)
+
+// Helper to fire a callback the next time the `pl_gpu` is in an idle state
+//
+// Use this instead of `vk_dev_callback` when you need to clean up after
+// resources that might possibly still be in use by the `pl_gpu` at the time of
+// creating the callback.
+void vk_gpu_idle_callback(pl_gpu, vk_cb, const void *priv, const void *arg);
+
+struct pl_tex_vk {
+ pl_rc_t rc;
+ bool external_img;
+ enum queue_type transfer_queue;
+ VkImageType type;
+ VkImage img;
+ VkImageAspectFlags aspect;
+ struct vk_memslice mem;
+ // cached properties
+ VkFormat img_fmt;
+ VkImageUsageFlags usage_flags;
+ // for sampling
+ VkImageView view;
+ // for rendering
+ VkFramebuffer framebuffer;
+ // for vk_tex_upload/download fallback code
+ pl_fmt texel_fmt;
+ // for planar textures (as a convenience)
+ int num_planes;
+ struct pl_tex_vk *planes[4];
+
+ // synchronization and current state (planes only)
+ struct vk_sem sem;
+ VkImageLayout layout;
+ PL_ARRAY(pl_vulkan_sem) ext_deps; // external semaphore, not owned by the pl_tex
+ pl_sync ext_sync; // indicates an exported image
+ uint32_t qf; // last queue family to access this texture (for barriers)
+ bool may_invalidate;
+ bool held;
+};
+
+pl_tex vk_tex_create(pl_gpu, const struct pl_tex_params *);
+void vk_tex_deref(pl_gpu, pl_tex);
+void vk_tex_invalidate(pl_gpu, pl_tex);
+void vk_tex_clear_ex(pl_gpu, pl_tex, const union pl_clear_color);
+void vk_tex_blit(pl_gpu, const struct pl_tex_blit_params *);
+bool vk_tex_upload(pl_gpu, const struct pl_tex_transfer_params *);
+bool vk_tex_download(pl_gpu, const struct pl_tex_transfer_params *);
+bool vk_tex_poll(pl_gpu, pl_tex, uint64_t timeout);
+bool vk_tex_export(pl_gpu, pl_tex, pl_sync);
+void vk_tex_barrier(pl_gpu, struct vk_cmd *, pl_tex, VkPipelineStageFlags2,
+ VkAccessFlags2, VkImageLayout, uint32_t qf);
+
+struct pl_buf_vk {
+ pl_rc_t rc;
+ struct vk_memslice mem;
+ enum queue_type update_queue;
+ VkBufferView view; // for texel buffers
+
+ // synchronization and current state
+ struct vk_sem sem;
+ bool exported;
+ bool needs_flush;
+};
+
+pl_buf vk_buf_create(pl_gpu, const struct pl_buf_params *);
+void vk_buf_deref(pl_gpu, pl_buf);
+void vk_buf_write(pl_gpu, pl_buf, size_t offset, const void *src, size_t size);
+bool vk_buf_read(pl_gpu, pl_buf, size_t offset, void *dst, size_t size);
+void vk_buf_copy(pl_gpu, pl_buf dst, size_t dst_offset,
+ pl_buf src, size_t src_offset, size_t size);
+bool vk_buf_export(pl_gpu, pl_buf);
+bool vk_buf_poll(pl_gpu, pl_buf, uint64_t timeout);
+
+// Helper to ease buffer barrier creation. (`offset` is relative to pl_buf)
+void vk_buf_barrier(pl_gpu, struct vk_cmd *, pl_buf, VkPipelineStageFlags2,
+ VkAccessFlags2, size_t offset, size_t size, bool export);
+
+// Flush visible writes to a buffer made by the API
+void vk_buf_flush(pl_gpu, struct vk_cmd *, pl_buf, size_t offset, size_t size);
+
+struct pl_pass_vk;
+
+int vk_desc_namespace(pl_gpu, enum pl_desc_type);
+pl_pass vk_pass_create(pl_gpu, const struct pl_pass_params *);
+void vk_pass_destroy(pl_gpu, pl_pass);
+void vk_pass_run(pl_gpu, const struct pl_pass_run_params *);
+
+struct pl_sync_vk {
+ pl_rc_t rc;
+ VkSemaphore wait;
+ VkSemaphore signal;
+};
+
+void vk_sync_deref(pl_gpu, pl_sync);
diff --git a/src/vulkan/gpu_buf.c b/src/vulkan/gpu_buf.c
new file mode 100644
index 0000000..2f317bc
--- /dev/null
+++ b/src/vulkan/gpu_buf.c
@@ -0,0 +1,470 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+
+void vk_buf_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
+ VkPipelineStageFlags2 stage, VkAccessFlags2 access,
+ size_t offset, size_t size, bool export)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_assert(!export || !buf_vk->exported); // can't re-export exported buffers
+ pl_rc_ref(&buf_vk->rc);
+
+ bool needs_flush = buf_vk->needs_flush || buf->params.host_mapped ||
+ buf->params.import_handle == PL_HANDLE_HOST_PTR;
+ bool noncoherent = buf_vk->mem.data && !buf_vk->mem.coherent;
+ if (needs_flush && noncoherent) {
+ VK(vk->FlushMappedMemoryRanges(vk->dev, 1, &(struct VkMappedMemoryRange) {
+ .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+ .memory = buf_vk->mem.vkmem,
+ .offset = buf_vk->mem.map_offset,
+ .size = buf_vk->mem.map_size,
+ }));
+
+ // Just ignore errors, not much we can do about them other than
+ // logging them and moving on...
+ error: ;
+ }
+
+ struct vk_sync_scope last;
+ last = vk_sem_barrier(cmd, &buf_vk->sem, stage, access, export);
+
+ // CONCURRENT buffers require transitioning to/from IGNORED, EXCLUSIVE
+ // buffers require transitioning to/from the concrete QF index
+ uint32_t qf = vk->pools.num > 1 ? VK_QUEUE_FAMILY_IGNORED : cmd->pool->qf;
+ uint32_t src_qf = buf_vk->exported ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
+ uint32_t dst_qf = export ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
+
+ if (last.access || src_qf != dst_qf) {
+ vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .bufferMemoryBarrierCount = 1,
+ .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = last.stage,
+ .srcAccessMask = last.access,
+ .dstStageMask = stage,
+ .dstAccessMask = access,
+ .srcQueueFamilyIndex = src_qf,
+ .dstQueueFamilyIndex = dst_qf,
+ .buffer = buf_vk->mem.buf,
+ .offset = buf_vk->mem.offset + offset,
+ .size = size,
+ },
+ });
+ }
+
+ buf_vk->needs_flush = false;
+ buf_vk->exported = export;
+ vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, gpu, buf);
+}
+
+void vk_buf_deref(pl_gpu gpu, pl_buf buf)
+{
+ if (!buf)
+ return;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ if (pl_rc_deref(&buf_vk->rc)) {
+ vk->DestroyBufferView(vk->dev, buf_vk->view, PL_VK_ALLOC);
+ vk_malloc_free(vk->ma, &buf_vk->mem);
+ pl_free((void *) buf);
+ }
+}
+
+pl_buf vk_buf_create(pl_gpu gpu, const struct pl_buf_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_vk);
+ buf->params = *params;
+ buf->params.initial_data = NULL;
+
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_rc_init(&buf_vk->rc);
+
+ struct vk_malloc_params mparams = {
+ .reqs = {
+ .size = PL_ALIGN2(params->size, 4), // for vk_buf_write
+ .memoryTypeBits = UINT32_MAX,
+ .alignment = 1,
+ },
+ // these are always set, because `vk_buf_copy` can always be used
+ .buf_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+ VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+ .export_handle = params->export_handle,
+ .import_handle = params->import_handle,
+ .shared_mem = params->shared_mem,
+ .debug_tag = params->debug_tag,
+ };
+
+ // Mandatory/optimal buffer offset alignment
+ VkDeviceSize *align = &mparams.reqs.alignment;
+ VkDeviceSize extra_align = vk->props.limits.optimalBufferCopyOffsetAlignment;
+
+ // Try and align all buffers to the minimum texel alignment, to make sure
+ // tex_upload/tex_download always gets aligned buffer copies if possible
+ extra_align = pl_lcm(extra_align, p->min_texel_alignment);
+
+ enum pl_buf_mem_type mem_type = params->memory_type;
+ bool is_texel = false;
+
+ if (params->uniform) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+ *align = pl_lcm(*align, vk->props.limits.minUniformBufferOffsetAlignment);
+ mem_type = PL_BUF_MEM_DEVICE;
+ if (params->format) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
+ is_texel = true;
+ }
+ }
+
+ if (params->storable) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+ *align = pl_lcm(*align, vk->props.limits.minStorageBufferOffsetAlignment);
+ buf_vk->update_queue = COMPUTE;
+ mem_type = PL_BUF_MEM_DEVICE;
+ if (params->format) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
+ is_texel = true;
+ }
+ }
+
+ if (is_texel) {
+ *align = pl_lcm(*align, vk->props.limits.minTexelBufferOffsetAlignment);
+ *align = pl_lcm(*align, params->format->texel_size);
+ }
+
+ if (params->drawable) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
+ VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
+ mem_type = PL_BUF_MEM_DEVICE;
+ }
+
+ if (params->host_writable || params->initial_data) {
+ // Buffers should be written using mapped memory if possible
+ mparams.optimal = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+ // Use the transfer queue for updates on very large buffers (1 MB)
+ if (params->size > 1024*1024)
+ buf_vk->update_queue = TRANSFER;
+ }
+
+ if (params->host_mapped || params->host_readable) {
+ mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+
+ if (params->size > 1024) {
+ // Prefer cached memory for large buffers (1 kB) which may be read
+ // from, because uncached reads are extremely slow
+ mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+ }
+ }
+
+ switch (mem_type) {
+ case PL_BUF_MEM_AUTO:
+ // We generally prefer VRAM since it's faster than RAM, but any number
+ // of other requirements could potentially exclude it, so just mark it
+ // as optimal by default.
+ if (!(mparams.optimal & VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
+ mparams.optimal |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+ break;
+ case PL_BUF_MEM_DEVICE:
+ // Force device local memory.
+ mparams.required |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+ break;
+ case PL_BUF_MEM_HOST:
+ // This isn't a true guarantee, but actually trying to restrict the
+ // device-local bit locks out all memory heaps on iGPUs. Requiring
+ // the memory be host-mapped is the easiest compromise.
+ mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+ mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+ break;
+ case PL_BUF_MEM_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ if (params->import_handle) {
+ size_t offset = params->shared_mem.offset;
+ if (PL_ALIGN(offset, *align) != offset) {
+ PL_ERR(gpu, "Imported memory offset %zu violates minimum alignment "
+ "requirement of enabled usage flags (%zu)!",
+ offset, (size_t) *align);
+ goto error;
+ }
+ } else {
+ *align = pl_lcm(*align, extra_align);
+ }
+
+ if (!vk_malloc_slice(vk->ma, &buf_vk->mem, &mparams))
+ goto error;
+
+ if (params->host_mapped)
+ buf->data = buf_vk->mem.data;
+
+ if (params->export_handle) {
+ buf->shared_mem = buf_vk->mem.shared_mem;
+ buf->shared_mem.drm_format_mod = DRM_FORMAT_MOD_LINEAR;
+ buf_vk->exported = true;
+ }
+
+ if (is_texel) {
+ struct pl_fmt_vk *fmtp = PL_PRIV(params->format);
+ VkBufferViewCreateInfo vinfo = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
+ .buffer = buf_vk->mem.buf,
+ .format = PL_DEF(fmtp->vk_fmt->bfmt, fmtp->vk_fmt->tfmt),
+ .offset = buf_vk->mem.offset,
+ .range = buf_vk->mem.size,
+ };
+
+ VK(vk->CreateBufferView(vk->dev, &vinfo, PL_VK_ALLOC, &buf_vk->view));
+ PL_VK_NAME(BUFFER_VIEW, buf_vk->view, PL_DEF(params->debug_tag, "texel"));
+ }
+
+ if (params->initial_data)
+ vk_buf_write(gpu, buf, 0, params->initial_data, params->size);
+
+ return buf;
+
+error:
+ vk_buf_deref(gpu, buf);
+ return NULL;
+}
+
+static void invalidate_buf(pl_gpu gpu, pl_buf buf)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ if (buf_vk->mem.data && !buf_vk->mem.coherent) {
+ VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
+ .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+ .memory = buf_vk->mem.vkmem,
+ .offset = buf_vk->mem.map_offset,
+ .size = buf_vk->mem.map_size,
+ }));
+ }
+
+ // Ignore errors (after logging), nothing useful we can do anyway
+error: ;
+ vk_buf_deref(gpu, buf);
+}
+
+void vk_buf_flush(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
+ size_t offset, size_t size)
+{
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // We need to perform a flush if the host is capable of reading back from
+ // the buffer, or if we intend to overwrite it using mapped memory
+ bool can_read = buf->params.host_readable;
+ bool can_write = buf_vk->mem.data && buf->params.host_writable;
+ if (buf->params.host_mapped || buf->params.import_handle == PL_HANDLE_HOST_PTR)
+ can_read = can_write = true;
+
+ if (!can_read && !can_write)
+ return;
+
+ vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .bufferMemoryBarrierCount = 1,
+ .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = buf_vk->sem.write.stage,
+ .srcAccessMask = buf_vk->sem.write.access,
+ .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
+ .dstAccessMask = (can_read ? VK_ACCESS_2_HOST_READ_BIT : 0)
+ | (can_write ? VK_ACCESS_2_HOST_WRITE_BIT : 0),
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = buf_vk->mem.buf,
+ .offset = buf_vk->mem.offset + offset,
+ .size = size,
+ },
+ });
+
+ // We need to hold on to the buffer until this barrier completes
+ vk_cmd_callback(cmd, (vk_cb) invalidate_buf, gpu, buf);
+ pl_rc_ref(&buf_vk->rc);
+}
+
+bool vk_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // Opportunistically check if we can re-use this buffer without flush
+ vk_poll_commands(vk, 0);
+ if (pl_rc_count(&buf_vk->rc) == 1)
+ return false;
+
+ // Otherwise, we're force to submit any queued command so that the
+ // user is guaranteed to see progress eventually, even if they call
+ // this in a tight loop
+ CMD_SUBMIT(NULL);
+ vk_poll_commands(vk, timeout);
+
+ return pl_rc_count(&buf_vk->rc) > 1;
+}
+
+void vk_buf_write(pl_gpu gpu, pl_buf buf, size_t offset,
+ const void *data, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // For host-mapped buffers, we can just directly memcpy the buffer contents.
+ // Otherwise, we can update the buffer from the GPU using a command buffer.
+ if (buf_vk->mem.data) {
+ // ensure no queued operations
+ while (vk_buf_poll(gpu, buf, UINT64_MAX))
+ ; // do nothing
+
+ uintptr_t addr = (uintptr_t) buf_vk->mem.data + offset;
+ memcpy((void *) addr, data, size);
+ buf_vk->needs_flush = true;
+ } else {
+ struct vk_cmd *cmd = CMD_BEGIN(buf_vk->update_queue);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed updating buffer!");
+ return;
+ }
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, offset, size, false);
+
+ // Vulkan requires `size` to be a multiple of 4, so we need to make
+ // sure to handle the end separately if the original data is not
+ const size_t max_transfer = 64 * 1024;
+ size_t size_rem = size % 4;
+ size_t size_base = size - size_rem;
+ VkDeviceSize buf_offset = buf_vk->mem.offset + offset;
+
+ if (size_base > max_transfer) {
+ PL_TRACE(gpu, "Using multiple vkCmdUpdateBuffer calls to upload "
+ "large buffer. Consider using buffer-buffer transfers "
+ "instead!");
+ }
+
+ for (size_t xfer = 0; xfer < size_base; xfer += max_transfer) {
+ vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf,
+ buf_offset + xfer,
+ PL_MIN(size_base, max_transfer),
+ (void *) ((uint8_t *) data + xfer));
+ }
+
+ if (size_rem) {
+ uint8_t tail[4] = {0};
+ memcpy(tail, data, size_rem);
+ vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, buf_offset + size_base,
+ sizeof(tail), tail);
+ }
+
+ pl_assert(!buf->params.host_readable); // no flush needed due to this
+ CMD_FINISH(&cmd);
+ }
+}
+
+bool vk_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_assert(buf_vk->mem.data);
+
+ if (vk_buf_poll(gpu, buf, 0) && buf_vk->sem.write.sync.sem) {
+ // ensure no more queued writes
+ VK(vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+ .semaphoreCount = 1,
+ .pSemaphores = &buf_vk->sem.write.sync.sem,
+ .pValues = &buf_vk->sem.write.sync.value,
+ }, UINT64_MAX));
+
+ // process callbacks
+ vk_poll_commands(vk, 0);
+ }
+
+ uintptr_t addr = (uintptr_t) buf_vk->mem.data + (size_t) offset;
+ memcpy(dest, (void *) addr, size);
+ return true;
+
+error:
+ return false;
+}
+
+void vk_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
+ pl_buf src, size_t src_offset, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *dst_vk = PL_PRIV(dst);
+ struct pl_buf_vk *src_vk = PL_PRIV(src);
+
+ struct vk_cmd *cmd = CMD_BEGIN(dst_vk->update_queue);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed copying buffer!");
+ return;
+ }
+
+ vk_buf_barrier(gpu, cmd, dst, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, dst_offset, size, false);
+ vk_buf_barrier(gpu, cmd, src, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT, src_offset, size, false);
+
+ VkBufferCopy region = {
+ .srcOffset = src_vk->mem.offset + src_offset,
+ .dstOffset = dst_vk->mem.offset + dst_offset,
+ .size = size,
+ };
+
+ vk->CmdCopyBuffer(cmd->buf, src_vk->mem.buf, dst_vk->mem.buf,
+ 1, &region);
+
+ vk_buf_flush(gpu, cmd, dst, dst_offset, size);
+ CMD_FINISH(&cmd);
+}
+
+bool vk_buf_export(pl_gpu gpu, pl_buf buf)
+{
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ if (buf_vk->exported)
+ return true;
+
+ struct vk_cmd *cmd = CMD_BEGIN(ANY);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed exporting buffer!");
+ return false;
+ }
+
+ // For the queue family ownership transfer, we can ignore all pipeline
+ // stages since the synchronization via fences/semaphores is required
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_NONE, 0, 0,
+ buf->params.size, true);
+
+
+ return CMD_SUBMIT(&cmd);
+}
diff --git a/src/vulkan/gpu_pass.c b/src/vulkan/gpu_pass.c
new file mode 100644
index 0000000..5ffe77d
--- /dev/null
+++ b/src/vulkan/gpu_pass.c
@@ -0,0 +1,964 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+#include "cache.h"
+#include "glsl/spirv.h"
+
+// For pl_pass.priv
+struct pl_pass_vk {
+ // Pipeline / render pass
+ VkPipeline base;
+ VkPipeline pipe;
+ VkPipelineLayout pipeLayout;
+ VkRenderPass renderPass;
+ // Descriptor set (bindings)
+ bool use_pushd;
+ VkDescriptorSetLayout dsLayout;
+ VkDescriptorPool dsPool;
+ // To keep track of which descriptor sets are and aren't available, we
+ // allocate a fixed number and use a bitmask of all available sets.
+ VkDescriptorSet dss[16];
+ uint16_t dmask;
+
+ // For recompilation
+ VkVertexInputAttributeDescription *attrs;
+ VkPipelineCache cache;
+ VkShaderModule vert;
+ VkShaderModule shader;
+
+ // For updating
+ VkWriteDescriptorSet *dswrite;
+ VkDescriptorImageInfo *dsiinfo;
+ VkDescriptorBufferInfo *dsbinfo;
+ VkSpecializationInfo specInfo;
+ size_t spec_size;
+};
+
+int vk_desc_namespace(pl_gpu gpu, enum pl_desc_type type)
+{
+ return 0;
+}
+
+static void pass_destroy_cb(pl_gpu gpu, pl_pass pass)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+
+ vk->DestroyPipeline(vk->dev, pass_vk->pipe, PL_VK_ALLOC);
+ vk->DestroyPipeline(vk->dev, pass_vk->base, PL_VK_ALLOC);
+ vk->DestroyRenderPass(vk->dev, pass_vk->renderPass, PL_VK_ALLOC);
+ vk->DestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, PL_VK_ALLOC);
+ vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC);
+ vk->DestroyDescriptorPool(vk->dev, pass_vk->dsPool, PL_VK_ALLOC);
+ vk->DestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, PL_VK_ALLOC);
+ vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC);
+ vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC);
+
+ pl_free((void *) pass);
+}
+
+void vk_pass_destroy(pl_gpu gpu, pl_pass pass)
+{
+ vk_gpu_idle_callback(gpu, (vk_cb) pass_destroy_cb, gpu, pass);
+}
+
+static const VkDescriptorType dsType[] = {
+ [PL_DESC_SAMPLED_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+ [PL_DESC_STORAGE_IMG] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ [PL_DESC_BUF_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+ [PL_DESC_BUF_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ [PL_DESC_BUF_TEXEL_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+ [PL_DESC_BUF_TEXEL_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER,
+};
+
+static VkResult vk_compile_glsl(pl_gpu gpu, void *alloc,
+ enum glsl_shader_stage stage,
+ const char *shader,
+ pl_cache_obj *out_spirv)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ pl_cache cache = pl_gpu_cache(gpu);
+ uint64_t key = CACHE_KEY_SPIRV;
+ if (cache) { // skip computing key if `cache
+ pl_hash_merge(&key, p->spirv->signature);
+ pl_hash_merge(&key, pl_str0_hash(shader));
+ out_spirv->key = key;
+ if (pl_cache_get(cache, out_spirv)) {
+ PL_DEBUG(gpu, "Re-using cached SPIR-V object 0x%"PRIx64, key);
+ return VK_SUCCESS;
+ }
+ }
+
+ pl_clock_t start = pl_clock_now();
+ pl_str spirv = pl_spirv_compile_glsl(p->spirv, alloc, gpu->glsl, stage, shader);
+ pl_log_cpu_time(gpu->log, start, pl_clock_now(), "translating SPIR-V");
+ out_spirv->data = spirv.buf;
+ out_spirv->size = spirv.len;
+ out_spirv->free = pl_free;
+ return spirv.len ? VK_SUCCESS : VK_ERROR_INITIALIZATION_FAILED;
+}
+
+static const VkShaderStageFlags stageFlags[] = {
+ [PL_PASS_RASTER] = VK_SHADER_STAGE_FRAGMENT_BIT |
+ VK_SHADER_STAGE_VERTEX_BIT,
+ [PL_PASS_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT,
+};
+
+static void destroy_pipeline(struct vk_ctx *vk, void *pipeline)
+{
+ vk->DestroyPipeline(vk->dev, vk_unwrap_handle(pipeline), PL_VK_ALLOC);
+}
+
+static VkResult vk_recreate_pipelines(struct vk_ctx *vk, pl_pass pass,
+ bool derivable, VkPipeline base,
+ VkPipeline *out_pipe)
+{
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+ const struct pl_pass_params *params = &pass->params;
+
+ // The old pipeline might still be in use, so we have to destroy it
+ // asynchronously with a device idle callback
+ if (*out_pipe) {
+ // We don't need to use `vk_gpu_idle_callback` because the only command
+ // that can access a VkPipeline, `vk_pass_run`, always flushes `p->cmd`.
+ vk_dev_callback(vk, (vk_cb) destroy_pipeline, vk, vk_wrap_handle(*out_pipe));
+ *out_pipe = VK_NULL_HANDLE;
+ }
+
+ VkPipelineCreateFlags flags = 0;
+ if (derivable)
+ flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT;
+ if (base)
+ flags |= VK_PIPELINE_CREATE_DERIVATIVE_BIT;
+
+ const VkSpecializationInfo *specInfo = &pass_vk->specInfo;
+ if (!specInfo->dataSize)
+ specInfo = NULL;
+
+ switch (params->type) {
+ case PL_PASS_RASTER: {
+ static const VkBlendFactor blendFactors[] = {
+ [PL_BLEND_ZERO] = VK_BLEND_FACTOR_ZERO,
+ [PL_BLEND_ONE] = VK_BLEND_FACTOR_ONE,
+ [PL_BLEND_SRC_ALPHA] = VK_BLEND_FACTOR_SRC_ALPHA,
+ [PL_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA,
+ };
+
+ VkPipelineColorBlendAttachmentState blendState = {
+ .colorBlendOp = VK_BLEND_OP_ADD,
+ .alphaBlendOp = VK_BLEND_OP_ADD,
+ .colorWriteMask = VK_COLOR_COMPONENT_R_BIT |
+ VK_COLOR_COMPONENT_G_BIT |
+ VK_COLOR_COMPONENT_B_BIT |
+ VK_COLOR_COMPONENT_A_BIT,
+ };
+
+ const struct pl_blend_params *blend = params->blend_params;
+ if (blend) {
+ blendState.blendEnable = true;
+ blendState.srcColorBlendFactor = blendFactors[blend->src_rgb];
+ blendState.dstColorBlendFactor = blendFactors[blend->dst_rgb];
+ blendState.srcAlphaBlendFactor = blendFactors[blend->src_alpha];
+ blendState.dstAlphaBlendFactor = blendFactors[blend->dst_alpha];
+ }
+
+ static const VkPrimitiveTopology topologies[PL_PRIM_TYPE_COUNT] = {
+ [PL_PRIM_TRIANGLE_LIST] = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
+ [PL_PRIM_TRIANGLE_STRIP] = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
+ };
+
+ VkGraphicsPipelineCreateInfo cinfo = {
+ .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+ .flags = flags,
+ .stageCount = 2,
+ .pStages = (VkPipelineShaderStageCreateInfo[]) {
+ {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_VERTEX_BIT,
+ .module = pass_vk->vert,
+ .pName = "main",
+ }, {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
+ .module = pass_vk->shader,
+ .pName = "main",
+ .pSpecializationInfo = specInfo,
+ }
+ },
+ .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+ .vertexBindingDescriptionCount = 1,
+ .pVertexBindingDescriptions = &(VkVertexInputBindingDescription) {
+ .binding = 0,
+ .stride = params->vertex_stride,
+ .inputRate = VK_VERTEX_INPUT_RATE_VERTEX,
+ },
+ .vertexAttributeDescriptionCount = params->num_vertex_attribs,
+ .pVertexAttributeDescriptions = pass_vk->attrs,
+ },
+ .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+ .topology = topologies[params->vertex_type],
+ },
+ .pViewportState = &(VkPipelineViewportStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+ .viewportCount = 1,
+ .scissorCount = 1,
+ },
+ .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+ .polygonMode = VK_POLYGON_MODE_FILL,
+ .cullMode = VK_CULL_MODE_NONE,
+ .lineWidth = 1.0f,
+ },
+ .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+ .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
+ },
+ .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+ .attachmentCount = 1,
+ .pAttachments = &blendState,
+ },
+ .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+ .dynamicStateCount = 2,
+ .pDynamicStates = (VkDynamicState[]){
+ VK_DYNAMIC_STATE_VIEWPORT,
+ VK_DYNAMIC_STATE_SCISSOR,
+ },
+ },
+ .layout = pass_vk->pipeLayout,
+ .renderPass = pass_vk->renderPass,
+ .basePipelineHandle = base,
+ .basePipelineIndex = -1,
+ };
+
+ return vk->CreateGraphicsPipelines(vk->dev, pass_vk->cache, 1, &cinfo,
+ PL_VK_ALLOC, out_pipe);
+ }
+
+ case PL_PASS_COMPUTE: {
+ VkComputePipelineCreateInfo cinfo = {
+ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+ .flags = flags,
+ .stage = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+ .module = pass_vk->shader,
+ .pName = "main",
+ .pSpecializationInfo = specInfo,
+ },
+ .layout = pass_vk->pipeLayout,
+ .basePipelineHandle = base,
+ .basePipelineIndex = -1,
+ };
+
+ return vk->CreateComputePipelines(vk->dev, pass_vk->cache, 1, &cinfo,
+ PL_VK_ALLOC, out_pipe);
+ }
+
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+pl_pass vk_pass_create(pl_gpu gpu, const struct pl_pass_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ bool success = false;
+
+ struct pl_pass_t *pass = pl_zalloc_obj(NULL, pass, struct pl_pass_vk);
+ pass->params = pl_pass_params_copy(pass, params);
+
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+ pass_vk->dmask = -1; // all descriptors available
+
+ // temporary allocations
+ void *tmp = pl_tmp(NULL);
+
+ int num_desc = params->num_descriptors;
+ if (!num_desc)
+ goto no_descriptors;
+ if (num_desc > vk->props.limits.maxPerStageResources) {
+ PL_ERR(gpu, "Pass with %d descriptors exceeds the maximum number of "
+ "per-stage resources %" PRIu32"!",
+ num_desc, vk->props.limits.maxPerStageResources);
+ goto error;
+ }
+
+ pass_vk->dswrite = pl_calloc(pass, num_desc, sizeof(VkWriteDescriptorSet));
+ pass_vk->dsiinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorImageInfo));
+ pass_vk->dsbinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorBufferInfo));
+
+#define NUM_DS (PL_ARRAY_SIZE(pass_vk->dss))
+
+ int dsSize[PL_DESC_TYPE_COUNT] = {0};
+ VkDescriptorSetLayoutBinding *bindings = pl_calloc_ptr(tmp, num_desc, bindings);
+
+ uint32_t max_tex = vk->props.limits.maxPerStageDescriptorSampledImages,
+ max_img = vk->props.limits.maxPerStageDescriptorStorageImages,
+ max_ubo = vk->props.limits.maxPerStageDescriptorUniformBuffers,
+ max_ssbo = vk->props.limits.maxPerStageDescriptorStorageBuffers;
+
+ uint32_t *dsLimits[PL_DESC_TYPE_COUNT] = {
+ [PL_DESC_SAMPLED_TEX] = &max_tex,
+ [PL_DESC_STORAGE_IMG] = &max_img,
+ [PL_DESC_BUF_UNIFORM] = &max_ubo,
+ [PL_DESC_BUF_STORAGE] = &max_ssbo,
+ [PL_DESC_BUF_TEXEL_UNIFORM] = &max_tex,
+ [PL_DESC_BUF_TEXEL_STORAGE] = &max_img,
+ };
+
+ for (int i = 0; i < num_desc; i++) {
+ struct pl_desc *desc = &params->descriptors[i];
+ if (!(*dsLimits[desc->type])--) {
+ PL_ERR(gpu, "Pass exceeds the maximum number of per-stage "
+ "descriptors of type %u!", (unsigned) desc->type);
+ goto error;
+ }
+
+ dsSize[desc->type]++;
+ bindings[i] = (VkDescriptorSetLayoutBinding) {
+ .binding = desc->binding,
+ .descriptorType = dsType[desc->type],
+ .descriptorCount = 1,
+ .stageFlags = stageFlags[params->type],
+ };
+ }
+
+ VkDescriptorSetLayoutCreateInfo dinfo = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+ .pBindings = bindings,
+ .bindingCount = num_desc,
+ };
+
+ if (p->max_push_descriptors && num_desc <= p->max_push_descriptors) {
+ dinfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR;
+ pass_vk->use_pushd = true;
+ } else if (p->max_push_descriptors) {
+ PL_INFO(gpu, "Pass with %d descriptors exceeds the maximum push "
+ "descriptor count (%d). Falling back to descriptor sets!",
+ num_desc, p->max_push_descriptors);
+ }
+
+ VK(vk->CreateDescriptorSetLayout(vk->dev, &dinfo, PL_VK_ALLOC,
+ &pass_vk->dsLayout));
+
+ if (!pass_vk->use_pushd) {
+ PL_ARRAY(VkDescriptorPoolSize) dsPoolSizes = {0};
+
+ for (enum pl_desc_type t = 0; t < PL_DESC_TYPE_COUNT; t++) {
+ if (dsSize[t] > 0) {
+ PL_ARRAY_APPEND(tmp, dsPoolSizes, (VkDescriptorPoolSize) {
+ .type = dsType[t],
+ .descriptorCount = dsSize[t] * NUM_DS,
+ });
+ }
+ }
+
+ if (dsPoolSizes.num) {
+ VkDescriptorPoolCreateInfo pinfo = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+ .maxSets = NUM_DS,
+ .pPoolSizes = dsPoolSizes.elem,
+ .poolSizeCount = dsPoolSizes.num,
+ };
+
+ VK(vk->CreateDescriptorPool(vk->dev, &pinfo, PL_VK_ALLOC, &pass_vk->dsPool));
+
+ VkDescriptorSetLayout layouts[NUM_DS];
+ for (int i = 0; i < NUM_DS; i++)
+ layouts[i] = pass_vk->dsLayout;
+
+ VkDescriptorSetAllocateInfo ainfo = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+ .descriptorPool = pass_vk->dsPool,
+ .descriptorSetCount = NUM_DS,
+ .pSetLayouts = layouts,
+ };
+
+ VK(vk->AllocateDescriptorSets(vk->dev, &ainfo, pass_vk->dss));
+ }
+ }
+
+no_descriptors: ;
+
+ bool has_spec = params->num_constants;
+ if (has_spec) {
+ PL_ARRAY(VkSpecializationMapEntry) entries = {0};
+ PL_ARRAY_RESIZE(pass, entries, params->num_constants);
+ size_t spec_size = 0;
+
+ for (int i = 0; i < params->num_constants; i++) {
+ const struct pl_constant *con = &params->constants[i];
+ size_t con_size = pl_var_type_size(con->type);
+ entries.elem[i] = (VkSpecializationMapEntry) {
+ .constantID = con->id,
+ .offset = con->offset,
+ .size = con_size,
+ };
+
+ size_t req_size = con->offset + con_size;
+ spec_size = PL_MAX(spec_size, req_size);
+ }
+
+ pass_vk->spec_size = spec_size;
+ pass_vk->specInfo = (VkSpecializationInfo) {
+ .mapEntryCount = params->num_constants,
+ .pMapEntries = entries.elem,
+ };
+
+ if (params->constant_data) {
+ pass_vk->specInfo.pData = pl_memdup(pass, params->constant_data, spec_size);
+ pass_vk->specInfo.dataSize = spec_size;
+ }
+ }
+
+ VkPipelineLayoutCreateInfo linfo = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+ .setLayoutCount = num_desc ? 1 : 0,
+ .pSetLayouts = &pass_vk->dsLayout,
+ .pushConstantRangeCount = params->push_constants_size ? 1 : 0,
+ .pPushConstantRanges = &(VkPushConstantRange){
+ .stageFlags = stageFlags[params->type],
+ .offset = 0,
+ .size = params->push_constants_size,
+ },
+ };
+
+ VK(vk->CreatePipelineLayout(vk->dev, &linfo, PL_VK_ALLOC,
+ &pass_vk->pipeLayout));
+
+ pl_cache_obj vert = {0}, frag = {0}, comp = {0};
+ switch (params->type) {
+ case PL_PASS_RASTER: ;
+ VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_VERTEX, params->vertex_shader, &vert));
+ VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_FRAGMENT, params->glsl_shader, &frag));
+ break;
+ case PL_PASS_COMPUTE:
+ VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_COMPUTE, params->glsl_shader, &comp));
+ break;
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ // Use hash of generated SPIR-V as key for pipeline cache
+ const pl_cache cache = pl_gpu_cache(gpu);
+ pl_cache_obj pipecache = {0};
+ if (cache) {
+ pipecache.key = CACHE_KEY_VK_PIPE;
+ pl_hash_merge(&pipecache.key, pl_var_hash(vk->props.pipelineCacheUUID));
+ pl_hash_merge(&pipecache.key, pl_mem_hash(vert.data, vert.size));
+ pl_hash_merge(&pipecache.key, pl_mem_hash(frag.data, frag.size));
+ pl_hash_merge(&pipecache.key, pl_mem_hash(comp.data, comp.size));
+ pl_cache_get(cache, &pipecache);
+ }
+
+ if (cache || has_spec) {
+ // Don't create pipeline cache unless we either plan on caching the
+ // result of this shader to a pl_cache, or if we will possibly re-use
+ // it due to the presence of specialization constants
+ VkPipelineCacheCreateInfo pcinfo = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO,
+ .pInitialData = pipecache.data,
+ .initialDataSize = pipecache.size,
+ };
+
+ VK(vk->CreatePipelineCache(vk->dev, &pcinfo, PL_VK_ALLOC, &pass_vk->cache));
+ }
+
+ VkShaderModuleCreateInfo sinfo = {
+ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+ };
+
+ pl_clock_t start = pl_clock_now();
+ switch (params->type) {
+ case PL_PASS_RASTER: {
+ sinfo.pCode = (uint32_t *) vert.data;
+ sinfo.codeSize = vert.size;
+ VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->vert));
+ PL_VK_NAME(SHADER_MODULE, pass_vk->vert, "vertex");
+
+ sinfo.pCode = (uint32_t *) frag.data;
+ sinfo.codeSize = frag.size;
+ VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader));
+ PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "fragment");
+
+ pass_vk->attrs = pl_calloc_ptr(pass, params->num_vertex_attribs, pass_vk->attrs);
+ for (int i = 0; i < params->num_vertex_attribs; i++) {
+ struct pl_vertex_attrib *va = &params->vertex_attribs[i];
+ const struct vk_format **pfmt_vk = PL_PRIV(va->fmt);
+
+ pass_vk->attrs[i] = (VkVertexInputAttributeDescription) {
+ .binding = 0,
+ .location = va->location,
+ .offset = va->offset,
+ .format = PL_DEF((*pfmt_vk)->bfmt, (*pfmt_vk)->tfmt),
+ };
+ }
+
+ VkRenderPassCreateInfo rinfo = {
+ .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+ .attachmentCount = 1,
+ .pAttachments = &(VkAttachmentDescription) {
+ .format = (VkFormat) params->target_format->signature,
+ .samples = VK_SAMPLE_COUNT_1_BIT,
+ .loadOp = pass->params.load_target
+ ? VK_ATTACHMENT_LOAD_OP_LOAD
+ : VK_ATTACHMENT_LOAD_OP_DONT_CARE,
+ .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+ .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ },
+ .subpassCount = 1,
+ .pSubpasses = &(VkSubpassDescription) {
+ .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
+ .colorAttachmentCount = 1,
+ .pColorAttachments = &(VkAttachmentReference) {
+ .attachment = 0,
+ .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ },
+ },
+ };
+
+ VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &pass_vk->renderPass));
+ break;
+ }
+ case PL_PASS_COMPUTE: {
+ sinfo.pCode = (uint32_t *) comp.data;
+ sinfo.codeSize = comp.size;
+ VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader));
+ PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "compute");
+ break;
+ }
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+ }
+
+ pl_clock_t after_compilation = pl_clock_now();
+ pl_log_cpu_time(gpu->log, start, after_compilation, "compiling shader");
+
+ // Update cache entries on successful compilation
+ pl_cache_steal(cache, &vert);
+ pl_cache_steal(cache, &frag);
+ pl_cache_steal(cache, &comp);
+
+ // Create the graphics/compute pipeline
+ VkPipeline *pipe = has_spec ? &pass_vk->base : &pass_vk->pipe;
+ VK(vk_recreate_pipelines(vk, pass, has_spec, VK_NULL_HANDLE, pipe));
+ pl_log_cpu_time(gpu->log, after_compilation, pl_clock_now(), "creating pipeline");
+
+ // Update pipeline cache
+ if (cache) {
+ size_t size = 0;
+ VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, NULL));
+ pl_cache_obj_resize(tmp, &pipecache, size);
+ VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, pipecache.data));
+ pl_cache_steal(cache, &pipecache);
+ }
+
+ if (!has_spec) {
+ // We can free these if we no longer need them for specialization
+ pl_free_ptr(&pass_vk->attrs);
+ vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC);
+ vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC);
+ vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC);
+ pass_vk->vert = VK_NULL_HANDLE;
+ pass_vk->shader = VK_NULL_HANDLE;
+ pass_vk->cache = VK_NULL_HANDLE;
+ }
+
+ PL_DEBUG(vk, "Pass statistics: size %zu, SPIR-V: vert %zu frag %zu comp %zu",
+ pipecache.size, vert.size, frag.size, comp.size);
+
+ success = true;
+
+error:
+ if (!success) {
+ pass_destroy_cb(gpu, pass);
+ pass = NULL;
+ }
+
+#undef NUM_DS
+
+ pl_free(tmp);
+ return pass;
+}
+
+static const VkPipelineStageFlags2 shaderStages[] = {
+ [PL_PASS_RASTER] = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
+ [PL_PASS_COMPUTE] = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+};
+
+static void vk_update_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass,
+ struct pl_desc_binding db,
+ VkDescriptorSet ds, int idx)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+ struct pl_desc *desc = &pass->params.descriptors[idx];
+
+ VkWriteDescriptorSet *wds = &pass_vk->dswrite[idx];
+ *wds = (VkWriteDescriptorSet) {
+ .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+ .dstSet = ds,
+ .dstBinding = desc->binding,
+ .descriptorCount = 1,
+ .descriptorType = dsType[desc->type],
+ };
+
+ static const VkAccessFlags2 storageAccess[PL_DESC_ACCESS_COUNT] = {
+ [PL_DESC_ACCESS_READONLY] = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
+ [PL_DESC_ACCESS_WRITEONLY] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+ [PL_DESC_ACCESS_READWRITE] = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+ };
+
+ switch (desc->type) {
+ case PL_DESC_SAMPLED_TEX: {
+ pl_tex tex = db.object;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ vk_tex_barrier(gpu, cmd, tex, shaderStages[pass->params.type],
+ VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
+ VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx];
+ *iinfo = (VkDescriptorImageInfo) {
+ .sampler = p->samplers[db.sample_mode][db.address_mode],
+ .imageView = tex_vk->view,
+ .imageLayout = tex_vk->layout,
+ };
+
+ wds->pImageInfo = iinfo;
+ return;
+ }
+ case PL_DESC_STORAGE_IMG: {
+ pl_tex tex = db.object;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ vk_tex_barrier(gpu, cmd, tex, shaderStages[pass->params.type],
+ storageAccess[desc->access], VK_IMAGE_LAYOUT_GENERAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx];
+ *iinfo = (VkDescriptorImageInfo) {
+ .imageView = tex_vk->view,
+ .imageLayout = tex_vk->layout,
+ };
+
+ wds->pImageInfo = iinfo;
+ return;
+ }
+ case PL_DESC_BUF_UNIFORM:
+ case PL_DESC_BUF_STORAGE: {
+ pl_buf buf = db.object;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ VkAccessFlags2 access = VK_ACCESS_2_UNIFORM_READ_BIT;
+ if (desc->type == PL_DESC_BUF_STORAGE)
+ access = storageAccess[desc->access];
+
+ vk_buf_barrier(gpu, cmd, buf, shaderStages[pass->params.type],
+ access, 0, buf->params.size, false);
+
+ VkDescriptorBufferInfo *binfo = &pass_vk->dsbinfo[idx];
+ *binfo = (VkDescriptorBufferInfo) {
+ .buffer = buf_vk->mem.buf,
+ .offset = buf_vk->mem.offset,
+ .range = buf->params.size,
+ };
+
+ wds->pBufferInfo = binfo;
+ return;
+ }
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE: {
+ pl_buf buf = db.object;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ VkAccessFlags2 access = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT;
+ if (desc->type == PL_DESC_BUF_TEXEL_STORAGE)
+ access = storageAccess[desc->access];
+
+ vk_buf_barrier(gpu, cmd, buf, shaderStages[pass->params.type],
+ access, 0, buf->params.size, false);
+
+ wds->pTexelBufferView = &buf_vk->view;
+ return;
+ }
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+static void vk_release_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass,
+ struct pl_desc_binding db, int idx)
+{
+ const struct pl_desc *desc = &pass->params.descriptors[idx];
+
+ switch (desc->type) {
+ case PL_DESC_BUF_UNIFORM:
+ case PL_DESC_BUF_STORAGE:
+ case PL_DESC_BUF_TEXEL_UNIFORM:
+ case PL_DESC_BUF_TEXEL_STORAGE:
+ if (desc->access != PL_DESC_ACCESS_READONLY) {
+ pl_buf buf = db.object;
+ vk_buf_flush(gpu, cmd, buf, 0, buf->params.size);
+ }
+ return;
+ case PL_DESC_SAMPLED_TEX:
+ case PL_DESC_STORAGE_IMG:
+ return;
+ case PL_DESC_INVALID:
+ case PL_DESC_TYPE_COUNT:
+ break;
+ }
+
+ pl_unreachable();
+}
+
+static void set_ds(struct pl_pass_vk *pass_vk, void *dsbit)
+{
+ pass_vk->dmask |= (uintptr_t) dsbit;
+}
+
+static bool need_respec(pl_pass pass, const struct pl_pass_run_params *params)
+{
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+ if (!pass_vk->spec_size || !params->constant_data)
+ return false;
+
+ VkSpecializationInfo *specInfo = &pass_vk->specInfo;
+ size_t size = pass_vk->spec_size;
+ if (!specInfo->pData) {
+ // Shader was never specialized before
+ specInfo->pData = pl_memdup((void *) pass, params->constant_data, size);
+ specInfo->dataSize = size;
+ return true;
+ }
+
+ // Shader is being re-specialized with new values
+ if (memcmp(specInfo->pData, params->constant_data, size) != 0) {
+ memcpy((void *) specInfo->pData, params->constant_data, size);
+ return true;
+ }
+
+ return false;
+}
+
+void vk_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ pl_pass pass = params->pass;
+ struct pl_pass_vk *pass_vk = PL_PRIV(pass);
+
+ if (params->vertex_data || params->index_data)
+ return pl_pass_run_vbo(gpu, params);
+
+ // Check if we need to re-specialize this pipeline
+ if (need_respec(pass, params)) {
+ pl_clock_t start = pl_clock_now();
+ VK(vk_recreate_pipelines(vk, pass, false, pass_vk->base, &pass_vk->pipe));
+ pl_log_cpu_time(gpu->log, start, pl_clock_now(), "re-specializing shader");
+ }
+
+ if (!pass_vk->use_pushd) {
+ // Wait for a free descriptor set
+ while (!pass_vk->dmask) {
+ PL_TRACE(gpu, "No free descriptor sets! ...blocking (slow path)");
+ vk_poll_commands(vk, 10000000); // 10 ms
+ }
+ }
+
+ static const enum queue_type types[] = {
+ [PL_PASS_RASTER] = GRAPHICS,
+ [PL_PASS_COMPUTE] = COMPUTE,
+ };
+
+ struct vk_cmd *cmd = CMD_BEGIN_TIMED(types[pass->params.type], params->timer);
+ if (!cmd)
+ goto error;
+
+ // Find a descriptor set to use
+ VkDescriptorSet ds = VK_NULL_HANDLE;
+ if (!pass_vk->use_pushd) {
+ for (int i = 0; i < PL_ARRAY_SIZE(pass_vk->dss); i++) {
+ uint16_t dsbit = 1u << i;
+ if (pass_vk->dmask & dsbit) {
+ ds = pass_vk->dss[i];
+ pass_vk->dmask &= ~dsbit; // unset
+ vk_cmd_callback(cmd, (vk_cb) set_ds, pass_vk,
+ (void *)(uintptr_t) dsbit);
+ break;
+ }
+ }
+ }
+
+ // Update the dswrite structure with all of the new values
+ for (int i = 0; i < pass->params.num_descriptors; i++)
+ vk_update_descriptor(gpu, cmd, pass, params->desc_bindings[i], ds, i);
+
+ if (!pass_vk->use_pushd) {
+ vk->UpdateDescriptorSets(vk->dev, pass->params.num_descriptors,
+ pass_vk->dswrite, 0, NULL);
+ }
+
+ // Bind the pipeline, descriptor set, etc.
+ static const VkPipelineBindPoint bindPoint[] = {
+ [PL_PASS_RASTER] = VK_PIPELINE_BIND_POINT_GRAPHICS,
+ [PL_PASS_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE,
+ };
+
+ vk->CmdBindPipeline(cmd->buf, bindPoint[pass->params.type],
+ PL_DEF(pass_vk->pipe, pass_vk->base));
+
+ if (ds) {
+ vk->CmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type],
+ pass_vk->pipeLayout, 0, 1, &ds, 0, NULL);
+ }
+
+ if (pass_vk->use_pushd) {
+ vk->CmdPushDescriptorSetKHR(cmd->buf, bindPoint[pass->params.type],
+ pass_vk->pipeLayout, 0,
+ pass->params.num_descriptors,
+ pass_vk->dswrite);
+ }
+
+ if (pass->params.push_constants_size) {
+ vk->CmdPushConstants(cmd->buf, pass_vk->pipeLayout,
+ stageFlags[pass->params.type], 0,
+ pass->params.push_constants_size,
+ params->push_constants);
+ }
+
+ switch (pass->params.type) {
+ case PL_PASS_RASTER: {
+ pl_tex tex = params->target;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ pl_buf vert = params->vertex_buf;
+ struct pl_buf_vk *vert_vk = PL_PRIV(vert);
+ pl_buf index = params->index_buf;
+ struct pl_buf_vk *index_vk = index ? PL_PRIV(index) : NULL;
+ pl_assert(vert);
+
+ // In the edge case that vert = index buffer, we need to synchronize
+ // for both flags simultaneously
+ VkPipelineStageFlags2 vbo_stage = VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT;
+ VkAccessFlags2 vbo_flags = VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT;
+ if (index == vert) {
+ vbo_stage |= VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT;
+ vbo_flags |= VK_ACCESS_2_INDEX_READ_BIT;
+ }
+
+ vk_buf_barrier(gpu, cmd, vert, vbo_stage, vbo_flags, 0, vert->params.size, false);
+
+ VkDeviceSize offset = vert_vk->mem.offset + params->buf_offset;
+ vk->CmdBindVertexBuffers(cmd->buf, 0, 1, &vert_vk->mem.buf, &offset);
+
+ if (index) {
+ if (index != vert) {
+ vk_buf_barrier(gpu, cmd, index, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT,
+ VK_ACCESS_2_INDEX_READ_BIT, 0, index->params.size,
+ false);
+ }
+
+ static const VkIndexType index_fmts[PL_INDEX_FORMAT_COUNT] = {
+ [PL_INDEX_UINT16] = VK_INDEX_TYPE_UINT16,
+ [PL_INDEX_UINT32] = VK_INDEX_TYPE_UINT32,
+ };
+
+ vk->CmdBindIndexBuffer(cmd->buf, index_vk->mem.buf,
+ index_vk->mem.offset + params->index_offset,
+ index_fmts[params->index_fmt]);
+ }
+
+
+ VkAccessFlags2 fbo_access = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT;
+ if (pass->params.load_target)
+ fbo_access |= VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT;
+
+ vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
+ fbo_access, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ VkViewport viewport = {
+ .x = params->viewport.x0,
+ .y = params->viewport.y0,
+ .width = pl_rect_w(params->viewport),
+ .height = pl_rect_h(params->viewport),
+ };
+
+ VkRect2D scissor = {
+ .offset = {params->scissors.x0, params->scissors.y0},
+ .extent = {pl_rect_w(params->scissors), pl_rect_h(params->scissors)},
+ };
+
+ vk->CmdSetViewport(cmd->buf, 0, 1, &viewport);
+ vk->CmdSetScissor(cmd->buf, 0, 1, &scissor);
+
+ VkRenderPassBeginInfo binfo = {
+ .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+ .renderPass = pass_vk->renderPass,
+ .framebuffer = tex_vk->framebuffer,
+ .renderArea.extent = {tex->params.w, tex->params.h},
+ };
+
+ vk->CmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE);
+
+ if (index) {
+ vk->CmdDrawIndexed(cmd->buf, params->vertex_count, 1, 0, 0, 0);
+ } else {
+ vk->CmdDraw(cmd->buf, params->vertex_count, 1, 0, 0);
+ }
+
+ vk->CmdEndRenderPass(cmd->buf);
+ break;
+ }
+ case PL_PASS_COMPUTE:
+ vk->CmdDispatch(cmd->buf, params->compute_groups[0],
+ params->compute_groups[1],
+ params->compute_groups[2]);
+ break;
+ case PL_PASS_INVALID:
+ case PL_PASS_TYPE_COUNT:
+ pl_unreachable();
+ };
+
+ for (int i = 0; i < pass->params.num_descriptors; i++)
+ vk_release_descriptor(gpu, cmd, pass, params->desc_bindings[i], i);
+
+ // submit this command buffer for better intra-frame granularity
+ CMD_SUBMIT(&cmd);
+
+error:
+ return;
+}
diff --git a/src/vulkan/gpu_tex.c b/src/vulkan/gpu_tex.c
new file mode 100644
index 0000000..7ab83b7
--- /dev/null
+++ b/src/vulkan/gpu_tex.c
@@ -0,0 +1,1453 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+
+void vk_tex_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_tex tex,
+ VkPipelineStageFlags2 stage, VkAccessFlags2 access,
+ VkImageLayout layout, uint32_t qf)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ pl_rc_ref(&tex_vk->rc);
+ pl_assert(!tex_vk->held);
+ pl_assert(!tex_vk->num_planes);
+
+ // CONCURRENT images require transitioning to/from IGNORED, EXCLUSIVE
+ // images require transitioning to/from the concrete QF index
+ if (vk->pools.num == 1) {
+ if (tex_vk->qf == VK_QUEUE_FAMILY_IGNORED)
+ tex_vk->qf = cmd->pool->qf;
+ if (qf == VK_QUEUE_FAMILY_IGNORED)
+ qf = cmd->pool->qf;
+ }
+
+ struct vk_sync_scope last;
+ bool is_trans = layout != tex_vk->layout, is_xfer = qf != tex_vk->qf;
+ last = vk_sem_barrier(cmd, &tex_vk->sem, stage, access, is_trans || is_xfer);
+
+ VkImageMemoryBarrier2 barr = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
+ .srcStageMask = last.stage,
+ .srcAccessMask = last.access,
+ .dstStageMask = stage,
+ .dstAccessMask = access,
+ .oldLayout = tex_vk->layout,
+ .newLayout = layout,
+ .srcQueueFamilyIndex = tex_vk->qf,
+ .dstQueueFamilyIndex = qf,
+ .image = tex_vk->img,
+ .subresourceRange = {
+ .aspectMask = tex_vk->aspect,
+ .levelCount = 1,
+ .layerCount = 1,
+ },
+ };
+
+ if (tex_vk->may_invalidate) {
+ tex_vk->may_invalidate = false;
+ barr.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+ }
+
+ if (last.access || is_trans || is_xfer) {
+ vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .imageMemoryBarrierCount = 1,
+ .pImageMemoryBarriers = &barr,
+ });
+ }
+
+ tex_vk->qf = qf;
+ tex_vk->layout = layout;
+ vk_cmd_callback(cmd, (vk_cb) vk_tex_deref, gpu, tex);
+
+ for (int i = 0; i < tex_vk->ext_deps.num; i++)
+ vk_cmd_dep(cmd, stage, tex_vk->ext_deps.elem[i]);
+ tex_vk->ext_deps.num = 0;
+
+ if (tex_vk->ext_sync) {
+ vk_cmd_callback(cmd, (vk_cb) vk_sync_deref, gpu, tex_vk->ext_sync);
+ tex_vk->ext_sync = NULL;
+ }
+}
+
+static void vk_tex_destroy(pl_gpu gpu, struct pl_tex_t *tex)
+{
+ if (!tex)
+ return;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ vk_sync_deref(gpu, tex_vk->ext_sync);
+ vk->DestroyFramebuffer(vk->dev, tex_vk->framebuffer, PL_VK_ALLOC);
+ vk->DestroyImageView(vk->dev, tex_vk->view, PL_VK_ALLOC);
+ for (int i = 0; i < tex_vk->num_planes; i++)
+ vk_tex_deref(gpu, tex->planes[i]);
+ if (!tex_vk->external_img) {
+ vk->DestroyImage(vk->dev, tex_vk->img, PL_VK_ALLOC);
+ vk_malloc_free(vk->ma, &tex_vk->mem);
+ }
+
+ pl_free(tex);
+}
+
+void vk_tex_deref(pl_gpu gpu, pl_tex tex)
+{
+ if (!tex)
+ return;
+
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ if (pl_rc_deref(&tex_vk->rc))
+ vk_tex_destroy(gpu, (struct pl_tex_t *) tex);
+}
+
+
+// Initializes non-VkImage values like the image view, framebuffers, etc.
+static bool vk_init_image(pl_gpu gpu, pl_tex tex, pl_debug_tag debug_tag)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ const struct pl_tex_params *params = &tex->params;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ pl_assert(tex_vk->img);
+ PL_VK_NAME(IMAGE, tex_vk->img, debug_tag);
+ pl_rc_init(&tex_vk->rc);
+ if (tex_vk->num_planes)
+ return true;
+ tex_vk->layout = VK_IMAGE_LAYOUT_UNDEFINED;
+ tex_vk->transfer_queue = GRAPHICS;
+ tex_vk->qf = VK_QUEUE_FAMILY_IGNORED; // will be set on first use, if needed
+
+ // Always use the transfer pool if available, for efficiency
+ if ((params->host_writable || params->host_readable) && vk->pool_transfer)
+ tex_vk->transfer_queue = TRANSFER;
+
+ // For emulated formats: force usage of the compute queue, because we
+ // can't properly track cross-queue dependencies for buffers (yet?)
+ if (params->format->emulated)
+ tex_vk->transfer_queue = COMPUTE;
+
+ bool ret = false;
+ VkRenderPass dummyPass = VK_NULL_HANDLE;
+
+ if (params->sampleable || params->renderable || params->storable) {
+ static const VkImageViewType viewType[] = {
+ [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D,
+ [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D,
+ [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D,
+ };
+
+ const VkImageViewCreateInfo vinfo = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+ .image = tex_vk->img,
+ .viewType = viewType[tex_vk->type],
+ .format = tex_vk->img_fmt,
+ .subresourceRange = {
+ .aspectMask = tex_vk->aspect,
+ .levelCount = 1,
+ .layerCount = 1,
+ },
+ };
+
+ VK(vk->CreateImageView(vk->dev, &vinfo, PL_VK_ALLOC, &tex_vk->view));
+ PL_VK_NAME(IMAGE_VIEW, tex_vk->view, debug_tag);
+ }
+
+ if (params->renderable) {
+ // Framebuffers need to be created against a specific render pass
+ // layout, so we need to temporarily create a skeleton/dummy render
+ // pass for vulkan to figure out the compatibility
+ VkRenderPassCreateInfo rinfo = {
+ .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
+ .attachmentCount = 1,
+ .pAttachments = &(VkAttachmentDescription) {
+ .format = tex_vk->img_fmt,
+ .samples = VK_SAMPLE_COUNT_1_BIT,
+ .loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE,
+ .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
+ .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+ .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ },
+ .subpassCount = 1,
+ .pSubpasses = &(VkSubpassDescription) {
+ .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
+ .colorAttachmentCount = 1,
+ .pColorAttachments = &(VkAttachmentReference) {
+ .attachment = 0,
+ .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+ },
+ },
+ };
+
+ VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &dummyPass));
+
+ VkFramebufferCreateInfo finfo = {
+ .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
+ .renderPass = dummyPass,
+ .attachmentCount = 1,
+ .pAttachments = &tex_vk->view,
+ .width = tex->params.w,
+ .height = tex->params.h,
+ .layers = 1,
+ };
+
+ if (finfo.width > vk->props.limits.maxFramebufferWidth ||
+ finfo.height > vk->props.limits.maxFramebufferHeight)
+ {
+ PL_ERR(gpu, "Framebuffer of size %dx%d exceeds the maximum allowed "
+ "dimensions: %dx%d", finfo.width, finfo.height,
+ vk->props.limits.maxFramebufferWidth,
+ vk->props.limits.maxFramebufferHeight);
+ goto error;
+ }
+
+ VK(vk->CreateFramebuffer(vk->dev, &finfo, PL_VK_ALLOC,
+ &tex_vk->framebuffer));
+ PL_VK_NAME(FRAMEBUFFER, tex_vk->framebuffer, debug_tag);
+ }
+
+ ret = true;
+
+error:
+ vk->DestroyRenderPass(vk->dev, dummyPass, PL_VK_ALLOC);
+ return ret;
+}
+
+pl_tex vk_tex_create(pl_gpu gpu, const struct pl_tex_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ enum pl_handle_type handle_type = params->export_handle |
+ params->import_handle;
+ VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type = vk_mem_handle_type(handle_type);
+
+ struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk);
+ pl_fmt fmt = params->format;
+ tex->params = *params;
+ tex->params.initial_data = NULL;
+ tex->sampler_type = PL_SAMPLER_NORMAL;
+
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ struct pl_fmt_vk *fmtp = PL_PRIV(fmt);
+ tex_vk->img_fmt = fmtp->vk_fmt->tfmt;
+ tex_vk->num_planes = fmt->num_planes;
+ for (int i = 0; i < tex_vk->num_planes; i++)
+ tex_vk->aspect |= VK_IMAGE_ASPECT_PLANE_0_BIT << i;
+ tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT);
+
+ switch (pl_tex_params_dimension(*params)) {
+ case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break;
+ case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break;
+ case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break;
+ }
+
+ if (fmt->emulated) {
+ tex_vk->texel_fmt = pl_find_fmt(gpu, fmt->type, 1, 0,
+ fmt->host_bits[0],
+ PL_FMT_CAP_TEXEL_UNIFORM);
+ if (!tex_vk->texel_fmt) {
+ PL_ERR(gpu, "Failed picking texel format for emulated texture!");
+ goto error;
+ }
+
+ // Our format emulation requires storage image support. In order to
+ // make a bunch of checks happy, just mark it off as storable (and also
+ // enable VK_IMAGE_USAGE_STORAGE_BIT, which we do below)
+ tex->params.storable = true;
+ }
+
+ if (fmtp->blit_emulated) {
+ // Enable what's required for sampling
+ tex->params.sampleable = fmt->caps & PL_FMT_CAP_SAMPLEABLE;
+ tex->params.storable = true;
+ }
+
+ // Blit emulation on planar textures requires storage
+ if ((params->blit_src || params->blit_dst) && tex_vk->num_planes)
+ tex->params.storable = true;
+
+ VkImageUsageFlags usage = 0;
+ VkImageCreateFlags flags = 0;
+ if (tex->params.sampleable)
+ usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
+ if (tex->params.renderable)
+ usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+ if (tex->params.storable)
+ usage |= VK_IMAGE_USAGE_STORAGE_BIT;
+ if (tex->params.host_readable || tex->params.blit_src)
+ usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
+ if (tex->params.host_writable || tex->params.blit_dst || params->initial_data)
+ usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+
+ if (!usage) {
+ // Vulkan requires images have at least *some* image usage set, but our
+ // API is perfectly happy with a (useless) image. So just put
+ // VK_IMAGE_USAGE_TRANSFER_DST_BIT since this harmless.
+ usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+ }
+
+ if (tex_vk->num_planes) {
+ flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT |
+ VK_IMAGE_CREATE_EXTENDED_USAGE_BIT;
+ }
+
+ // FIXME: Since we can't keep track of queue family ownership properly,
+ // and we don't know in advance what types of queue families this image
+ // will belong to, we're forced to share all of our images between all
+ // command pools.
+ uint32_t qfs[3] = {0};
+ pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
+ for (int i = 0; i < vk->pools.num; i++)
+ qfs[i] = vk->pools.elem[i]->qf;
+
+ VkImageDrmFormatModifierExplicitCreateInfoEXT drm_explicit = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT,
+ .drmFormatModifier = params->shared_mem.drm_format_mod,
+ .drmFormatModifierPlaneCount = 1,
+ .pPlaneLayouts = &(VkSubresourceLayout) {
+ .rowPitch = PL_DEF(params->shared_mem.stride_w, params->w),
+ .depthPitch = params->d ? PL_DEF(params->shared_mem.stride_h, params->h) : 0,
+ .offset = params->shared_mem.offset,
+ },
+ };
+
+#ifdef VK_EXT_metal_objects
+ VkImportMetalTextureInfoEXT import_metal_tex = {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_TEXTURE_INFO_EXT,
+ .plane = VK_IMAGE_ASPECT_PLANE_0_BIT << params->shared_mem.plane,
+ };
+
+ VkImportMetalIOSurfaceInfoEXT import_iosurface = {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_IO_SURFACE_INFO_EXT,
+ };
+#endif
+
+ VkImageDrmFormatModifierListCreateInfoEXT drm_list = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT,
+ .drmFormatModifierCount = fmt->num_modifiers,
+ .pDrmFormatModifiers = fmt->modifiers,
+ };
+
+ VkExternalMemoryImageCreateInfoKHR ext_info = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR,
+ .handleTypes = vk_handle_type,
+ };
+
+ VkImageCreateInfo iinfo = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+ .pNext = vk_handle_type ? &ext_info : NULL,
+ .imageType = tex_vk->type,
+ .format = tex_vk->img_fmt,
+ .extent = (VkExtent3D) {
+ .width = params->w,
+ .height = PL_MAX(1, params->h),
+ .depth = PL_MAX(1, params->d)
+ },
+ .mipLevels = 1,
+ .arrayLayers = 1,
+ .samples = VK_SAMPLE_COUNT_1_BIT,
+ .tiling = VK_IMAGE_TILING_OPTIMAL,
+ .usage = usage,
+ .flags = flags,
+ .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+ .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
+ : VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = vk->pools.num,
+ .pQueueFamilyIndices = qfs,
+ };
+
+ struct vk_malloc_params mparams = {
+ .optimal = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ .export_handle = params->export_handle,
+ .import_handle = params->import_handle,
+ .shared_mem = params->shared_mem,
+ .debug_tag = params->debug_tag,
+ };
+
+ if (params->import_handle == PL_HANDLE_DMA_BUF) {
+ vk_link_struct(&iinfo, &drm_explicit);
+ iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+ mparams.shared_mem.offset = 0x0; // handled via plane offsets
+ }
+
+#ifdef VK_EXT_metal_objects
+ if (params->import_handle == PL_HANDLE_MTL_TEX) {
+ vk_link_struct(&iinfo, &import_metal_tex);
+ import_metal_tex.mtlTexture = params->shared_mem.handle.handle;
+ }
+
+ if (params->import_handle == PL_HANDLE_IOSURFACE) {
+ vk_link_struct(&iinfo, &import_iosurface);
+ import_iosurface.ioSurface = params->shared_mem.handle.handle;
+ }
+#endif
+
+ if (params->export_handle == PL_HANDLE_DMA_BUF) {
+ pl_assert(drm_list.drmFormatModifierCount > 0);
+ vk_link_struct(&iinfo, &drm_list);
+ iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+ }
+
+ // Double-check physical image format limits and fail if invalid
+ VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT,
+ .sharingMode = iinfo.sharingMode,
+ .queueFamilyIndexCount = iinfo.queueFamilyIndexCount,
+ .pQueueFamilyIndices = iinfo.pQueueFamilyIndices,
+ };
+
+ VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR,
+ .handleType = ext_info.handleTypes,
+ };
+
+ if (handle_type == PL_HANDLE_DMA_BUF) {
+ if (params->import_handle) {
+ // On import, we know exactly which format modifier to test
+ drm_pinfo.drmFormatModifier = drm_explicit.drmFormatModifier;
+ } else {
+ // On export, the choice of format modifier is ambiguous, because
+ // we offer the implementation a whole list to choose from. In
+ // principle, we must check *all* supported drm format modifiers,
+ // but in practice it should hopefully suffice to just check one
+ drm_pinfo.drmFormatModifier = drm_list.pDrmFormatModifiers[0];
+ }
+ vk_link_struct(&ext_pinfo, &drm_pinfo);
+ }
+
+ VkPhysicalDeviceImageFormatInfo2KHR pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR,
+ .pNext = vk_handle_type ? &ext_pinfo : NULL,
+ .format = iinfo.format,
+ .type = iinfo.imageType,
+ .tiling = iinfo.tiling,
+ .usage = iinfo.usage,
+ .flags = iinfo.flags,
+ };
+
+ VkExternalImageFormatPropertiesKHR ext_props = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR,
+ };
+
+ VkImageFormatProperties2KHR props = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR,
+ .pNext = vk_handle_type ? &ext_props : NULL,
+ };
+
+ VkResult res;
+ res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props);
+ if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) {
+ PL_DEBUG(gpu, "Texture creation failed: not supported");
+ goto error;
+ } else {
+ PL_VK_ASSERT(res, "Querying image format properties");
+ }
+
+ VkExtent3D max = props.imageFormatProperties.maxExtent;
+ if (params->w > max.width || params->h > max.height || params->d > max.depth)
+ {
+ PL_ERR(gpu, "Requested image size %dx%dx%d exceeds the maximum allowed "
+ "dimensions %dx%dx%d for vulkan image format %x",
+ params->w, params->h, params->d, max.width, max.height, max.depth,
+ (unsigned) iinfo.format);
+ goto error;
+ }
+
+ // Ensure the handle type is supported
+ if (vk_handle_type) {
+ bool ok = vk_external_mem_check(vk, &ext_props.externalMemoryProperties,
+ handle_type, params->import_handle);
+ if (!ok) {
+ PL_ERR(gpu, "Requested handle type is not compatible with the "
+ "specified combination of image parameters. Possibly the "
+ "handle type is unsupported altogether?");
+ goto error;
+ }
+ }
+
+ VK(vk->CreateImage(vk->dev, &iinfo, PL_VK_ALLOC, &tex_vk->img));
+ tex_vk->usage_flags = iinfo.usage;
+
+ VkMemoryDedicatedRequirements ded_reqs = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR,
+ };
+
+ VkMemoryRequirements2 reqs = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR,
+ .pNext = &ded_reqs,
+ };
+
+ VkImageMemoryRequirementsInfo2 req_info = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR,
+ .image = tex_vk->img,
+ };
+
+ vk->GetImageMemoryRequirements2(vk->dev, &req_info, &reqs);
+ mparams.reqs = reqs.memoryRequirements;
+ if (ded_reqs.prefersDedicatedAllocation) {
+ mparams.ded_image = tex_vk->img;
+ if (vk_mem_handle_type(params->import_handle))
+ mparams.shared_mem.size = reqs.memoryRequirements.size;
+ }
+
+ const char *debug_tag = params->debug_tag ? params->debug_tag :
+ params->import_handle ? "imported" : "created";
+
+ if (!params->import_handle || vk_mem_handle_type(params->import_handle)) {
+ struct vk_memslice *mem = &tex_vk->mem;
+ if (!vk_malloc_slice(vk->ma, mem, &mparams))
+ goto error;
+
+ VK(vk->BindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset));
+ }
+
+ static const char * const plane_names[4] = {
+ "plane 0", "plane 1", "plane 2", "plane 3",
+ };
+
+ if (tex_vk->num_planes) {
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ struct pl_tex_t *plane;
+
+ pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D);
+ plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params(
+ .image = tex_vk->img,
+ .aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << i,
+ .width = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x),
+ .height = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y),
+ .format = fmtp->vk_fmt->pfmt[i].fmt,
+ .usage = usage,
+ .user_data = params->user_data,
+ .debug_tag = PL_DEF(params->debug_tag, plane_names[i]),
+ ));
+ if (!plane)
+ goto error;
+ plane->parent = tex;
+ tex->planes[i] = plane;
+ tex_vk->planes[i] = PL_PRIV(plane);
+ tex_vk->planes[i]->held = false;
+ tex_vk->planes[i]->layout = tex_vk->layout;
+ }
+
+ // Explicitly mask out all usage flags from planar parent images
+ pl_assert(!fmt->caps);
+ tex->params.sampleable = false;
+ tex->params.renderable = false;
+ tex->params.storable = false;
+ tex->params.blit_src = false;
+ tex->params.blit_dst = false;
+ tex->params.host_writable = false;
+ tex->params.host_readable = false;
+ }
+
+ if (!vk_init_image(gpu, tex, debug_tag))
+ goto error;
+
+ if (params->export_handle)
+ tex->shared_mem = tex_vk->mem.shared_mem;
+
+ if (params->export_handle == PL_HANDLE_DMA_BUF) {
+ if (vk->GetImageDrmFormatModifierPropertiesEXT) {
+
+ // Query the DRM format modifier and plane layout from the driver
+ VkImageDrmFormatModifierPropertiesEXT mod_props = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT,
+ };
+
+ VK(vk->GetImageDrmFormatModifierPropertiesEXT(vk->dev, tex_vk->img, &mod_props));
+ tex->shared_mem.drm_format_mod = mod_props.drmFormatModifier;
+
+ VkSubresourceLayout layout = {0};
+ VkImageSubresource plane = {
+ .aspectMask = VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT,
+ };
+
+ vk->GetImageSubresourceLayout(vk->dev, tex_vk->img, &plane, &layout);
+ if (layout.offset != 0) {
+ PL_ERR(gpu, "Exported DRM plane 0 has nonzero offset %zu, "
+ "this should never happen! Erroring for safety...",
+ (size_t) layout.offset);
+ goto error;
+ }
+ tex->shared_mem.stride_w = layout.rowPitch;
+ tex->shared_mem.stride_h = layout.depthPitch;
+
+ } else {
+
+ // Fallback for no modifiers, just do something stupid.
+ tex->shared_mem.drm_format_mod = DRM_FORMAT_MOD_INVALID;
+ tex->shared_mem.stride_w = params->w;
+ tex->shared_mem.stride_h = params->h;
+
+ }
+ }
+
+ if (params->initial_data) {
+ struct pl_tex_transfer_params ul_params = {
+ .tex = tex,
+ .ptr = (void *) params->initial_data,
+ .rc = { 0, 0, 0, params->w, params->h, params->d },
+ };
+
+ // Since we re-use GPU helpers which require writable images, just fake it
+ bool writable = tex->params.host_writable;
+ tex->params.host_writable = true;
+ if (!pl_tex_upload(gpu, &ul_params))
+ goto error;
+ tex->params.host_writable = writable;
+ }
+
+ return tex;
+
+error:
+ vk_tex_destroy(gpu, tex);
+ return NULL;
+}
+
+void vk_tex_invalidate(pl_gpu gpu, pl_tex tex)
+{
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ tex_vk->may_invalidate = true;
+ for (int i = 0; i < tex_vk->num_planes; i++)
+ tex_vk->planes[i]->may_invalidate = true;
+}
+
+static bool tex_clear_fallback(pl_gpu gpu, pl_tex tex,
+ const union pl_clear_color color)
+{
+ pl_tex pixel = pl_tex_create(gpu, pl_tex_params(
+ .w = 1,
+ .h = 1,
+ .format = tex->params.format,
+ .storable = true,
+ .blit_src = true,
+ .blit_dst = true,
+ ));
+ if (!pixel)
+ return false;
+
+ pl_tex_clear_ex(gpu, pixel, color);
+
+ pl_assert(tex->params.storable);
+ pl_tex_blit(gpu, pl_tex_blit_params(
+ .src = pixel,
+ .dst = tex,
+ .sample_mode = PL_TEX_SAMPLE_NEAREST,
+ ));
+
+ pl_tex_destroy(gpu, &pixel);
+ return true;
+}
+
+void vk_tex_clear_ex(pl_gpu gpu, pl_tex tex, const union pl_clear_color color)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) {
+ if (!tex_clear_fallback(gpu, tex, color)) {
+ PL_ERR(gpu, "Failed clearing imported planar image: color aspect "
+ "clears disallowed by spec and no shader fallback "
+ "available");
+ }
+ return;
+ }
+
+ struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS);
+ if (!cmd)
+ return;
+
+ vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_CLEAR_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ pl_static_assert(sizeof(VkClearColorValue) == sizeof(union pl_clear_color));
+ const VkClearColorValue *clearColor = (const VkClearColorValue *) &color;
+
+ pl_assert(tex_vk->aspect == VK_IMAGE_ASPECT_COLOR_BIT);
+ static const VkImageSubresourceRange range = {
+ .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+ .levelCount = 1,
+ .layerCount = 1,
+ };
+
+ vk->CmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->layout,
+ clearColor, 1, &range);
+
+ CMD_FINISH(&cmd);
+}
+
+void vk_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_tex_vk *src_vk = PL_PRIV(params->src);
+ struct pl_tex_vk *dst_vk = PL_PRIV(params->dst);
+ struct pl_fmt_vk *src_fmtp = PL_PRIV(params->src->params.format);
+ struct pl_fmt_vk *dst_fmtp = PL_PRIV(params->dst->params.format);
+ bool blit_emulated = src_fmtp->blit_emulated || dst_fmtp->blit_emulated;
+ bool planar_fallback = src_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT ||
+ dst_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT;
+
+ pl_rect3d src_rc = params->src_rc, dst_rc = params->dst_rc;
+ bool requires_scaling = !pl_rect3d_eq(src_rc, dst_rc);
+ if ((requires_scaling && blit_emulated) || planar_fallback) {
+ if (!pl_tex_blit_compute(gpu, params))
+ PL_ERR(gpu, "Failed emulating texture blit, incompatible textures?");
+ return;
+ }
+
+ struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS);
+ if (!cmd)
+ return;
+
+ // When the blit operation doesn't require scaling, we can use the more
+ // efficient vkCmdCopyImage instead of vkCmdBlitImage
+ if (!requires_scaling) {
+ vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ pl_rect3d_normalize(&src_rc);
+
+ VkImageCopy region = {
+ .srcSubresource = {
+ .aspectMask = src_vk->aspect,
+ .layerCount = 1,
+ },
+ .dstSubresource = {
+ .aspectMask = dst_vk->aspect,
+ .layerCount = 1,
+ },
+ .srcOffset = {src_rc.x0, src_rc.y0, src_rc.z0},
+ .dstOffset = {src_rc.x0, src_rc.y0, src_rc.z0},
+ .extent = {
+ pl_rect_w(src_rc),
+ pl_rect_h(src_rc),
+ pl_rect_d(src_rc),
+ },
+ };
+
+ vk->CmdCopyImage(cmd->buf, src_vk->img, src_vk->layout,
+ dst_vk->img, dst_vk->layout, 1, &region);
+ } else {
+ vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_BLIT_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_BLIT_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+
+ VkImageBlit region = {
+ .srcSubresource = {
+ .aspectMask = src_vk->aspect,
+ .layerCount = 1,
+ },
+ .dstSubresource = {
+ .aspectMask = dst_vk->aspect,
+ .layerCount = 1,
+ },
+ .srcOffsets = {{src_rc.x0, src_rc.y0, src_rc.z0},
+ {src_rc.x1, src_rc.y1, src_rc.z1}},
+ .dstOffsets = {{dst_rc.x0, dst_rc.y0, dst_rc.z0},
+ {dst_rc.x1, dst_rc.y1, dst_rc.z1}},
+ };
+
+ static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = {
+ [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST,
+ [PL_TEX_SAMPLE_LINEAR] = VK_FILTER_LINEAR,
+ };
+
+ vk->CmdBlitImage(cmd->buf, src_vk->img, src_vk->layout,
+ dst_vk->img, dst_vk->layout, 1, &region,
+ filters[params->sample_mode]);
+ }
+
+ CMD_FINISH(&cmd);
+}
+
+// Determine the best queue type to perform a buffer<->image copy on
+static enum queue_type vk_img_copy_queue(pl_gpu gpu, pl_tex tex,
+ const struct VkBufferImageCopy *region)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ const struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ enum queue_type queue = tex_vk->transfer_queue;
+ if (queue != TRANSFER)
+ return queue;
+
+ VkExtent3D alignment = vk->pool_transfer->props.minImageTransferGranularity;
+
+ enum queue_type fallback = GRAPHICS;
+ if (gpu->limits.compute_queues > gpu->limits.fragment_queues)
+ fallback = COMPUTE; // prefer async compute queue
+
+ int tex_w = PL_DEF(tex->params.w, 1),
+ tex_h = PL_DEF(tex->params.h, 1),
+ tex_d = PL_DEF(tex->params.d, 1);
+
+ bool full_w = region->imageOffset.x + region->imageExtent.width == tex_w,
+ full_h = region->imageOffset.y + region->imageExtent.height == tex_h,
+ full_d = region->imageOffset.z + region->imageExtent.depth == tex_d;
+
+ if (alignment.width) {
+
+ bool unaligned = false;
+ unaligned |= region->imageOffset.x % alignment.width;
+ unaligned |= region->imageOffset.y % alignment.height;
+ unaligned |= region->imageOffset.z % alignment.depth;
+ unaligned |= (region->imageExtent.width % alignment.width) && !full_w;
+ unaligned |= (region->imageExtent.height % alignment.height) && !full_h;
+ unaligned |= (region->imageExtent.depth % alignment.depth) && !full_d;
+
+ return unaligned ? fallback : queue;
+
+ } else {
+
+ // an alignment of {0} means the copy must span the entire image
+ bool unaligned = false;
+ unaligned |= region->imageOffset.x || !full_w;
+ unaligned |= region->imageOffset.y || !full_h;
+ unaligned |= region->imageOffset.z || !full_d;
+
+ return unaligned ? fallback : queue;
+
+ }
+}
+
+static void tex_xfer_cb(void *ctx, void *arg)
+{
+ void (*fun)(void *priv) = ctx;
+ fun(arg);
+}
+
+bool vk_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ pl_tex tex = params->tex;
+ pl_fmt fmt = tex->params.format;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ struct pl_tex_transfer_params *slices = NULL;
+ int num_slices = 0;
+
+ if (!params->buf)
+ return pl_tex_upload_pbo(gpu, params);
+
+ pl_buf buf = params->buf;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_rect3d rc = params->rc;
+ const size_t size = pl_tex_transfer_size(params);
+ const size_t buf_offset = buf_vk->mem.offset + params->buf_offset;
+ bool unaligned = buf_offset % fmt->texel_size;
+ if (unaligned)
+ PL_TRACE(gpu, "vk_tex_upload: unaligned transfer (slow path)");
+
+ if (fmt->emulated || unaligned) {
+
+ // Create all slice buffers first, to early-fail if OOM, and to avoid
+ // blocking unnecessarily on waiting for these buffers to get read from
+ num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices);
+ for (int i = 0; i < num_slices; i++) {
+ slices[i].buf = pl_buf_create(gpu, pl_buf_params(
+ .memory_type = PL_BUF_MEM_DEVICE,
+ .format = tex_vk->texel_fmt,
+ .size = pl_tex_transfer_size(&slices[i]),
+ .storable = fmt->emulated,
+ ));
+
+ if (!slices[i].buf) {
+ PL_ERR(gpu, "Failed creating buffer for tex upload fallback!");
+ num_slices = i; // only clean up buffers up to here
+ goto error;
+ }
+ }
+
+ // All temporary buffers successfully created, begin copying source data
+ struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue,
+ params->timer);
+ if (!cmd)
+ goto error;
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size,
+ false);
+
+ for (int i = 0; i < num_slices; i++) {
+ pl_buf slice = slices[i].buf;
+ struct pl_buf_vk *slice_vk = PL_PRIV(slice);
+ vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, 0, slice->params.size,
+ false);
+
+ vk->CmdCopyBuffer(cmd->buf, buf_vk->mem.buf, slice_vk->mem.buf, 1, &(VkBufferCopy) {
+ .srcOffset = buf_vk->mem.offset + slices[i].buf_offset,
+ .dstOffset = slice_vk->mem.offset,
+ .size = slice->params.size,
+ });
+ }
+
+ if (params->callback)
+ vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+ bool ok = CMD_FINISH(&cmd);
+
+ // Finally, dispatch the (texel) upload asynchronously. We can fire
+ // the callback already at the completion of previous command because
+ // these temporary buffers already hold persistent copies of the data
+ for (int i = 0; i < num_slices; i++) {
+ if (ok) {
+ slices[i].buf_offset = 0;
+ ok = fmt->emulated ? pl_tex_upload_texel(gpu, &slices[i])
+ : pl_tex_upload(gpu, &slices[i]);
+ }
+ pl_buf_destroy(gpu, &slices[i].buf);
+ }
+
+ pl_free(slices);
+ return ok;
+
+ } else {
+
+ pl_assert(fmt->texel_align == fmt->texel_size);
+ const VkBufferImageCopy region = {
+ .bufferOffset = buf_offset,
+ .bufferRowLength = params->row_pitch / fmt->texel_size,
+ .bufferImageHeight = params->depth_pitch / params->row_pitch,
+ .imageOffset = { rc.x0, rc.y0, rc.z0 },
+ .imageExtent = { rc.x1, rc.y1, rc.z1 },
+ .imageSubresource = {
+ .aspectMask = tex_vk->aspect,
+ .layerCount = 1,
+ },
+ };
+
+ enum queue_type queue = vk_img_copy_queue(gpu, tex, &region);
+ struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer);
+ if (!cmd)
+ goto error;
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size,
+ false);
+ vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+ vk->CmdCopyBufferToImage(cmd->buf, buf_vk->mem.buf, tex_vk->img,
+ tex_vk->layout, 1, &region);
+
+ if (params->callback)
+ vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+ return CMD_FINISH(&cmd);
+ }
+
+ pl_unreachable();
+
+error:
+ for (int i = 0; i < num_slices; i++)
+ pl_buf_destroy(gpu, &slices[i].buf);
+ pl_free(slices);
+ return false;
+}
+
+bool vk_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ pl_tex tex = params->tex;
+ pl_fmt fmt = tex->params.format;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ struct pl_tex_transfer_params *slices = NULL;
+ int num_slices = 0;
+
+ if (!params->buf)
+ return pl_tex_download_pbo(gpu, params);
+
+ pl_buf buf = params->buf;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_rect3d rc = params->rc;
+ const size_t size = pl_tex_transfer_size(params);
+ const size_t buf_offset = buf_vk->mem.offset + params->buf_offset;
+ bool unaligned = buf_offset % fmt->texel_size;
+ if (unaligned)
+ PL_TRACE(gpu, "vk_tex_download: unaligned transfer (slow path)");
+
+ if (fmt->emulated || unaligned) {
+
+ num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices);
+ for (int i = 0; i < num_slices; i++) {
+ slices[i].buf = pl_buf_create(gpu, pl_buf_params(
+ .memory_type = PL_BUF_MEM_DEVICE,
+ .format = tex_vk->texel_fmt,
+ .size = pl_tex_transfer_size(&slices[i]),
+ .storable = fmt->emulated,
+ ));
+
+ if (!slices[i].buf) {
+ PL_ERR(gpu, "Failed creating buffer for tex download fallback!");
+ num_slices = i;
+ goto error;
+ }
+ }
+
+ for (int i = 0; i < num_slices; i++) {
+ // Restore buffer offset after downloading into temporary buffer,
+ // because we still need to copy the data from the temporary buffer
+ // into this offset in the original buffer
+ const size_t tmp_offset = slices[i].buf_offset;
+ slices[i].buf_offset = 0;
+ bool ok = fmt->emulated ? pl_tex_download_texel(gpu, &slices[i])
+ : pl_tex_download(gpu, &slices[i]);
+ slices[i].buf_offset = tmp_offset;
+ if (!ok)
+ goto error;
+ }
+
+ // Finally, download into the user buffer
+ struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue, params->timer);
+ if (!cmd)
+ goto error;
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size,
+ false);
+
+ for (int i = 0; i < num_slices; i++) {
+ pl_buf slice = slices[i].buf;
+ struct pl_buf_vk *slice_vk = PL_PRIV(slice);
+ vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT, 0, slice->params.size,
+ false);
+
+ vk->CmdCopyBuffer(cmd->buf, slice_vk->mem.buf, buf_vk->mem.buf, 1, &(VkBufferCopy) {
+ .srcOffset = slice_vk->mem.offset,
+ .dstOffset = buf_vk->mem.offset + slices[i].buf_offset,
+ .size = slice->params.size,
+ });
+
+ pl_buf_destroy(gpu, &slices[i].buf);
+ }
+
+ vk_buf_flush(gpu, cmd, buf, params->buf_offset, size);
+
+ if (params->callback)
+ vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+ pl_free(slices);
+ return CMD_FINISH(&cmd);
+
+ } else {
+
+ pl_assert(params->row_pitch % fmt->texel_size == 0);
+ pl_assert(params->depth_pitch % params->row_pitch == 0);
+ const VkBufferImageCopy region = {
+ .bufferOffset = buf_offset,
+ .bufferRowLength = params->row_pitch / fmt->texel_size,
+ .bufferImageHeight = params->depth_pitch / params->row_pitch,
+ .imageOffset = { rc.x0, rc.y0, rc.z0 },
+ .imageExtent = { rc.x1, rc.y1, rc.z1 },
+ .imageSubresource = {
+ .aspectMask = tex_vk->aspect,
+ .layerCount = 1,
+ },
+ };
+
+ enum queue_type queue = vk_img_copy_queue(gpu, tex, &region);
+
+ struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer);
+ if (!cmd)
+ goto error;
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size,
+ false);
+ vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ VK_QUEUE_FAMILY_IGNORED);
+ vk->CmdCopyImageToBuffer(cmd->buf, tex_vk->img, tex_vk->layout,
+ buf_vk->mem.buf, 1, &region);
+ vk_buf_flush(gpu, cmd, buf, params->buf_offset, size);
+
+ if (params->callback)
+ vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv);
+
+ return CMD_FINISH(&cmd);
+ }
+
+ pl_unreachable();
+
+error:
+ for (int i = 0; i < num_slices; i++)
+ pl_buf_destroy(gpu, &slices[i].buf);
+ pl_free(slices);
+ return false;
+}
+
+bool vk_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ // Opportunistically check if we can re-use this texture without flush
+ vk_poll_commands(vk, 0);
+ if (pl_rc_count(&tex_vk->rc) == 1)
+ goto skip_blocking;
+
+ // Otherwise, we're force to submit any queued command so that the user is
+ // guaranteed to see progress eventually, even if they call this in a loop
+ CMD_SUBMIT(NULL);
+ vk_poll_commands(vk, timeout);
+ if (pl_rc_count(&tex_vk->rc) > 1)
+ return true;
+
+ // fall through
+skip_blocking:
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ if (vk_tex_poll(gpu, tex->planes[i], timeout))
+ return true;
+ }
+
+ return false;
+}
+
+bool vk_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync)
+{
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ struct pl_sync_vk *sync_vk = PL_PRIV(sync);
+
+ if (tex_vk->num_planes) {
+ PL_ERR(gpu, "`pl_tex_export` cannot be called on planar textures."
+ "Please see `pl_vulkan_hold_ex` for a replacement.");
+ return false;
+ }
+
+ struct vk_cmd *cmd = CMD_BEGIN(ANY);
+ if (!cmd)
+ goto error;
+
+ vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_NONE,
+ 0, VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_EXTERNAL);
+
+ // Make the next barrier appear as though coming from a different queue
+ tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL;
+
+ vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, (pl_vulkan_sem){ sync_vk->wait });
+ if (!CMD_SUBMIT(&cmd))
+ goto error;
+
+ // Remember the other dependency and hold on to the sync object
+ PL_ARRAY_APPEND(tex, tex_vk->ext_deps, (pl_vulkan_sem){ sync_vk->signal });
+ pl_rc_ref(&sync_vk->rc);
+ tex_vk->ext_sync = sync;
+ tex_vk->qf = VK_QUEUE_FAMILY_EXTERNAL;
+ return true;
+
+error:
+ PL_ERR(gpu, "Failed exporting shared texture!");
+ return false;
+}
+
+pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params)
+{
+ pl_fmt fmt = NULL;
+ for (int i = 0; i < gpu->num_formats; i++) {
+ const struct vk_format **vkfmt = PL_PRIV(gpu->formats[i]);
+ if ((*vkfmt)->tfmt == params->format) {
+ fmt = gpu->formats[i];
+ break;
+ }
+ }
+
+ if (!fmt) {
+ PL_ERR(gpu, "Could not find pl_fmt suitable for wrapped image "
+ "with format %s", vk_fmt_name(params->format));
+ return NULL;
+ }
+
+ VkImageUsageFlags usage = params->usage;
+ if (fmt->num_planes)
+ usage = 0; // mask capabilities from the base texture
+
+ struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk);
+ tex->params = (struct pl_tex_params) {
+ .format = fmt,
+ .w = params->width,
+ .h = params->height,
+ .d = params->depth,
+ .sampleable = !!(usage & VK_IMAGE_USAGE_SAMPLED_BIT),
+ .renderable = !!(usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT),
+ .storable = !!(usage & VK_IMAGE_USAGE_STORAGE_BIT),
+ .blit_src = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
+ .blit_dst = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT),
+ .host_writable = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT),
+ .host_readable = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT),
+ .user_data = params->user_data,
+ .debug_tag = params->debug_tag,
+ };
+
+ // Mask out capabilities not permitted by the `pl_fmt`
+#define MASK(field, cap) \
+ do { \
+ if (tex->params.field && !(fmt->caps & cap)) { \
+ PL_WARN(gpu, "Masking `" #field "` from wrapped texture because " \
+ "the corresponding format '%s' does not support " #cap, \
+ fmt->name); \
+ tex->params.field = false; \
+ } \
+ } while (0)
+
+ MASK(sampleable, PL_FMT_CAP_SAMPLEABLE);
+ MASK(renderable, PL_FMT_CAP_RENDERABLE);
+ MASK(storable, PL_FMT_CAP_STORABLE);
+ MASK(blit_src, PL_FMT_CAP_BLITTABLE);
+ MASK(blit_dst, PL_FMT_CAP_BLITTABLE);
+ MASK(host_readable, PL_FMT_CAP_HOST_READABLE);
+#undef MASK
+
+ // For simplicity, explicitly mask out blit emulation for wrapped textures
+ struct pl_fmt_vk *fmtp = PL_PRIV(fmt);
+ if (fmtp->blit_emulated) {
+ tex->params.blit_src = false;
+ tex->params.blit_dst = false;
+ }
+
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+ switch (pl_tex_params_dimension(tex->params)) {
+ case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break;
+ case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break;
+ case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break;
+ }
+ tex_vk->external_img = true;
+ tex_vk->held = !fmt->num_planes;
+ tex_vk->img = params->image;
+ tex_vk->img_fmt = params->format;
+ tex_vk->num_planes = fmt->num_planes;
+ tex_vk->usage_flags = usage;
+ tex_vk->aspect = params->aspect;
+
+ if (!tex_vk->aspect) {
+ for (int i = 0; i < tex_vk->num_planes; i++)
+ tex_vk->aspect |= VK_IMAGE_ASPECT_PLANE_0_BIT << i;
+ tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT);
+ }
+
+ // Blitting to planar images requires fallback via compute shaders
+ if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) {
+ tex->params.blit_src &= tex->params.storable;
+ tex->params.blit_dst &= tex->params.storable;
+ }
+
+ static const char * const wrapped_plane_names[4] = {
+ "wrapped plane 0", "wrapped plane 1", "wrapped plane 2", "wrapped plane 3",
+ };
+
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ struct pl_tex_t *plane;
+ VkImageAspectFlags aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << i;
+ if (!(aspect & tex_vk->aspect)) {
+ PL_INFO(gpu, "Not wrapping plane %d due to aspect bit 0x%x not "
+ "being contained in supplied params->aspect 0x%x!",
+ i, (unsigned) aspect, (unsigned) tex_vk->aspect);
+ continue;
+ }
+
+ pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D);
+ plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params(
+ .image = tex_vk->img,
+ .aspect = aspect,
+ .width = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x),
+ .height = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y),
+ .format = fmtp->vk_fmt->pfmt[i].fmt,
+ .usage = params->usage,
+ .user_data = params->user_data,
+ .debug_tag = PL_DEF(params->debug_tag, wrapped_plane_names[i]),
+ ));
+ if (!plane)
+ goto error;
+ plane->parent = tex;
+ tex->planes[i] = plane;
+ tex_vk->planes[i] = PL_PRIV(plane);
+ }
+
+ if (!vk_init_image(gpu, tex, PL_DEF(params->debug_tag, "wrapped")))
+ goto error;
+
+ return tex;
+
+error:
+ vk_tex_destroy(gpu, tex);
+ return NULL;
+}
+
+VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex, VkFormat *out_format,
+ VkImageUsageFlags *out_flags)
+{
+ struct pl_tex_vk *tex_vk = PL_PRIV(tex);
+
+ if (out_format)
+ *out_format = tex_vk->img_fmt;
+ if (out_flags)
+ *out_flags = tex_vk->usage_flags;
+
+ return tex_vk->img;
+}
+
+bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params)
+{
+ struct pl_tex_vk *tex_vk = PL_PRIV(params->tex);
+ pl_assert(params->semaphore.sem);
+
+ bool held = tex_vk->held;
+ for (int i = 0; i < tex_vk->num_planes; i++)
+ held |= tex_vk->planes[i]->held;
+
+ if (held) {
+ PL_ERR(gpu, "Attempting to hold an already held image!");
+ return false;
+ }
+
+ struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed holding external image!");
+ return false;
+ }
+
+ VkImageLayout layout = params->layout;
+ if (params->out_layout) {
+ // For planar images, arbitrarily pick the current image layout of the
+ // first plane. This should be fine in practice, since all planes will
+ // share the same usage capabilities.
+ if (tex_vk->num_planes) {
+ layout = tex_vk->planes[0]->layout;
+ } else {
+ layout = tex_vk->layout;
+ }
+ }
+
+ bool may_invalidate = true;
+ if (!tex_vk->num_planes) {
+ may_invalidate &= tex_vk->may_invalidate;
+ vk_tex_barrier(gpu, cmd, params->tex, VK_PIPELINE_STAGE_2_NONE,
+ 0, layout, params->qf);
+ }
+
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ may_invalidate &= tex_vk->planes[i]->may_invalidate;
+ vk_tex_barrier(gpu, cmd, params->tex->planes[i],
+ VK_PIPELINE_STAGE_2_NONE, 0, layout, params->qf);
+ }
+
+ vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, params->semaphore);
+ bool ok = CMD_SUBMIT(&cmd);
+
+ if (!tex_vk->num_planes) {
+ tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL;
+ tex_vk->held = ok;
+ }
+
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ struct pl_tex_vk *plane_vk = tex_vk->planes[i];
+ plane_vk->sem.write.queue = plane_vk->sem.read.queue = NULL;
+ plane_vk->held = ok;
+ }
+
+ if (ok && params->out_layout)
+ *params->out_layout = may_invalidate ? VK_IMAGE_LAYOUT_UNDEFINED : layout;
+
+ return ok;
+}
+
+void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params)
+{
+ struct pl_tex_vk *tex_vk = PL_PRIV(params->tex);
+ if (tex_vk->num_planes) {
+ struct pl_vulkan_release_params plane_pars = *params;
+ for (int i = 0; i < tex_vk->num_planes; i++) {
+ plane_pars.tex = params->tex->planes[i];
+ pl_vulkan_release_ex(gpu, &plane_pars);
+ }
+ return;
+ }
+
+ if (!tex_vk->held) {
+ PL_ERR(gpu, "Attempting to release an unheld image?");
+ return;
+ }
+
+ if (params->semaphore.sem)
+ PL_ARRAY_APPEND(params->tex, tex_vk->ext_deps, params->semaphore);
+
+ tex_vk->qf = params->qf;
+ tex_vk->layout = params->layout;
+ tex_vk->held = false;
+}
+
+bool pl_vulkan_hold(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+ pl_vulkan_sem sem_out)
+{
+ return pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+ .tex = tex,
+ .layout = layout,
+ .semaphore = sem_out,
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ ));
+}
+
+bool pl_vulkan_hold_raw(pl_gpu gpu, pl_tex tex,
+ VkImageLayout *out_layout,
+ pl_vulkan_sem sem_out)
+{
+ return pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+ .tex = tex,
+ .out_layout = out_layout,
+ .semaphore = sem_out,
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ ));
+}
+
+void pl_vulkan_release(pl_gpu gpu, pl_tex tex, VkImageLayout layout,
+ pl_vulkan_sem sem_in)
+{
+ pl_vulkan_release_ex(gpu, pl_vulkan_release_params(
+ .tex = tex,
+ .layout = layout,
+ .semaphore = sem_in,
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ ));
+}
diff --git a/src/vulkan/malloc.c b/src/vulkan/malloc.c
new file mode 100644
index 0000000..c35183b
--- /dev/null
+++ b/src/vulkan/malloc.c
@@ -0,0 +1,1058 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "malloc.h"
+#include "command.h"
+#include "utils.h"
+#include "pl_thread.h"
+
+#ifdef PL_HAVE_UNIX
+#include <errno.h>
+#include <unistd.h>
+#endif
+
+// Controls the page size alignment, to help coalesce allocations into the same
+// slab. Pages are rounded up to multiples of this value. (Default: 4 KB)
+#define PAGE_SIZE_ALIGN (1LLU << 12)
+
+// Controls the minimum/maximum number of pages for new slabs. As slabs are
+// exhausted of memory, the number of pages per new slab grows exponentially,
+// starting with the minimum until the maximum is reached.
+//
+// Note: The maximum must never exceed the size of `vk_slab.spacemap`.
+#define MINIMUM_PAGE_COUNT 4
+#define MAXIMUM_PAGE_COUNT (sizeof(uint64_t) * 8)
+
+// Controls the maximum page size. Any allocations above this threshold
+// (absolute size or fraction of VRAM, whichever is higher) will be served by
+// dedicated allocations. (Default: 64 MB or 1/16 of VRAM)
+#define MAXIMUM_PAGE_SIZE_ABSOLUTE (1LLU << 26)
+#define MAXIMUM_PAGE_SIZE_RELATIVE 16
+
+// Controls the minimum slab size, to avoid excessive re-allocation of very
+// small slabs. (Default: 256 KB)
+#define MINIMUM_SLAB_SIZE (1LLU << 18)
+
+// How long to wait before garbage collecting empty slabs. Slabs older than
+// this many invocations of `vk_malloc_garbage_collect` will be released.
+#define MAXIMUM_SLAB_AGE 32
+
+// A single slab represents a contiguous region of allocated memory. Actual
+// allocations are served as pages of this. Slabs are organized into pools,
+// each of which contains a list of slabs of differing page sizes.
+struct vk_slab {
+ pl_mutex lock;
+ pl_debug_tag debug_tag; // debug tag of the triggering allocation
+ VkDeviceMemory mem; // underlying device allocation
+ VkDeviceSize size; // total allocated size of `mem`
+ VkMemoryType mtype; // underlying memory type
+ bool dedicated; // slab is allocated specifically for one object
+ bool imported; // slab represents an imported memory allocation
+
+ // free space accounting (only for non-dedicated slabs)
+ uint64_t spacemap; // bitset of available pages
+ size_t pagesize; // size in bytes per page
+ size_t used; // number of bytes actually in use
+ uint64_t age; // timestamp of last use
+
+ // optional, depends on the memory type:
+ VkBuffer buffer; // buffer spanning the entire slab
+ void *data; // mapped memory corresponding to `mem`
+ bool coherent; // mapped memory is coherent
+ union pl_handle handle; // handle associated with this device memory
+ enum pl_handle_type handle_type;
+};
+
+// Represents a single memory pool. We keep track of a vk_pool for each
+// combination of malloc parameters. This shouldn't actually be that many in
+// practice, because some combinations simply never occur, and others will
+// generally be the same for the same objects.
+//
+// Note: `vk_pool` addresses are not immutable, so we mustn't expose any
+// dangling references to a `vk_pool` from e.g. `vk_memslice.priv = vk_slab`.
+struct vk_pool {
+ struct vk_malloc_params params; // allocation params (with some fields nulled)
+ PL_ARRAY(struct vk_slab *) slabs; // array of slabs, unsorted
+ int index; // running index in `vk_malloc.pools`
+};
+
+// The overall state of the allocator, which keeps track of a vk_pool for each
+// memory type.
+struct vk_malloc {
+ struct vk_ctx *vk;
+ pl_mutex lock;
+ VkPhysicalDeviceMemoryProperties props;
+ size_t maximum_page_size;
+ PL_ARRAY(struct vk_pool) pools;
+ uint64_t age;
+};
+
+static inline float efficiency(size_t used, size_t total)
+{
+ if (!total)
+ return 100.0;
+
+ return 100.0f * used / total;
+}
+
+static const char *print_size(char buf[8], size_t size)
+{
+ const char *suffixes = "\0KMG";
+ while (suffixes[1] && size > 9999) {
+ size >>= 10;
+ suffixes++;
+ }
+
+ int ret = *suffixes ? snprintf(buf, 8, "%4zu%c", size, *suffixes)
+ : snprintf(buf, 8, "%5zu", size);
+
+ return ret >= 0 ? buf : "(error)";
+}
+
+#define PRINT_SIZE(x) (print_size((char[8]){0}, (size_t) (x)))
+
+void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level lev)
+{
+ struct vk_ctx *vk = ma->vk;
+ size_t total_size = 0;
+ size_t total_used = 0;
+ size_t total_res = 0;
+
+ PL_MSG(vk, lev, "Memory heaps supported by device:");
+ for (int i = 0; i < ma->props.memoryHeapCount; i++) {
+ VkMemoryHeap heap = ma->props.memoryHeaps[i];
+ PL_MSG(vk, lev, " %d: flags 0x%x size %s",
+ i, (unsigned) heap.flags, PRINT_SIZE(heap.size));
+ }
+
+ PL_DEBUG(vk, "Memory types supported by device:");
+ for (int i = 0; i < ma->props.memoryTypeCount; i++) {
+ VkMemoryType type = ma->props.memoryTypes[i];
+ PL_DEBUG(vk, " %d: flags 0x%x heap %d",
+ i, (unsigned) type.propertyFlags, (int) type.heapIndex);
+ }
+
+ pl_mutex_lock(&ma->lock);
+ for (int i = 0; i < ma->pools.num; i++) {
+ struct vk_pool *pool = &ma->pools.elem[i];
+ const struct vk_malloc_params *par = &pool->params;
+
+ PL_MSG(vk, lev, "Memory pool %d:", i);
+ PL_MSG(vk, lev, " Compatible types: 0x%"PRIx32, par->reqs.memoryTypeBits);
+ if (par->required)
+ PL_MSG(vk, lev, " Required flags: 0x%"PRIx32, par->required);
+ if (par->optimal)
+ PL_MSG(vk, lev, " Optimal flags: 0x%"PRIx32, par->optimal);
+ if (par->buf_usage)
+ PL_MSG(vk, lev, " Buffer flags: 0x%"PRIx32, par->buf_usage);
+ if (par->export_handle)
+ PL_MSG(vk, lev, " Export handle: 0x%x", par->export_handle);
+
+ size_t pool_size = 0;
+ size_t pool_used = 0;
+ size_t pool_res = 0;
+
+ for (int j = 0; j < pool->slabs.num; j++) {
+ struct vk_slab *slab = pool->slabs.elem[j];
+ pl_mutex_lock(&slab->lock);
+
+ size_t avail = __builtin_popcountll(slab->spacemap) * slab->pagesize;
+ size_t slab_res = slab->size - avail;
+
+ PL_MSG(vk, lev, " Slab %2d: %8"PRIx64" x %s: "
+ "%s used %s res %s alloc from heap %d, efficiency %.2f%% [%s]",
+ j, slab->spacemap, PRINT_SIZE(slab->pagesize),
+ PRINT_SIZE(slab->used), PRINT_SIZE(slab_res),
+ PRINT_SIZE(slab->size), (int) slab->mtype.heapIndex,
+ efficiency(slab->used, slab_res),
+ PL_DEF(slab->debug_tag, "unknown"));
+
+ pool_size += slab->size;
+ pool_used += slab->used;
+ pool_res += slab_res;
+ pl_mutex_unlock(&slab->lock);
+ }
+
+ PL_MSG(vk, lev, " Pool summary: %s used %s res %s alloc, "
+ "efficiency %.2f%%, utilization %.2f%%",
+ PRINT_SIZE(pool_used), PRINT_SIZE(pool_res),
+ PRINT_SIZE(pool_size), efficiency(pool_used, pool_res),
+ efficiency(pool_res, pool_size));
+
+ total_size += pool_size;
+ total_used += pool_used;
+ total_res += pool_res;
+ }
+ pl_mutex_unlock(&ma->lock);
+
+ PL_MSG(vk, lev, "Memory summary: %s used %s res %s alloc, "
+ "efficiency %.2f%%, utilization %.2f%%, max page: %s",
+ PRINT_SIZE(total_used), PRINT_SIZE(total_res),
+ PRINT_SIZE(total_size), efficiency(total_used, total_res),
+ efficiency(total_res, total_size),
+ PRINT_SIZE(ma->maximum_page_size));
+}
+
+static void slab_free(struct vk_ctx *vk, struct vk_slab *slab)
+{
+ if (!slab)
+ return;
+
+#ifndef NDEBUG
+ if (!slab->dedicated && slab->used > 0) {
+ PL_WARN(vk, "Leaked %zu bytes of vulkan memory!", slab->used);
+ PL_WARN(vk, "slab total size: %zu bytes, heap: %d, flags: 0x%"PRIX64,
+ (size_t) slab->size, (int) slab->mtype.heapIndex,
+ (uint64_t) slab->mtype.propertyFlags);
+ if (slab->debug_tag)
+ PL_WARN(vk, "last used for: %s", slab->debug_tag);
+ pl_log_stack_trace(vk->log, PL_LOG_WARN);
+ pl_debug_abort();
+ }
+#endif
+
+ if (slab->imported) {
+ switch (slab->handle_type) {
+ case PL_HANDLE_FD:
+ case PL_HANDLE_DMA_BUF:
+ PL_TRACE(vk, "Unimporting slab of size %s from fd: %d",
+ PRINT_SIZE(slab->size), slab->handle.fd);
+ break;
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+#ifdef PL_HAVE_WIN32
+ PL_TRACE(vk, "Unimporting slab of size %s from handle: %p",
+ PRINT_SIZE(slab->size), (void *) slab->handle.handle);
+#endif
+ break;
+ case PL_HANDLE_HOST_PTR:
+ PL_TRACE(vk, "Unimporting slab of size %s from ptr: %p",
+ PRINT_SIZE(slab->size), (void *) slab->handle.ptr);
+ break;
+ case PL_HANDLE_IOSURFACE:
+ case PL_HANDLE_MTL_TEX:
+ pl_unreachable();
+ }
+ } else {
+ switch (slab->handle_type) {
+ case PL_HANDLE_FD:
+ case PL_HANDLE_DMA_BUF:
+#ifdef PL_HAVE_UNIX
+ if (slab->handle.fd > -1)
+ close(slab->handle.fd);
+#endif
+ break;
+ case PL_HANDLE_WIN32:
+#ifdef PL_HAVE_WIN32
+ if (slab->handle.handle != NULL)
+ CloseHandle(slab->handle.handle);
+#endif
+ break;
+ case PL_HANDLE_WIN32_KMT:
+ // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed.
+ break;
+ case PL_HANDLE_HOST_PTR:
+ // Implicitly unmapped
+ break;
+ case PL_HANDLE_IOSURFACE:
+ case PL_HANDLE_MTL_TEX:
+ pl_unreachable();
+ }
+
+ PL_DEBUG(vk, "Freeing slab of size %s", PRINT_SIZE(slab->size));
+ }
+
+ vk->DestroyBuffer(vk->dev, slab->buffer, PL_VK_ALLOC);
+ // also implicitly unmaps the memory if needed
+ vk->FreeMemory(vk->dev, slab->mem, PL_VK_ALLOC);
+
+ pl_mutex_destroy(&slab->lock);
+ pl_free(slab);
+}
+
+// type_mask: optional
+// thread-safety: safe
+static bool find_best_memtype(const struct vk_malloc *ma, uint32_t type_mask,
+ const struct vk_malloc_params *params,
+ uint32_t *out_index)
+{
+ struct vk_ctx *vk = ma->vk;
+ int best = -1;
+
+ // The vulkan spec requires memory types to be sorted in the "optimal"
+ // order, so the first matching type we find will be the best/fastest one.
+ // That being said, we still want to prioritize memory types that have
+ // better optional flags.
+
+ type_mask &= params->reqs.memoryTypeBits;
+ for (int i = 0; i < ma->props.memoryTypeCount; i++) {
+ const VkMemoryType *mtype = &ma->props.memoryTypes[i];
+
+ // The memory type flags must include our properties
+ if ((mtype->propertyFlags & params->required) != params->required)
+ continue;
+
+ // The memory heap must be large enough for the allocation
+ VkDeviceSize heapSize = ma->props.memoryHeaps[mtype->heapIndex].size;
+ if (params->reqs.size > heapSize)
+ continue;
+
+ // The memory type must be supported by the type mask (bitfield)
+ if (!(type_mask & (1LU << i)))
+ continue;
+
+ // Calculate the score as the number of optimal property flags matched
+ int score = __builtin_popcountl(mtype->propertyFlags & params->optimal);
+ if (score > best) {
+ *out_index = i;
+ best = score;
+ }
+ }
+
+ if (best < 0) {
+ PL_ERR(vk, "Found no memory type matching property flags 0x%x and type "
+ "bits 0x%x!",
+ (unsigned) params->required, (unsigned) type_mask);
+ return false;
+ }
+
+ return true;
+}
+
+static bool buf_external_check(struct vk_ctx *vk, VkBufferUsageFlags usage,
+ enum pl_handle_type handle_type, bool import)
+{
+ if (!handle_type)
+ return true;
+
+ VkPhysicalDeviceExternalBufferInfo info = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR,
+ .usage = usage,
+ .handleType = vk_mem_handle_type(handle_type),
+ };
+
+ VkExternalBufferProperties props = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR,
+ };
+
+ if (!info.handleType)
+ return false;
+
+ vk->GetPhysicalDeviceExternalBufferProperties(vk->physd, &info, &props);
+ return vk_external_mem_check(vk, &props.externalMemoryProperties,
+ handle_type, import);
+}
+
+// thread-safety: safe
+static struct vk_slab *slab_alloc(struct vk_malloc *ma,
+ const struct vk_malloc_params *params)
+{
+ struct vk_ctx *vk = ma->vk;
+ struct vk_slab *slab = pl_alloc_ptr(NULL, slab);
+ *slab = (struct vk_slab) {
+ .age = ma->age,
+ .size = params->reqs.size,
+ .handle_type = params->export_handle,
+ .debug_tag = params->debug_tag,
+ };
+ pl_mutex_init(&slab->lock);
+
+ switch (slab->handle_type) {
+ case PL_HANDLE_FD:
+ case PL_HANDLE_DMA_BUF:
+ slab->handle.fd = -1;
+ break;
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ slab->handle.handle = NULL;
+ break;
+ case PL_HANDLE_HOST_PTR:
+ slab->handle.ptr = NULL;
+ break;
+ }
+
+ VkExportMemoryAllocateInfoKHR ext_info = {
+ .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR,
+ .handleTypes = vk_mem_handle_type(slab->handle_type),
+ };
+
+ uint32_t type_mask = UINT32_MAX;
+ if (params->buf_usage) {
+ // Queue family sharing modes don't matter for buffers, so we just
+ // set them as concurrent and stop worrying about it.
+ uint32_t qfs[3] = {0};
+ pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
+ for (int i = 0; i < vk->pools.num; i++)
+ qfs[i] = vk->pools.elem[i]->qf;
+
+ VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
+ .handleTypes = ext_info.handleTypes,
+ };
+
+ VkBufferCreateInfo binfo = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+ .pNext = slab->handle_type ? &ext_buf_info : NULL,
+ .size = slab->size,
+ .usage = params->buf_usage,
+ .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
+ : VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = vk->pools.num,
+ .pQueueFamilyIndices = qfs,
+ };
+
+ if (!buf_external_check(vk, binfo.usage, slab->handle_type, false)) {
+ PL_ERR(vk, "Failed allocating shared memory buffer: possibly "
+ "the handle type is unsupported?");
+ goto error;
+ }
+
+ VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &slab->buffer));
+ PL_VK_NAME(BUFFER, slab->buffer, "slab");
+
+ VkMemoryRequirements reqs = {0};
+ vk->GetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs);
+ slab->size = reqs.size; // this can be larger than `slab->size`
+ type_mask = reqs.memoryTypeBits;
+
+ // Note: we can ignore `reqs.align` because we always bind the buffer
+ // memory to offset 0
+ }
+
+ VkMemoryAllocateInfo minfo = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+ .allocationSize = slab->size,
+ };
+
+ if (params->export_handle)
+ vk_link_struct(&minfo, &ext_info);
+
+ VkMemoryDedicatedAllocateInfoKHR dinfo = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
+ .image = params->ded_image,
+ };
+
+ if (params->ded_image)
+ vk_link_struct(&minfo, &dinfo);
+
+ if (!find_best_memtype(ma, type_mask, params, &minfo.memoryTypeIndex))
+ goto error;
+
+ const VkMemoryType *mtype = &ma->props.memoryTypes[minfo.memoryTypeIndex];
+ PL_DEBUG(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d: %s",
+ (size_t) slab->size, (unsigned) mtype->propertyFlags,
+ (int) minfo.memoryTypeIndex, (int) mtype->heapIndex,
+ PL_DEF(params->debug_tag, "unknown"));
+
+ pl_clock_t start = pl_clock_now();
+
+ VkResult res = vk->AllocateMemory(vk->dev, &minfo, PL_VK_ALLOC, &slab->mem);
+ switch (res) {
+ case VK_ERROR_OUT_OF_DEVICE_MEMORY:
+ case VK_ERROR_OUT_OF_HOST_MEMORY:
+ PL_ERR(vk, "Allocation of size %s failed: %s!",
+ PRINT_SIZE(slab->size), vk_res_str(res));
+ vk_malloc_print_stats(ma, PL_LOG_ERR);
+ pl_log_stack_trace(vk->log, PL_LOG_ERR);
+ pl_debug_abort();
+ goto error;
+
+ default:
+ PL_VK_ASSERT(res, "vkAllocateMemory");
+ }
+
+ slab->mtype = *mtype;
+ if (mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+ VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
+ slab->coherent = mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+ }
+
+ if (slab->buffer)
+ VK(vk->BindBufferMemory(vk->dev, slab->buffer, slab->mem, 0));
+
+#ifdef PL_HAVE_UNIX
+ if (slab->handle_type == PL_HANDLE_FD ||
+ slab->handle_type == PL_HANDLE_DMA_BUF)
+ {
+ VkMemoryGetFdInfoKHR fd_info = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR,
+ .memory = slab->mem,
+ .handleType = ext_info.handleTypes,
+ };
+
+ VK(vk->GetMemoryFdKHR(vk->dev, &fd_info, &slab->handle.fd));
+ }
+#endif
+
+#ifdef PL_HAVE_WIN32
+ if (slab->handle_type == PL_HANDLE_WIN32 ||
+ slab->handle_type == PL_HANDLE_WIN32_KMT)
+ {
+ VkMemoryGetWin32HandleInfoKHR handle_info = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR,
+ .memory = slab->mem,
+ .handleType = ext_info.handleTypes,
+ };
+
+ VK(vk->GetMemoryWin32HandleKHR(vk->dev, &handle_info,
+ &slab->handle.handle));
+ }
+#endif
+
+ pl_log_cpu_time(vk->log, start, pl_clock_now(), "allocating slab");
+
+ // free space accounting is done by the caller
+ return slab;
+
+error:
+ if (params->debug_tag)
+ PL_ERR(vk, " for malloc: %s", params->debug_tag);
+ slab_free(vk, slab);
+ return NULL;
+}
+
+static void pool_uninit(struct vk_ctx *vk, struct vk_pool *pool)
+{
+ for (int i = 0; i < pool->slabs.num; i++)
+ slab_free(vk, pool->slabs.elem[i]);
+
+ pl_free(pool->slabs.elem);
+ *pool = (struct vk_pool) {0};
+}
+
+struct vk_malloc *vk_malloc_create(struct vk_ctx *vk)
+{
+ struct vk_malloc *ma = pl_zalloc_ptr(NULL, ma);
+ pl_mutex_init(&ma->lock);
+ vk->GetPhysicalDeviceMemoryProperties(vk->physd, &ma->props);
+ ma->vk = vk;
+
+ // Determine maximum page size
+ ma->maximum_page_size = MAXIMUM_PAGE_SIZE_ABSOLUTE;
+ for (int i = 0; i < ma->props.memoryHeapCount; i++) {
+ VkMemoryHeap heap = ma->props.memoryHeaps[i];
+ if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
+ size_t size_max = heap.size / MAXIMUM_PAGE_SIZE_RELATIVE;
+ ma->maximum_page_size = PL_MAX(ma->maximum_page_size, size_max);
+ }
+ }
+
+ vk_malloc_print_stats(ma, PL_LOG_INFO);
+ return ma;
+}
+
+void vk_malloc_destroy(struct vk_malloc **ma_ptr)
+{
+ struct vk_malloc *ma = *ma_ptr;
+ if (!ma)
+ return;
+
+ vk_malloc_print_stats(ma, PL_LOG_DEBUG);
+ for (int i = 0; i < ma->pools.num; i++)
+ pool_uninit(ma->vk, &ma->pools.elem[i]);
+
+ pl_mutex_destroy(&ma->lock);
+ pl_free_ptr(ma_ptr);
+}
+
+void vk_malloc_garbage_collect(struct vk_malloc *ma)
+{
+ struct vk_ctx *vk = ma->vk;
+
+ pl_mutex_lock(&ma->lock);
+ ma->age++;
+
+ for (int i = 0; i < ma->pools.num; i++) {
+ struct vk_pool *pool = &ma->pools.elem[i];
+ for (int n = 0; n < pool->slabs.num; n++) {
+ struct vk_slab *slab = pool->slabs.elem[n];
+ pl_mutex_lock(&slab->lock);
+ if (slab->used || (ma->age - slab->age) <= MAXIMUM_SLAB_AGE) {
+ pl_mutex_unlock(&slab->lock);
+ continue;
+ }
+
+ PL_DEBUG(vk, "Garbage collected slab of size %s from pool %d",
+ PRINT_SIZE(slab->size), pool->index);
+
+ pl_mutex_unlock(&slab->lock);
+ slab_free(ma->vk, slab);
+ PL_ARRAY_REMOVE_AT(pool->slabs, n--);
+ }
+ }
+
+ pl_mutex_unlock(&ma->lock);
+}
+
+pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import)
+{
+ struct vk_ctx *vk = ma->vk;
+ pl_handle_caps caps = 0;
+
+ for (int i = 0; vk_mem_handle_list[i]; i++) {
+ // Try seeing if we could allocate a "basic" buffer using these
+ // capabilities, with no fancy buffer usage. More specific checks will
+ // happen down the line at VkBuffer creation time, but this should give
+ // us a rough idea of what the driver supports.
+ enum pl_handle_type type = vk_mem_handle_list[i];
+ if (buf_external_check(vk, VK_BUFFER_USAGE_TRANSFER_DST_BIT, type, import))
+ caps |= type;
+ }
+
+ return caps;
+}
+
+void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice)
+{
+ struct vk_ctx *vk = ma->vk;
+ struct vk_slab *slab = slice->priv;
+ if (!slab || slab->dedicated) {
+ slab_free(vk, slab);
+ goto done;
+ }
+
+ pl_mutex_lock(&slab->lock);
+
+ int page_idx = slice->offset / slab->pagesize;
+ slab->spacemap |= 0x1LLU << page_idx;
+ slab->used -= slice->size;
+ slab->age = ma->age;
+ pl_assert(slab->used >= 0);
+
+ pl_mutex_unlock(&slab->lock);
+
+done:
+ *slice = (struct vk_memslice) {0};
+}
+
+static inline bool pool_params_eq(const struct vk_malloc_params *a,
+ const struct vk_malloc_params *b)
+{
+ return a->reqs.size == b->reqs.size &&
+ a->reqs.alignment == b->reqs.alignment &&
+ a->reqs.memoryTypeBits == b->reqs.memoryTypeBits &&
+ a->required == b->required &&
+ a->optimal == b->optimal &&
+ a->buf_usage == b->buf_usage &&
+ a->export_handle == b->export_handle;
+}
+
+static struct vk_pool *find_pool(struct vk_malloc *ma,
+ const struct vk_malloc_params *params)
+{
+ pl_assert(!params->import_handle);
+ pl_assert(!params->ded_image);
+
+ struct vk_malloc_params fixed = *params;
+ fixed.reqs.alignment = 0;
+ fixed.reqs.size = 0;
+ fixed.shared_mem = (struct pl_shared_mem) {0};
+
+ for (int i = 0; i < ma->pools.num; i++) {
+ if (pool_params_eq(&ma->pools.elem[i].params, &fixed))
+ return &ma->pools.elem[i];
+ }
+
+ // Not found => add it
+ PL_ARRAY_GROW(ma, ma->pools);
+ size_t idx = ma->pools.num++;
+ ma->pools.elem[idx] = (struct vk_pool) {
+ .params = fixed,
+ .index = idx,
+ };
+ return &ma->pools.elem[idx];
+}
+
+// Returns a suitable memory page from the pool. A new slab will be allocated
+// under the hood, if necessary.
+//
+// Note: This locks the slab it returns
+static struct vk_slab *pool_get_page(struct vk_malloc *ma, struct vk_pool *pool,
+ size_t size, size_t align,
+ VkDeviceSize *offset)
+{
+ struct vk_slab *slab = NULL;
+ int slab_pages = MINIMUM_PAGE_COUNT;
+ size = PL_ALIGN2(size, PAGE_SIZE_ALIGN);
+ const size_t pagesize = PL_ALIGN(size, align);
+
+ for (int i = 0; i < pool->slabs.num; i++) {
+ slab = pool->slabs.elem[i];
+ if (slab->pagesize < size)
+ continue;
+ if (slab->pagesize > pagesize * MINIMUM_PAGE_COUNT) // rough heuristic
+ continue;
+ if (slab->pagesize % align)
+ continue;
+
+ pl_mutex_lock(&slab->lock);
+ int page_idx = __builtin_ffsll(slab->spacemap);
+ if (!page_idx--) {
+ pl_mutex_unlock(&slab->lock);
+ // Increase the number of slabs to allocate for new slabs the
+ // more existing full slabs exist for this size range
+ slab_pages = PL_MIN(slab_pages << 1, MAXIMUM_PAGE_COUNT);
+ continue;
+ }
+
+ slab->spacemap ^= 0x1LLU << page_idx;
+ *offset = page_idx * slab->pagesize;
+ return slab;
+ }
+
+ // Otherwise, allocate a new vk_slab and append it to the list.
+ VkDeviceSize slab_size = slab_pages * pagesize;
+ pl_static_assert(MINIMUM_SLAB_SIZE <= PAGE_SIZE_ALIGN * MAXIMUM_PAGE_COUNT);
+ const VkDeviceSize max_slab_size = ma->maximum_page_size * MINIMUM_PAGE_COUNT;
+ pl_assert(pagesize <= ma->maximum_page_size);
+ slab_size = PL_CLAMP(slab_size, MINIMUM_SLAB_SIZE, max_slab_size);
+ slab_pages = slab_size / pagesize;
+ slab_size = slab_pages * pagesize; // max_slab_size may be npot2, trim excess
+
+ struct vk_malloc_params params = pool->params;
+ params.reqs.size = slab_size;
+
+ // Don't hold the lock while allocating the slab, because it can be a
+ // potentially very costly operation.
+ pl_mutex_unlock(&ma->lock);
+ slab = slab_alloc(ma, &params);
+ pl_mutex_lock(&ma->lock);
+ if (!slab)
+ return NULL;
+ pl_mutex_lock(&slab->lock);
+
+ slab->spacemap = (slab_pages == sizeof(uint64_t) * 8) ? ~0LLU : ~(~0LLU << slab_pages);
+ slab->pagesize = pagesize;
+ PL_ARRAY_APPEND(NULL, pool->slabs, slab);
+
+ // Return the first page in this newly allocated slab
+ slab->spacemap ^= 0x1;
+ *offset = 0;
+ return slab;
+}
+
+static bool vk_malloc_import(struct vk_malloc *ma, struct vk_memslice *out,
+ const struct vk_malloc_params *params)
+{
+ struct vk_ctx *vk = ma->vk;
+ VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type;
+ vk_handle_type = vk_mem_handle_type(params->import_handle);
+
+ struct vk_slab *slab = NULL;
+ const struct pl_shared_mem *shmem = &params->shared_mem;
+
+ VkMemoryDedicatedAllocateInfoKHR dinfo = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR,
+ .image = params->ded_image,
+ };
+
+ VkImportMemoryFdInfoKHR fdinfo = {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR,
+ .handleType = vk_handle_type,
+ .fd = -1,
+ };
+
+ VkImportMemoryHostPointerInfoEXT ptrinfo = {
+ .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT,
+ .handleType = vk_handle_type,
+ };
+
+ VkMemoryAllocateInfo ainfo = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+ .allocationSize = shmem->size,
+ };
+
+ if (params->ded_image)
+ vk_link_struct(&ainfo, &dinfo);
+
+ VkBuffer buffer = VK_NULL_HANDLE;
+ VkMemoryRequirements reqs = params->reqs;
+
+ if (params->buf_usage) {
+ uint32_t qfs[3] = {0};
+ pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs));
+ for (int i = 0; i < vk->pools.num; i++)
+ qfs[i] = vk->pools.elem[i]->qf;
+
+ VkExternalMemoryBufferCreateInfoKHR ext_buf_info = {
+ .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR,
+ .handleTypes = vk_handle_type,
+ };
+
+ VkBufferCreateInfo binfo = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+ .pNext = &ext_buf_info,
+ .size = shmem->size,
+ .usage = params->buf_usage,
+ .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT
+ : VK_SHARING_MODE_EXCLUSIVE,
+ .queueFamilyIndexCount = vk->pools.num,
+ .pQueueFamilyIndices = qfs,
+ };
+
+ VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &buffer));
+ PL_VK_NAME(BUFFER, buffer, "imported");
+
+ vk->GetBufferMemoryRequirements(vk->dev, buffer, &reqs);
+ }
+
+ if (reqs.size > shmem->size) {
+ PL_ERR(vk, "Imported object requires %zu bytes, larger than the "
+ "provided size %zu!",
+ (size_t) reqs.size, shmem->size);
+ goto error;
+ }
+
+ if (shmem->offset % reqs.alignment || shmem->offset % params->reqs.alignment) {
+ PL_ERR(vk, "Imported object offset %zu conflicts with alignment %zu!",
+ shmem->offset, pl_lcm(reqs.alignment, params->reqs.alignment));
+ goto error;
+ }
+
+ switch (params->import_handle) {
+#ifdef PL_HAVE_UNIX
+ case PL_HANDLE_DMA_BUF: {
+ if (!vk->GetMemoryFdPropertiesKHR) {
+ PL_ERR(vk, "Importing PL_HANDLE_DMA_BUF requires %s.",
+ VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME);
+ goto error;
+ }
+
+ VkMemoryFdPropertiesKHR fdprops = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR,
+ };
+
+ VK(vk->GetMemoryFdPropertiesKHR(vk->dev,
+ vk_handle_type,
+ shmem->handle.fd,
+ &fdprops));
+
+ // We dup() the fd to make it safe to import the same original fd
+ // multiple times.
+ fdinfo.fd = dup(shmem->handle.fd);
+ if (fdinfo.fd == -1) {
+ PL_ERR(vk, "Failed to dup() fd (%d) when importing memory: %s",
+ fdinfo.fd, strerror(errno));
+ goto error;
+ }
+
+ reqs.memoryTypeBits &= fdprops.memoryTypeBits;
+ vk_link_struct(&ainfo, &fdinfo);
+ break;
+ }
+#else // !PL_HAVE_UNIX
+ case PL_HANDLE_DMA_BUF:
+ PL_ERR(vk, "PL_HANDLE_DMA_BUF requires building with UNIX support!");
+ goto error;
+#endif
+
+ case PL_HANDLE_HOST_PTR: {
+ VkMemoryHostPointerPropertiesEXT ptrprops = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT,
+ };
+
+ VK(vk->GetMemoryHostPointerPropertiesEXT(vk->dev, vk_handle_type,
+ shmem->handle.ptr,
+ &ptrprops));
+
+ ptrinfo.pHostPointer = (void *) shmem->handle.ptr;
+ reqs.memoryTypeBits &= ptrprops.memoryTypeBits;
+ vk_link_struct(&ainfo, &ptrinfo);
+ break;
+ }
+
+ case PL_HANDLE_FD:
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ case PL_HANDLE_IOSURFACE:
+ case PL_HANDLE_MTL_TEX:
+ PL_ERR(vk, "vk_malloc_import: unsupported handle type %d",
+ params->import_handle);
+ goto error;
+ }
+
+ if (!find_best_memtype(ma, reqs.memoryTypeBits, params, &ainfo.memoryTypeIndex)) {
+ PL_ERR(vk, "No compatible memory types offered for imported memory!");
+ goto error;
+ }
+
+ VkDeviceMemory vkmem = VK_NULL_HANDLE;
+ VK(vk->AllocateMemory(vk->dev, &ainfo, PL_VK_ALLOC, &vkmem));
+
+ slab = pl_alloc_ptr(NULL, slab);
+ *slab = (struct vk_slab) {
+ .mem = vkmem,
+ .dedicated = true,
+ .imported = true,
+ .buffer = buffer,
+ .size = shmem->size,
+ .handle_type = params->import_handle,
+ };
+ pl_mutex_init(&slab->lock);
+
+ *out = (struct vk_memslice) {
+ .vkmem = vkmem,
+ .buf = buffer,
+ .size = shmem->size - shmem->offset,
+ .offset = shmem->offset,
+ .shared_mem = *shmem,
+ .priv = slab,
+ };
+
+ switch (params->import_handle) {
+ case PL_HANDLE_DMA_BUF:
+ case PL_HANDLE_FD:
+ PL_TRACE(vk, "Imported %s bytes from fd: %d%s",
+ PRINT_SIZE(slab->size), shmem->handle.fd,
+ params->ded_image ? " (dedicated)" : "");
+ // fd ownership is transferred at this point.
+ slab->handle.fd = fdinfo.fd;
+ fdinfo.fd = -1;
+ break;
+ case PL_HANDLE_HOST_PTR:
+ PL_TRACE(vk, "Imported %s bytes from ptr: %p%s",
+ PRINT_SIZE(slab->size), shmem->handle.ptr,
+ params->ded_image ? " (dedicated" : "");
+ slab->handle.ptr = ptrinfo.pHostPointer;
+ break;
+ case PL_HANDLE_WIN32:
+ case PL_HANDLE_WIN32_KMT:
+ case PL_HANDLE_IOSURFACE:
+ case PL_HANDLE_MTL_TEX:
+ break;
+ }
+
+ VkMemoryPropertyFlags flags = ma->props.memoryTypes[ainfo.memoryTypeIndex].propertyFlags;
+ if (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+ VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data));
+ slab->coherent = flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+ out->data = (uint8_t *) slab->data + out->offset;
+ out->coherent = slab->coherent;
+ if (!slab->coherent) {
+ // Use entire buffer range, since this is a dedicated memory
+ // allocation. This avoids issues with noncoherent atomicity
+ out->map_offset = 0;
+ out->map_size = VK_WHOLE_SIZE;
+
+ // Mapping does not implicitly invalidate mapped memory
+ VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
+ .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+ .memory = slab->mem,
+ .offset = out->map_offset,
+ .size = out->map_size,
+ }));
+ }
+ }
+
+ if (buffer)
+ VK(vk->BindBufferMemory(vk->dev, buffer, vkmem, 0));
+
+ return true;
+
+error:
+ if (params->debug_tag)
+ PL_ERR(vk, " for malloc: %s", params->debug_tag);
+ vk->DestroyBuffer(vk->dev, buffer, PL_VK_ALLOC);
+#ifdef PL_HAVE_UNIX
+ if (fdinfo.fd > -1)
+ close(fdinfo.fd);
+#endif
+ pl_free(slab);
+ *out = (struct vk_memslice) {0};
+ return false;
+}
+
+size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags)
+{
+ size_t avail = 0;
+ for (int i = 0; i < ma->props.memoryTypeCount; i++) {
+ const VkMemoryType *mtype = &ma->props.memoryTypes[i];
+ if ((mtype->propertyFlags & flags) != flags)
+ continue;
+ avail = PL_MAX(avail, ma->props.memoryHeaps[mtype->heapIndex].size);
+ }
+
+ return avail;
+}
+
+bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out,
+ const struct vk_malloc_params *params)
+{
+ struct vk_ctx *vk = ma->vk;
+ pl_assert(!params->import_handle || !params->export_handle);
+ if (params->import_handle)
+ return vk_malloc_import(ma, out, params);
+
+ pl_assert(params->reqs.size);
+ size_t size = params->reqs.size;
+ size_t align = params->reqs.alignment;
+ align = pl_lcm(align, vk->props.limits.bufferImageGranularity);
+ align = pl_lcm(align, vk->props.limits.nonCoherentAtomSize);
+
+ struct vk_slab *slab;
+ VkDeviceSize offset;
+
+ if (params->ded_image || size > ma->maximum_page_size) {
+ slab = slab_alloc(ma, params);
+ if (!slab)
+ return false;
+ slab->dedicated = true;
+ offset = 0;
+ } else {
+ pl_mutex_lock(&ma->lock);
+ struct vk_pool *pool = find_pool(ma, params);
+ slab = pool_get_page(ma, pool, size, align, &offset);
+ pl_mutex_unlock(&ma->lock);
+ if (!slab) {
+ PL_ERR(ma->vk, "No slab to serve request for %s bytes (with "
+ "alignment 0x%zx) in pool %d!",
+ PRINT_SIZE(size), align, pool->index);
+ return false;
+ }
+
+ // For accounting, just treat the alignment as part of the used size.
+ // Doing it this way makes sure that the sizes reported to vk_memslice
+ // consumers are always aligned properly.
+ size = PL_ALIGN(size, align);
+ slab->used += size;
+ slab->age = ma->age;
+ if (params->debug_tag)
+ slab->debug_tag = params->debug_tag;
+ pl_mutex_unlock(&slab->lock);
+ }
+
+ pl_assert(offset % align == 0);
+ *out = (struct vk_memslice) {
+ .vkmem = slab->mem,
+ .offset = offset,
+ .size = size,
+ .buf = slab->buffer,
+ .data = slab->data ? (uint8_t *) slab->data + offset : 0x0,
+ .coherent = slab->coherent,
+ .map_offset = slab->data ? offset : 0,
+ .map_size = slab->data ? size : 0,
+ .priv = slab,
+ .shared_mem = {
+ .handle = slab->handle,
+ .offset = offset,
+ .size = slab->size,
+ },
+ };
+ return true;
+}
diff --git a/src/vulkan/malloc.h b/src/vulkan/malloc.h
new file mode 100644
index 0000000..115352e
--- /dev/null
+++ b/src/vulkan/malloc.h
@@ -0,0 +1,72 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// All memory allocated from a vk_malloc MUST be explicitly released by
+// the caller before vk_malloc_destroy is called.
+struct vk_malloc *vk_malloc_create(struct vk_ctx *vk);
+void vk_malloc_destroy(struct vk_malloc **ma);
+
+// Get the supported handle types for this malloc instance
+pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import);
+
+// Represents a single "slice" of generic (non-buffer) memory, plus some
+// metadata for accounting. This struct is essentially read-only.
+struct vk_memslice {
+ VkDeviceMemory vkmem;
+ VkDeviceSize offset;
+ VkDeviceSize size;
+ void *priv;
+ // depending on the type/flags:
+ struct pl_shared_mem shared_mem;
+ VkBuffer buf; // associated buffer (when `buf_usage` is nonzero)
+ void *data; // pointer to slice (for persistently mapped slices)
+ bool coherent; // whether `data` is coherent
+ VkDeviceSize map_offset; // can be larger than offset/size
+ VkDeviceSize map_size;
+};
+
+struct vk_malloc_params {
+ VkMemoryRequirements reqs;
+ VkMemoryPropertyFlags required;
+ VkMemoryPropertyFlags optimal;
+ VkBufferUsageFlags buf_usage;
+ VkImage ded_image; // for dedicated image allocations
+ enum pl_handle_type export_handle;
+ enum pl_handle_type import_handle;
+ struct pl_shared_mem shared_mem; // for `import_handle`
+ pl_debug_tag debug_tag;
+};
+
+// Returns the amount of available memory matching a given set of property
+// flags. Always returns the highest single allocation, not the combined total.
+size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags);
+
+bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out,
+ const struct vk_malloc_params *params);
+
+void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice);
+
+// Clean up unused slabs. Call this roughly once per frame to reduce
+// memory pressure / memory leaks.
+void vk_malloc_garbage_collect(struct vk_malloc *ma);
+
+// For debugging purposes. Doesn't include dedicated slab allocations!
+void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level);
diff --git a/src/vulkan/meson.build b/src/vulkan/meson.build
new file mode 100644
index 0000000..64c5572
--- /dev/null
+++ b/src/vulkan/meson.build
@@ -0,0 +1,59 @@
+vulkan_build = get_option('vulkan')
+vulkan_link = get_option('vk-proc-addr')
+vulkan_loader = dependency('vulkan', required: false)
+vulkan_headers = vulkan_loader.partial_dependency(includes: true, compile_args: true)
+registry_xml = get_option('vulkan-registry')
+
+# Prefer our Vulkan headers for portability
+vulkan_headers_dir = thirdparty/'Vulkan-Headers'
+vulkan_headers_inc = include_directories()
+if fs.is_dir(vulkan_headers_dir/'include')
+ vulkan_headers = declare_dependency()
+ vulkan_headers_inc = include_directories('../../3rdparty/Vulkan-Headers/include')
+ # Force the use of this vk.xml because it has to be in sync with the headers
+ registry_xml = vulkan_headers_dir/'registry/vk.xml'
+endif
+
+vulkan_build = vulkan_build.require(
+ cc.has_header_symbol('vulkan/vulkan_core.h', 'VK_VERSION_1_3',
+ include_directories: vulkan_headers_inc,
+ dependencies: vulkan_headers),
+ error_message: 'vulkan.h was not found on the system, nor inside ' +
+ '`3rdparty/Vulkan-Headers`. Please run `git submodule update --init` ' +
+ 'followed by `meson --wipe`.')
+components.set('vulkan', vulkan_build.allowed())
+
+vulkan_link = vulkan_link.require(vulkan_loader.found() and vulkan_build.allowed())
+components.set('vk-proc-addr', vulkan_link.allowed())
+
+build_deps += vulkan_headers
+
+if vulkan_build.allowed()
+ sources += [
+ 'vulkan/command.c',
+ 'vulkan/context.c',
+ 'vulkan/formats.c',
+ 'vulkan/gpu.c',
+ 'vulkan/gpu_buf.c',
+ 'vulkan/gpu_tex.c',
+ 'vulkan/gpu_pass.c',
+ 'vulkan/malloc.c',
+ 'vulkan/swapchain.c',
+ 'vulkan/utils.c',
+ ]
+
+ datadir = get_option('prefix') / get_option('datadir')
+ sources += custom_target('utils_gen.c',
+ input: 'utils_gen.py',
+ output: 'utils_gen.c',
+ command: [python, '@INPUT@', datadir, registry_xml, '@OUTPUT@'],
+ env: python_env,
+ )
+
+ if vulkan_link.allowed()
+ build_deps += vulkan_loader
+ tests += 'vulkan.c'
+ endif
+else
+ sources += 'vulkan/stubs.c'
+endif
diff --git a/src/vulkan/stubs.c b/src/vulkan/stubs.c
new file mode 100644
index 0000000..0c0738e
--- /dev/null
+++ b/src/vulkan/stubs.c
@@ -0,0 +1,108 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../common.h"
+#include "log.h"
+
+#include <libplacebo/vulkan.h>
+
+const struct pl_vk_inst_params pl_vk_inst_default_params = {0};
+const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS };
+
+pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params)
+{
+ pl_fatal(log, "libplacebo compiled without Vulkan support!");
+ return NULL;
+}
+
+void pl_vk_inst_destroy(pl_vk_inst *pinst)
+{
+ pl_vk_inst inst = *pinst;
+ pl_assert(!inst);
+}
+
+pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params)
+{
+ pl_fatal(log, "libplacebo compiled without Vulkan support!");
+ return NULL;
+}
+
+void pl_vulkan_destroy(pl_vulkan *pvk)
+{
+ pl_vulkan vk = *pvk;
+ pl_assert(!vk);
+}
+
+pl_vulkan pl_vulkan_get(pl_gpu gpu)
+{
+ return NULL;
+}
+
+VkPhysicalDevice pl_vulkan_choose_device(pl_log log,
+ const struct pl_vulkan_device_params *params)
+{
+ pl_err(log, "libplacebo compiled without Vulkan support!");
+ return NULL;
+}
+
+pl_swapchain pl_vulkan_create_swapchain(pl_vulkan vk,
+ const struct pl_vulkan_swapchain_params *params)
+{
+ pl_unreachable();
+}
+
+bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw)
+{
+ pl_unreachable();
+}
+
+pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params)
+{
+ pl_fatal(log, "libplacebo compiled without Vulkan support!");
+ return NULL;
+}
+
+pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params)
+{
+ pl_unreachable();
+}
+
+VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex,
+ VkFormat *out_format, VkImageUsageFlags *out_flags)
+{
+ pl_unreachable();
+}
+
+bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params)
+{
+ pl_unreachable();
+}
+
+void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params)
+{
+ pl_unreachable();
+}
+
+VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params)
+{
+ pl_unreachable();
+}
+
+void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore)
+{
+ pl_unreachable();
+}
diff --git a/src/vulkan/swapchain.c b/src/vulkan/swapchain.c
new file mode 100644
index 0000000..0741fbf
--- /dev/null
+++ b/src/vulkan/swapchain.c
@@ -0,0 +1,911 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "common.h"
+#include "command.h"
+#include "formats.h"
+#include "utils.h"
+#include "gpu.h"
+#include "swapchain.h"
+#include "pl_thread.h"
+
+struct sem_pair {
+ VkSemaphore in;
+ VkSemaphore out;
+};
+
+struct priv {
+ struct pl_sw_fns impl;
+
+ pl_mutex lock;
+ struct vk_ctx *vk;
+ VkSurfaceKHR surf;
+ PL_ARRAY(VkSurfaceFormatKHR) formats;
+
+ // current swapchain and metadata:
+ struct pl_vulkan_swapchain_params params;
+ VkSwapchainCreateInfoKHR protoInfo; // partially filled-in prototype
+ VkSwapchainKHR swapchain;
+ int cur_width, cur_height;
+ int swapchain_depth;
+ pl_rc_t frames_in_flight; // number of frames currently queued
+ bool suboptimal; // true once VK_SUBOPTIMAL_KHR is returned
+ bool needs_recreate; // swapchain needs to be recreated
+ struct pl_color_repr color_repr;
+ struct pl_color_space color_space;
+ struct pl_hdr_metadata hdr_metadata;
+
+ // state of the images:
+ PL_ARRAY(pl_tex) images; // pl_tex wrappers for the VkImages
+ PL_ARRAY(struct sem_pair) sems; // pool of semaphores used to synchronize images
+ int idx_sems; // index of next free semaphore pair
+ int last_imgidx; // the image index last acquired (for submit)
+};
+
+static const struct pl_sw_fns vulkan_swapchain;
+
+static bool map_color_space(VkColorSpaceKHR space, struct pl_color_space *out)
+{
+ switch (space) {
+ // Note: This is technically against the spec, but more often than not
+ // it's the correct result since `SRGB_NONLINEAR` is just a catch-all
+ // for any sort of typical SDR curve, which is better approximated by
+ // `pl_color_space_monitor`.
+ case VK_COLOR_SPACE_SRGB_NONLINEAR_KHR:
+ *out = pl_color_space_monitor;
+ return true;
+
+ case VK_COLOR_SPACE_BT709_NONLINEAR_EXT:
+ *out = pl_color_space_monitor;
+ return true;
+ case VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_DISPLAY_P3,
+ .transfer = PL_COLOR_TRC_BT_1886,
+ };
+ return true;
+ case VK_COLOR_SPACE_DCI_P3_LINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_DCI_P3,
+ .transfer = PL_COLOR_TRC_LINEAR,
+ };
+ return true;
+ case VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_DCI_P3,
+ .transfer = PL_COLOR_TRC_BT_1886,
+ };
+ return true;
+ case VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT:
+ case VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT:
+ // TODO
+ return false;
+ case VK_COLOR_SPACE_BT709_LINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_DCI_P3,
+ .transfer = PL_COLOR_TRC_LINEAR,
+ };
+ return true;
+ case VK_COLOR_SPACE_BT2020_LINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_LINEAR,
+ };
+ return true;
+ case VK_COLOR_SPACE_HDR10_ST2084_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_PQ,
+ };
+ return true;
+ case VK_COLOR_SPACE_DOLBYVISION_EXT:
+ // Unlikely to ever be implemented
+ return false;
+ case VK_COLOR_SPACE_HDR10_HLG_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_BT_2020,
+ .transfer = PL_COLOR_TRC_HLG,
+ };
+ return true;
+ case VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_ADOBE,
+ .transfer = PL_COLOR_TRC_LINEAR,
+ };
+ return true;
+ case VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT:
+ *out = (struct pl_color_space) {
+ .primaries = PL_COLOR_PRIM_ADOBE,
+ .transfer = PL_COLOR_TRC_GAMMA22,
+ };
+ return true;
+ case VK_COLOR_SPACE_PASS_THROUGH_EXT:
+ *out = pl_color_space_unknown;
+ return true;
+
+#ifdef VK_AMD_display_native_hdr
+ case VK_COLOR_SPACE_DISPLAY_NATIVE_AMD:
+ // TODO
+ return false;
+#endif
+
+ default: return false;
+ }
+}
+
+static bool pick_surf_format(pl_swapchain sw, const struct pl_color_space *hint)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct vk_ctx *vk = p->vk;
+ pl_gpu gpu = sw->gpu;
+
+ int best_score = 0, best_id;
+ bool wide_gamut = pl_color_primaries_is_wide_gamut(hint->primaries);
+ bool prefer_hdr = pl_color_transfer_is_hdr(hint->transfer);
+
+ for (int i = 0; i < p->formats.num; i++) {
+ // Color space / format whitelist
+ struct pl_color_space space;
+ if (!map_color_space(p->formats.elem[i].colorSpace, &space))
+ continue;
+
+ bool disable10 = !pl_color_transfer_is_hdr(space.transfer) &&
+ p->params.disable_10bit_sdr;
+
+ switch (p->formats.elem[i].format) {
+ // Only accept floating point formats for linear curves
+ case VK_FORMAT_R16G16B16_SFLOAT:
+ case VK_FORMAT_R16G16B16A16_SFLOAT:
+ case VK_FORMAT_R32G32B32_SFLOAT:
+ case VK_FORMAT_R32G32B32A32_SFLOAT:
+ case VK_FORMAT_R64G64B64_SFLOAT:
+ case VK_FORMAT_R64G64B64A64_SFLOAT:
+ if (space.transfer == PL_COLOR_TRC_LINEAR)
+ break; // accept
+ continue;
+
+ // Only accept 8 bit for non-HDR curves
+ case VK_FORMAT_R8G8B8_UNORM:
+ case VK_FORMAT_B8G8R8_UNORM:
+ case VK_FORMAT_R8G8B8A8_UNORM:
+ case VK_FORMAT_B8G8R8A8_UNORM:
+ case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+ if (!pl_color_transfer_is_hdr(space.transfer))
+ break; // accept
+ continue;
+
+ // Only accept 10 bit formats for non-linear curves
+ case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+ case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+ if (space.transfer != PL_COLOR_TRC_LINEAR && !disable10)
+ break; // accept
+ continue;
+
+ // Accept 16-bit formats for everything
+ case VK_FORMAT_R16G16B16_UNORM:
+ case VK_FORMAT_R16G16B16A16_UNORM:
+ if (!disable10)
+ break; // accept
+ continue;
+
+ default: continue;
+ }
+
+ // Make sure we can wrap this format to a meaningful, valid pl_fmt
+ for (int n = 0; n < gpu->num_formats; n++) {
+ pl_fmt plfmt = gpu->formats[n];
+ const struct vk_format **pvkfmt = PL_PRIV(plfmt);
+ if ((*pvkfmt)->tfmt != p->formats.elem[i].format)
+ continue;
+
+ enum pl_fmt_caps render_caps = 0;
+ render_caps |= PL_FMT_CAP_RENDERABLE;
+ render_caps |= PL_FMT_CAP_BLITTABLE;
+ if ((plfmt->caps & render_caps) != render_caps)
+ continue;
+
+ // format valid, use it if it has a higher score
+ int score = 0;
+ for (int c = 0; c < 3; c++)
+ score += plfmt->component_depth[c];
+ if (pl_color_primaries_is_wide_gamut(space.primaries) == wide_gamut)
+ score += 1000;
+ if (space.primaries == hint->primaries)
+ score += 2000;
+ if (pl_color_transfer_is_hdr(space.transfer) == prefer_hdr)
+ score += 10000;
+ if (space.transfer == hint->transfer)
+ score += 20000;
+
+ switch (plfmt->type) {
+ case PL_FMT_UNKNOWN: break;
+ case PL_FMT_UINT: break;
+ case PL_FMT_SINT: break;
+ case PL_FMT_UNORM: score += 500; break;
+ case PL_FMT_SNORM: score += 400; break;
+ case PL_FMT_FLOAT: score += 300; break;
+ case PL_FMT_TYPE_COUNT: pl_unreachable();
+ };
+
+ if (score > best_score) {
+ best_score = score;
+ best_id = i;
+ break;
+ }
+ }
+ }
+
+ if (!best_score) {
+ PL_ERR(vk, "Failed picking any valid, renderable surface format!");
+ return false;
+ }
+
+ VkSurfaceFormatKHR new_sfmt = p->formats.elem[best_id];
+ if (p->protoInfo.imageFormat != new_sfmt.format ||
+ p->protoInfo.imageColorSpace != new_sfmt.colorSpace)
+ {
+ PL_INFO(vk, "Picked surface configuration %d: %s + %s", best_id,
+ vk_fmt_name(new_sfmt.format),
+ vk_csp_name(new_sfmt.colorSpace));
+
+ p->protoInfo.imageFormat = new_sfmt.format;
+ p->protoInfo.imageColorSpace = new_sfmt.colorSpace;
+ p->needs_recreate = true;
+ }
+
+ return true;
+}
+
+static void set_hdr_metadata(struct priv *p, const struct pl_hdr_metadata *metadata)
+{
+ struct vk_ctx *vk = p->vk;
+ if (!vk->SetHdrMetadataEXT)
+ return;
+
+ // Whitelist only values that we support signalling metadata for
+ struct pl_hdr_metadata fix = {
+ .prim = metadata->prim,
+ .min_luma = metadata->min_luma,
+ .max_luma = metadata->max_luma,
+ .max_cll = metadata->max_cll,
+ .max_fall = metadata->max_fall,
+ };
+
+ // Ignore no-op changes
+ if (pl_hdr_metadata_equal(&fix, &p->hdr_metadata))
+ return;
+
+ // Remember the metadata so we can re-apply it after swapchain recreation
+ p->hdr_metadata = fix;
+
+ // Ignore HDR metadata requests for SDR swapchains
+ if (!pl_color_transfer_is_hdr(p->color_space.transfer))
+ return;
+
+ if (!p->swapchain)
+ return;
+
+ vk->SetHdrMetadataEXT(vk->dev, 1, &p->swapchain, &(VkHdrMetadataEXT) {
+ .sType = VK_STRUCTURE_TYPE_HDR_METADATA_EXT,
+ .displayPrimaryRed = { fix.prim.red.x, fix.prim.red.y },
+ .displayPrimaryGreen = { fix.prim.green.x, fix.prim.green.y },
+ .displayPrimaryBlue = { fix.prim.blue.x, fix.prim.blue.y },
+ .whitePoint = { fix.prim.white.x, fix.prim.white.y },
+ .maxLuminance = fix.max_luma,
+ .minLuminance = fix.min_luma,
+ .maxContentLightLevel = fix.max_cll,
+ .maxFrameAverageLightLevel = fix.max_fall,
+ });
+
+ // Keep track of applied HDR colorimetry metadata
+ p->color_space.hdr = p->hdr_metadata;
+}
+
+pl_swapchain pl_vulkan_create_swapchain(pl_vulkan plvk,
+ const struct pl_vulkan_swapchain_params *params)
+{
+ struct vk_ctx *vk = PL_PRIV(plvk);
+ pl_gpu gpu = plvk->gpu;
+
+ if (!vk->CreateSwapchainKHR) {
+ PL_ERR(gpu, VK_KHR_SWAPCHAIN_EXTENSION_NAME " not enabled!");
+ return NULL;
+ }
+
+ struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv);
+ sw->log = vk->log;
+ sw->gpu = gpu;
+
+ struct priv *p = PL_PRIV(sw);
+ pl_mutex_init(&p->lock);
+ p->impl = vulkan_swapchain;
+ p->params = *params;
+ p->vk = vk;
+ p->surf = params->surface;
+ p->swapchain_depth = PL_DEF(params->swapchain_depth, 3);
+ pl_assert(p->swapchain_depth > 0);
+ atomic_init(&p->frames_in_flight, 0);
+ p->last_imgidx = -1;
+ p->protoInfo = (VkSwapchainCreateInfoKHR) {
+ .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR,
+ .surface = p->surf,
+ .imageArrayLayers = 1, // non-stereoscopic
+ .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE,
+ .minImageCount = p->swapchain_depth + 1, // +1 for the FB
+ .presentMode = params->present_mode,
+ .clipped = true,
+ };
+
+ // These fields will be updated by `vk_sw_recreate`
+ p->color_space = pl_color_space_unknown;
+ p->color_repr = (struct pl_color_repr) {
+ .sys = PL_COLOR_SYSTEM_RGB,
+ .levels = PL_COLOR_LEVELS_FULL,
+ .alpha = PL_ALPHA_UNKNOWN,
+ };
+
+ // Make sure the swapchain present mode is supported
+ VkPresentModeKHR *modes = NULL;
+ uint32_t num_modes = 0;
+ VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, NULL));
+ modes = pl_calloc_ptr(NULL, num_modes, modes);
+ VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, modes));
+
+ bool supported = false;
+ for (int i = 0; i < num_modes; i++)
+ supported |= (modes[i] == p->protoInfo.presentMode);
+ pl_free_ptr(&modes);
+
+ if (!supported) {
+ PL_WARN(vk, "Requested swap mode unsupported by this device, falling "
+ "back to VK_PRESENT_MODE_FIFO_KHR");
+ p->protoInfo.presentMode = VK_PRESENT_MODE_FIFO_KHR;
+ }
+
+ // Enumerate the supported surface color spaces
+ uint32_t num_formats = 0;
+ VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, NULL));
+ PL_ARRAY_RESIZE(sw, p->formats, num_formats);
+ VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, p->formats.elem));
+ p->formats.num = num_formats;
+
+ PL_INFO(gpu, "Available surface configurations:");
+ for (int i = 0; i < p->formats.num; i++) {
+ PL_INFO(gpu, " %d: %-40s %s", i,
+ vk_fmt_name(p->formats.elem[i].format),
+ vk_csp_name(p->formats.elem[i].colorSpace));
+ }
+
+ // Ensure there exists at least some valid renderable surface format
+ struct pl_color_space hint = {0};
+ if (!pick_surf_format(sw, &hint))
+ goto error;
+
+ return sw;
+
+error:
+ pl_free(modes);
+ pl_free(sw);
+ return NULL;
+}
+
+static void vk_sw_destroy(pl_swapchain sw)
+{
+ pl_gpu gpu = sw->gpu;
+ struct priv *p = PL_PRIV(sw);
+ struct vk_ctx *vk = p->vk;
+
+ pl_gpu_flush(gpu);
+ vk_wait_idle(vk);
+
+ // Vulkan offers no way to know when a queue presentation command is done,
+ // leading to spec-mandated undefined behavior when destroying resources
+ // tied to the swapchain. Use an extra `vkQueueWaitIdle` on all of the
+ // queues we may have oustanding presentation calls on, to hopefully inform
+ // the driver that we want to wait until the device is truly idle.
+ for (int i = 0; i < vk->pool_graphics->num_queues; i++)
+ vk->QueueWaitIdle(vk->pool_graphics->queues[i]);
+
+ for (int i = 0; i < p->images.num; i++)
+ pl_tex_destroy(gpu, &p->images.elem[i]);
+ for (int i = 0; i < p->sems.num; i++) {
+ vk->DestroySemaphore(vk->dev, p->sems.elem[i].in, PL_VK_ALLOC);
+ vk->DestroySemaphore(vk->dev, p->sems.elem[i].out, PL_VK_ALLOC);
+ }
+
+ vk->DestroySwapchainKHR(vk->dev, p->swapchain, PL_VK_ALLOC);
+ pl_mutex_destroy(&p->lock);
+ pl_free((void *) sw);
+}
+
+static int vk_sw_latency(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ return p->swapchain_depth;
+}
+
+static bool update_swapchain_info(struct priv *p, VkSwapchainCreateInfoKHR *info,
+ int w, int h)
+{
+ struct vk_ctx *vk = p->vk;
+
+ // Query the supported capabilities and update this struct as needed
+ VkSurfaceCapabilitiesKHR caps = {0};
+ VK(vk->GetPhysicalDeviceSurfaceCapabilitiesKHR(vk->physd, p->surf, &caps));
+
+ // Check for hidden/invisible window
+ if (!caps.currentExtent.width || !caps.currentExtent.height) {
+ PL_DEBUG(vk, "maxImageExtent reported as 0x0, hidden window? skipping");
+ return false;
+ }
+
+ // Sorted by preference
+ static const struct { VkCompositeAlphaFlagsKHR vk_mode;
+ enum pl_alpha_mode pl_mode;
+ } alphaModes[] = {
+ {VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR, PL_ALPHA_INDEPENDENT},
+ {VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR, PL_ALPHA_PREMULTIPLIED},
+ {VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, PL_ALPHA_UNKNOWN},
+ {VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR, PL_ALPHA_UNKNOWN},
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(alphaModes); i++) {
+ if (caps.supportedCompositeAlpha & alphaModes[i].vk_mode) {
+ info->compositeAlpha = alphaModes[i].vk_mode;
+ p->color_repr.alpha = alphaModes[i].pl_mode;
+ PL_DEBUG(vk, "Requested alpha compositing mode: %s",
+ vk_alpha_mode(info->compositeAlpha));
+ break;
+ }
+ }
+
+ if (!info->compositeAlpha) {
+ PL_ERR(vk, "Failed picking alpha compositing mode (caps: 0x%x)",
+ caps.supportedCompositeAlpha);
+ goto error;
+ }
+
+ // Note: We could probably also allow picking a surface transform that
+ // flips the framebuffer and set `pl_swapchain_frame.flipped`, but this
+ // doesn't appear to be necessary for any vulkan implementations.
+ static const VkSurfaceTransformFlagsKHR rotModes[] = {
+ VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR,
+ VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR,
+ };
+
+ for (int i = 0; i < PL_ARRAY_SIZE(rotModes); i++) {
+ if (caps.supportedTransforms & rotModes[i]) {
+ info->preTransform = rotModes[i];
+ PL_DEBUG(vk, "Requested surface transform: %s",
+ vk_surface_transform(info->preTransform));
+ break;
+ }
+ }
+
+ if (!info->preTransform) {
+ PL_ERR(vk, "Failed picking surface transform mode (caps: 0x%x)",
+ caps.supportedTransforms);
+ goto error;
+ }
+
+ // Image count as required
+ PL_DEBUG(vk, "Requested image count: %d (min %d max %d)",
+ (int) info->minImageCount, (int) caps.minImageCount,
+ (int) caps.maxImageCount);
+
+ info->minImageCount = PL_MAX(info->minImageCount, caps.minImageCount);
+ if (caps.maxImageCount)
+ info->minImageCount = PL_MIN(info->minImageCount, caps.maxImageCount);
+
+ PL_DEBUG(vk, "Requested image size: %dx%d (min %dx%d < cur %dx%d < max %dx%d)",
+ w, h, caps.minImageExtent.width, caps.minImageExtent.height,
+ caps.currentExtent.width, caps.currentExtent.height,
+ caps.maxImageExtent.width, caps.maxImageExtent.height);
+
+ // Default the requested size based on the reported extent
+ if (caps.currentExtent.width != 0xFFFFFFFF)
+ w = PL_DEF(w, caps.currentExtent.width);
+ if (caps.currentExtent.height != 0xFFFFFFFF)
+ h = PL_DEF(h, caps.currentExtent.height);
+
+ // Otherwise, re-use the existing size if available
+ w = PL_DEF(w, info->imageExtent.width);
+ h = PL_DEF(h, info->imageExtent.height);
+
+ if (!w || !h) {
+ PL_ERR(vk, "Failed resizing swapchain: unknown size?");
+ goto error;
+ }
+
+ // Clamp the extent based on the supported limits
+ w = PL_CLAMP(w, caps.minImageExtent.width, caps.maxImageExtent.width);
+ h = PL_CLAMP(h, caps.minImageExtent.height, caps.maxImageExtent.height);
+ info->imageExtent = (VkExtent2D) { w, h };
+
+ // We just request whatever makes sense, and let the pl_vk decide what
+ // pl_tex_params that translates to. That said, we still need to intersect
+ // the swapchain usage flags with the format usage flags
+ VkImageUsageFlags req_flags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+ VkImageUsageFlags opt_flags = VK_IMAGE_USAGE_STORAGE_BIT;
+
+ info->imageUsage = caps.supportedUsageFlags & (req_flags | opt_flags);
+ VkFormatProperties fmtprop = {0};
+ vk->GetPhysicalDeviceFormatProperties(vk->physd, info->imageFormat, &fmtprop);
+
+#define CHECK(usage, feature) \
+ if (!((fmtprop.optimalTilingFeatures & VK_FORMAT_FEATURE_##feature##_BIT))) \
+ info->imageUsage &= ~VK_IMAGE_USAGE_##usage##_BIT
+
+ CHECK(COLOR_ATTACHMENT, COLOR_ATTACHMENT);
+ CHECK(TRANSFER_DST, TRANSFER_DST);
+ CHECK(STORAGE, STORAGE_IMAGE);
+
+ if ((info->imageUsage & req_flags) != req_flags) {
+ PL_ERR(vk, "The swapchain doesn't support rendering and blitting!");
+ goto error;
+ }
+
+ return true;
+
+error:
+ return false;
+}
+
+static void destroy_swapchain(struct vk_ctx *vk, void *swapchain)
+{
+ vk->DestroySwapchainKHR(vk->dev, vk_unwrap_handle(swapchain), PL_VK_ALLOC);
+}
+
+static bool vk_sw_recreate(pl_swapchain sw, int w, int h)
+{
+ pl_gpu gpu = sw->gpu;
+ struct priv *p = PL_PRIV(sw);
+ struct vk_ctx *vk = p->vk;
+
+ VkImage *vkimages = NULL;
+ uint32_t num_images = 0;
+
+ if (!update_swapchain_info(p, &p->protoInfo, w, h))
+ return false;
+
+ VkSwapchainCreateInfoKHR sinfo = p->protoInfo;
+#ifdef VK_EXT_full_screen_exclusive
+ // Explicitly disallow full screen exclusive mode if possible
+ static const VkSurfaceFullScreenExclusiveInfoEXT fsinfo = {
+ .sType = VK_STRUCTURE_TYPE_SURFACE_FULL_SCREEN_EXCLUSIVE_INFO_EXT,
+ .fullScreenExclusive = VK_FULL_SCREEN_EXCLUSIVE_DISALLOWED_EXT,
+ };
+ if (vk->AcquireFullScreenExclusiveModeEXT)
+ vk_link_struct(&sinfo, &fsinfo);
+#endif
+
+ p->suboptimal = false;
+ p->needs_recreate = false;
+ p->cur_width = sinfo.imageExtent.width;
+ p->cur_height = sinfo.imageExtent.height;
+
+ PL_DEBUG(sw, "(Re)creating swapchain of size %dx%d",
+ sinfo.imageExtent.width,
+ sinfo.imageExtent.height);
+
+#ifdef PL_HAVE_UNIX
+ if (vk->props.vendorID == VK_VENDOR_ID_NVIDIA) {
+ vk->DeviceWaitIdle(vk->dev);
+ vk_wait_idle(vk);
+ }
+#endif
+
+ // Calling `vkCreateSwapchainKHR` puts sinfo.oldSwapchain into a retired
+ // state whether the call succeeds or not, so we always need to garbage
+ // collect it afterwards - asynchronously as it may still be in use
+ sinfo.oldSwapchain = p->swapchain;
+ p->swapchain = VK_NULL_HANDLE;
+ VkResult res = vk->CreateSwapchainKHR(vk->dev, &sinfo, PL_VK_ALLOC, &p->swapchain);
+ vk_dev_callback(vk, (vk_cb) destroy_swapchain, vk, vk_wrap_handle(sinfo.oldSwapchain));
+ PL_VK_ASSERT(res, "vk->CreateSwapchainKHR(...)");
+
+ // Get the new swapchain images
+ VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, NULL));
+ vkimages = pl_calloc_ptr(NULL, num_images, vkimages);
+ VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, vkimages));
+
+ for (int i = 0; i < num_images; i++)
+ PL_VK_NAME(IMAGE, vkimages[i], "swapchain");
+
+ // If needed, allocate some more semaphores
+ while (num_images > p->sems.num) {
+ VkSemaphore sem_in = VK_NULL_HANDLE, sem_out = VK_NULL_HANDLE;
+ static const VkSemaphoreCreateInfo seminfo = {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+ };
+ VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_in));
+ VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_out));
+ PL_VK_NAME(SEMAPHORE, sem_in, "swapchain in");
+ PL_VK_NAME(SEMAPHORE, sem_out, "swapchain out");
+
+ PL_ARRAY_APPEND(sw, p->sems, (struct sem_pair) {
+ .in = sem_in,
+ .out = sem_out,
+ });
+ }
+
+ // Recreate the pl_tex wrappers
+ for (int i = 0; i < p->images.num; i++)
+ pl_tex_destroy(gpu, &p->images.elem[i]);
+ p->images.num = 0;
+
+ for (int i = 0; i < num_images; i++) {
+ const VkExtent2D *ext = &sinfo.imageExtent;
+ pl_tex tex = pl_vulkan_wrap(gpu, pl_vulkan_wrap_params(
+ .image = vkimages[i],
+ .width = ext->width,
+ .height = ext->height,
+ .format = sinfo.imageFormat,
+ .usage = sinfo.imageUsage,
+ ));
+ if (!tex)
+ goto error;
+ PL_ARRAY_APPEND(sw, p->images, tex);
+ }
+
+ pl_assert(num_images > 0);
+ int bits = 0;
+
+ // The channel with the most bits is probably the most authoritative about
+ // the actual color information (consider e.g. a2bgr10). Slight downside
+ // in that it results in rounding r/b for e.g. rgb565, but we don't pick
+ // surfaces with fewer than 8 bits anyway, so let's not care for now.
+ pl_fmt fmt = p->images.elem[0]->params.format;
+ for (int i = 0; i < fmt->num_components; i++)
+ bits = PL_MAX(bits, fmt->component_depth[i]);
+
+ p->color_repr.bits.sample_depth = bits;
+ p->color_repr.bits.color_depth = bits;
+
+ // Note: `p->color_space.hdr` is (re-)applied by `set_hdr_metadata`
+ map_color_space(sinfo.imageColorSpace, &p->color_space);
+
+ // Forcibly re-apply HDR metadata, bypassing the no-op check
+ struct pl_hdr_metadata metadata = p->hdr_metadata;
+ p->hdr_metadata = pl_hdr_metadata_empty;
+ set_hdr_metadata(p, &metadata);
+
+ pl_free(vkimages);
+ return true;
+
+error:
+ PL_ERR(vk, "Failed (re)creating swapchain!");
+ pl_free(vkimages);
+ vk->DestroySwapchainKHR(vk->dev, p->swapchain, PL_VK_ALLOC);
+ p->swapchain = VK_NULL_HANDLE;
+ p->cur_width = p->cur_height = 0;
+ return false;
+}
+
+static bool vk_sw_start_frame(pl_swapchain sw,
+ struct pl_swapchain_frame *out_frame)
+{
+ struct priv *p = PL_PRIV(sw);
+ struct vk_ctx *vk = p->vk;
+ pl_mutex_lock(&p->lock);
+
+ bool recreate = !p->swapchain || p->needs_recreate;
+ if (p->suboptimal && !p->params.allow_suboptimal)
+ recreate = true;
+
+ if (recreate && !vk_sw_recreate(sw, 0, 0)) {
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+
+ VkSemaphore sem_in = p->sems.elem[p->idx_sems].in;
+ PL_TRACE(vk, "vkAcquireNextImageKHR signals 0x%"PRIx64, (uint64_t) sem_in);
+
+ for (int attempts = 0; attempts < 2; attempts++) {
+ uint32_t imgidx = 0;
+ VkResult res = vk->AcquireNextImageKHR(vk->dev, p->swapchain, UINT64_MAX,
+ sem_in, VK_NULL_HANDLE, &imgidx);
+
+ switch (res) {
+ case VK_SUBOPTIMAL_KHR:
+ p->suboptimal = true;
+ // fall through
+ case VK_SUCCESS:
+ p->last_imgidx = imgidx;
+ pl_vulkan_release_ex(sw->gpu, pl_vulkan_release_params(
+ .tex = p->images.elem[imgidx],
+ .layout = VK_IMAGE_LAYOUT_UNDEFINED,
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ .semaphore = { sem_in },
+ ));
+ *out_frame = (struct pl_swapchain_frame) {
+ .fbo = p->images.elem[imgidx],
+ .flipped = false,
+ .color_repr = p->color_repr,
+ .color_space = p->color_space,
+ };
+ // keep lock held
+ return true;
+
+ case VK_ERROR_OUT_OF_DATE_KHR: {
+ // In these cases try recreating the swapchain
+ if (!vk_sw_recreate(sw, 0, 0)) {
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+ continue;
+ }
+
+ default:
+ PL_ERR(vk, "Failed acquiring swapchain image: %s", vk_res_str(res));
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+ }
+
+ // If we've exhausted the number of attempts to recreate the swapchain,
+ // just give up silently and let the user retry some time later.
+ pl_mutex_unlock(&p->lock);
+ return false;
+}
+
+static void present_cb(struct priv *p, void *arg)
+{
+ (void) pl_rc_deref(&p->frames_in_flight);
+}
+
+static bool vk_sw_submit_frame(pl_swapchain sw)
+{
+ pl_gpu gpu = sw->gpu;
+ struct priv *p = PL_PRIV(sw);
+ struct vk_ctx *vk = p->vk;
+ pl_assert(p->last_imgidx >= 0);
+ pl_assert(p->swapchain);
+ uint32_t idx = p->last_imgidx;
+ VkSemaphore sem_out = p->sems.elem[p->idx_sems++].out;
+ p->idx_sems %= p->sems.num;
+ p->last_imgidx = -1;
+
+ bool held = pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params(
+ .tex = p->images.elem[idx],
+ .layout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
+ .qf = VK_QUEUE_FAMILY_IGNORED,
+ .semaphore = { sem_out },
+ ));
+
+ if (!held) {
+ PL_ERR(gpu, "Failed holding swapchain image for presentation");
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+
+ struct vk_cmd *cmd = pl_vk_steal_cmd(gpu);
+ if (!cmd) {
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+
+ pl_rc_ref(&p->frames_in_flight);
+ vk_cmd_callback(cmd, (vk_cb) present_cb, p, NULL);
+ if (!vk_cmd_submit(&cmd)) {
+ pl_mutex_unlock(&p->lock);
+ return false;
+ }
+
+ struct vk_cmdpool *pool = vk->pool_graphics;
+ int qidx = pool->idx_queues;
+ VkQueue queue = pool->queues[qidx];
+
+ vk_rotate_queues(p->vk);
+ vk_malloc_garbage_collect(vk->ma);
+
+ VkPresentInfoKHR pinfo = {
+ .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
+ .waitSemaphoreCount = 1,
+ .pWaitSemaphores = &sem_out,
+ .swapchainCount = 1,
+ .pSwapchains = &p->swapchain,
+ .pImageIndices = &idx,
+ };
+
+ PL_TRACE(vk, "vkQueuePresentKHR waits on 0x%"PRIx64, (uint64_t) sem_out);
+ vk->lock_queue(vk->queue_ctx, pool->qf, qidx);
+ VkResult res = vk->QueuePresentKHR(queue, &pinfo);
+ vk->unlock_queue(vk->queue_ctx, pool->qf, qidx);
+ pl_mutex_unlock(&p->lock);
+
+ switch (res) {
+ case VK_SUBOPTIMAL_KHR:
+ p->suboptimal = true;
+ // fall through
+ case VK_SUCCESS:
+ return true;
+
+ case VK_ERROR_OUT_OF_DATE_KHR:
+ // We can silently ignore this error, since the next start_frame will
+ // recreate the swapchain automatically.
+ return true;
+
+ default:
+ PL_ERR(vk, "Failed presenting to queue %p: %s", (void *) queue,
+ vk_res_str(res));
+ return false;
+ }
+}
+
+static void vk_sw_swap_buffers(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+
+ pl_mutex_lock(&p->lock);
+ while (pl_rc_count(&p->frames_in_flight) >= p->swapchain_depth) {
+ pl_mutex_unlock(&p->lock); // don't hold mutex while blocking
+ vk_poll_commands(p->vk, UINT64_MAX);
+ pl_mutex_lock(&p->lock);
+ }
+ pl_mutex_unlock(&p->lock);
+}
+
+static bool vk_sw_resize(pl_swapchain sw, int *width, int *height)
+{
+ struct priv *p = PL_PRIV(sw);
+ bool ok = true;
+
+ pl_mutex_lock(&p->lock);
+
+ bool width_changed = *width && *width != p->cur_width,
+ height_changed = *height && *height != p->cur_height;
+
+ if (p->suboptimal || p->needs_recreate || width_changed || height_changed)
+ ok = vk_sw_recreate(sw, *width, *height);
+
+ *width = p->cur_width;
+ *height = p->cur_height;
+
+ pl_mutex_unlock(&p->lock);
+ return ok;
+}
+
+static void vk_sw_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp)
+{
+ struct priv *p = PL_PRIV(sw);
+ pl_mutex_lock(&p->lock);
+
+ // This should never fail if the swapchain already exists
+ bool ok = pick_surf_format(sw, csp);
+ set_hdr_metadata(p, &csp->hdr);
+ pl_assert(ok);
+
+ pl_mutex_unlock(&p->lock);
+}
+
+bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw)
+{
+ struct priv *p = PL_PRIV(sw);
+ return p->suboptimal;
+}
+
+static const struct pl_sw_fns vulkan_swapchain = {
+ .destroy = vk_sw_destroy,
+ .latency = vk_sw_latency,
+ .resize = vk_sw_resize,
+ .colorspace_hint = vk_sw_colorspace_hint,
+ .start_frame = vk_sw_start_frame,
+ .submit_frame = vk_sw_submit_frame,
+ .swap_buffers = vk_sw_swap_buffers,
+};
diff --git a/src/vulkan/utils.c b/src/vulkan/utils.c
new file mode 100644
index 0000000..914f9e4
--- /dev/null
+++ b/src/vulkan/utils.c
@@ -0,0 +1,181 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "utils.h"
+
+VkExternalMemoryHandleTypeFlagBitsKHR
+vk_mem_handle_type(enum pl_handle_type handle_type)
+{
+ if (!handle_type)
+ return 0;
+
+ switch (handle_type) {
+ case PL_HANDLE_FD:
+ return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+ case PL_HANDLE_WIN32:
+ return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR;
+ case PL_HANDLE_WIN32_KMT:
+ return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR;
+ case PL_HANDLE_DMA_BUF:
+ return VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT;
+ case PL_HANDLE_HOST_PTR:
+ return VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ return 0;
+ }
+
+ pl_unreachable();
+}
+
+VkExternalSemaphoreHandleTypeFlagBitsKHR
+vk_sync_handle_type(enum pl_handle_type handle_type)
+{
+ if (!handle_type)
+ return 0;
+
+ switch (handle_type) {
+ case PL_HANDLE_FD:
+ return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+ case PL_HANDLE_WIN32:
+ return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR;
+ case PL_HANDLE_WIN32_KMT:
+ return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR;
+ case PL_HANDLE_DMA_BUF:
+ case PL_HANDLE_HOST_PTR:
+ case PL_HANDLE_MTL_TEX:
+ case PL_HANDLE_IOSURFACE:
+ return 0;
+ }
+
+ pl_unreachable();
+}
+
+bool vk_external_mem_check(struct vk_ctx *vk,
+ const VkExternalMemoryPropertiesKHR *props,
+ enum pl_handle_type handle_type,
+ bool import)
+{
+ VkExternalMemoryFeatureFlagsKHR flags = props->externalMemoryFeatures;
+ VkExternalMemoryHandleTypeFlagBitsKHR vk_handle = vk_mem_handle_type(handle_type);
+
+ if (import) {
+ if (!(flags & VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_KHR)) {
+ PL_DEBUG(vk, "Handle type %s (0x%x) is not importable",
+ vk_handle_name(vk_handle), (unsigned int) handle_type);
+ return false;
+ }
+ } else {
+ if (!(flags & VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_KHR)) {
+ PL_DEBUG(vk, "Handle type %s (0x%x) is not exportable",
+ vk_handle_name(vk_handle), (unsigned int) handle_type);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+const enum pl_handle_type vk_mem_handle_list[] = {
+ PL_HANDLE_HOST_PTR,
+#ifdef PL_HAVE_UNIX
+ PL_HANDLE_FD,
+ PL_HANDLE_DMA_BUF,
+#endif
+#ifdef PL_HAVE_WIN32
+ PL_HANDLE_WIN32,
+ PL_HANDLE_WIN32_KMT,
+#endif
+ 0
+};
+
+const enum pl_handle_type vk_sync_handle_list[] = {
+#ifdef PL_HAVE_UNIX
+ PL_HANDLE_FD,
+#endif
+#ifdef PL_HAVE_WIN32
+ PL_HANDLE_WIN32,
+ PL_HANDLE_WIN32_KMT,
+#endif
+ 0
+};
+
+const void *vk_find_struct(const void *chain, VkStructureType stype)
+{
+ const VkBaseInStructure *in = chain;
+ while (in) {
+ if (in->sType == stype)
+ return in;
+
+ in = in->pNext;
+ }
+
+ return NULL;
+}
+
+void vk_link_struct(void *chain, const void *in)
+{
+ if (!in)
+ return;
+
+ VkBaseOutStructure *out = chain;
+ while (out->pNext)
+ out = out->pNext;
+
+ out->pNext = (void *) in;
+}
+
+void *vk_struct_memdup(void *alloc, const void *pin)
+{
+ if (!pin)
+ return NULL;
+
+ const VkBaseInStructure *in = pin;
+ size_t size = vk_struct_size(in->sType);
+ pl_assert(size);
+
+ VkBaseOutStructure *out = pl_memdup(alloc, in, size);
+ out->pNext = NULL;
+ return out;
+}
+
+void *vk_chain_memdup(void *alloc, const void *pin)
+{
+ if (!pin)
+ return NULL;
+
+ const VkBaseInStructure *in = pin;
+ VkBaseOutStructure *out = vk_struct_memdup(alloc, in);
+ pl_assert(out);
+
+ out->pNext = vk_chain_memdup(alloc, in->pNext);
+ return out;
+}
+
+void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype)
+{
+ for (VkBaseOutStructure *out = chain;; out = out->pNext) {
+ if (out->sType == stype)
+ return out;
+ if (!out->pNext) {
+ VkBaseOutStructure *s = pl_zalloc(alloc, vk_struct_size(stype));
+ s->sType = stype;
+ out->pNext = s;
+ return s;
+ }
+ }
+}
diff --git a/src/vulkan/utils.h b/src/vulkan/utils.h
new file mode 100644
index 0000000..cb1c5f5
--- /dev/null
+++ b/src/vulkan/utils.h
@@ -0,0 +1,136 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// Return a human-readable name for various vulkan enums
+const char *vk_res_str(VkResult res);
+const char *vk_fmt_name(VkFormat fmt);
+const char *vk_csp_name(VkColorSpaceKHR csp);
+const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle);
+const char *vk_obj_type(VkObjectType obj);
+const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha);
+const char *vk_surface_transform(VkSurfaceTransformFlagsKHR transform);
+
+// Return the size of an arbitrary vulkan struct. Returns 0 for unknown structs
+size_t vk_struct_size(VkStructureType stype);
+
+// Returns the vulkan API version which a given extension was promoted to, or 0
+// if the extension is not promoted.
+uint32_t vk_ext_promoted_ver(const char *extension);
+
+// Enum translation boilerplate
+VkExternalMemoryHandleTypeFlagBitsKHR vk_mem_handle_type(enum pl_handle_type);
+VkExternalSemaphoreHandleTypeFlagBitsKHR vk_sync_handle_type(enum pl_handle_type);
+
+// Bitmask of all access flags that imply a read/write operation, respectively
+extern const VkAccessFlags2 vk_access_read;
+extern const VkAccessFlags2 vk_access_write;
+
+// Check for compatibility of a VkExternalMemoryProperties
+bool vk_external_mem_check(struct vk_ctx *vk,
+ const VkExternalMemoryPropertiesKHR *props,
+ enum pl_handle_type handle_type,
+ bool check_import);
+
+// Static lists of external handle types we should try probing for
+extern const enum pl_handle_type vk_mem_handle_list[];
+extern const enum pl_handle_type vk_sync_handle_list[];
+
+// Find a structure in a pNext chain, or NULL
+const void *vk_find_struct(const void *chain, VkStructureType stype);
+
+// Link a structure into a pNext chain
+void vk_link_struct(void *chain, const void *in);
+
+// Make a copy of a structure, not including the pNext chain
+void *vk_struct_memdup(void *alloc, const void *in);
+
+// Make a deep copy of an entire pNext chain
+void *vk_chain_memdup(void *alloc, const void *in);
+
+// Find a structure in a pNext chain, or allocate + link it if absent.
+void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype);
+
+// Renormalize input features into a state consistent for a given API version.
+// If `api_ver` is specified as 0, *both* meta-structs and extension structs
+// will be emitted. Note: `out` should be initialized by the user. In
+// particular, if it already contains a valid features chain, then this
+// function will effectively act as a union.
+void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *in,
+ uint32_t api_ver, VkPhysicalDeviceFeatures2 *out);
+
+// Convenience macros to simplify a lot of common boilerplate
+#define PL_VK_ASSERT(res, str) \
+ do { \
+ if (res != VK_SUCCESS) { \
+ PL_ERR(vk, str ": %s (%s:%d)", \
+ vk_res_str(res), __FILE__, __LINE__); \
+ goto error; \
+ } \
+ } while (0)
+
+#define VK(cmd) \
+ do { \
+ PL_TRACE(vk, #cmd); \
+ VkResult _res = (cmd); \
+ PL_VK_ASSERT(_res, #cmd); \
+ } while (0)
+
+#define PL_VK_NAME(type, obj, name) \
+ do { \
+ if (vk->SetDebugUtilsObjectNameEXT) { \
+ vk->SetDebugUtilsObjectNameEXT(vk->dev, &(VkDebugUtilsObjectNameInfoEXT) { \
+ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT, \
+ .objectType = VK_OBJECT_TYPE_##type, \
+ .objectHandle = (uint64_t) (obj), \
+ .pObjectName = (name), \
+ }); \
+ } \
+ } while (0)
+
+// Variant of PL_VK_NAME for dispatchable handles
+#define PL_VK_NAME_HANDLE(type, obj, name) \
+ PL_VK_NAME(type, (uintptr_t) (obj), name)
+
+// Helper functions to wrap and unwrap non-dispatchable handles into pointers.
+// Note that wrap/unwrap must always be used linearly.
+#if VK_USE_64_BIT_PTR_DEFINES == 1
+#define vk_wrap_handle(h) (h)
+#define vk_unwrap_handle(h) (h)
+#elif UINTPTR_MAX >= UINT64_MAX
+#define vk_wrap_handle(h) ((void *) (uintptr_t) (h))
+#define vk_unwrap_handle(h) ((uint64_t) (uintptr_t) (h))
+#else
+static inline void *vk_wrap_handle(uint64_t h)
+{
+ uint64_t *wrapper = malloc(sizeof(h));
+ assert(wrapper);
+ *wrapper = h;
+ return wrapper;
+}
+
+static inline uint64_t vk_unwrap_handle(void *h)
+{
+ uint64_t *wrapper = h;
+ uint64_t ret = *wrapper;
+ free(wrapper);
+ return ret;
+}
+#endif
diff --git a/src/vulkan/utils_gen.c.j2 b/src/vulkan/utils_gen.c.j2
new file mode 100644
index 0000000..6db0454
--- /dev/null
+++ b/src/vulkan/utils_gen.c.j2
@@ -0,0 +1,137 @@
+#define VK_ENABLE_BETA_EXTENSIONS
+#include "vulkan/utils.h"
+
+const char *vk_res_str(VkResult res)
+{
+ switch (res) {
+{% for res in vkresults %}
+ case {{ res }}: return "{{ res }}";
+{% endfor %}
+
+ default: return "unknown error";
+ }
+}
+
+const char *vk_fmt_name(VkFormat fmt)
+{
+ switch (fmt) {
+{% for fmt in vkformats %}
+ case {{ fmt }}: return "{{ fmt }}";
+{% endfor %}
+
+ default: return "unknown format";
+ }
+}
+
+const char *vk_csp_name(VkColorSpaceKHR csp)
+{
+ switch (csp) {
+{% for csp in vkspaces %}
+ case {{ csp }}: return "{{ csp }}";
+{% endfor %}
+
+ default: return "unknown color space";
+ }
+}
+
+const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle)
+{
+ switch (handle) {
+{% for handle in vkhandles %}
+ case {{ handle }}: return "{{ handle }}";
+{% endfor %}
+
+ default: return "unknown handle type";
+ }
+}
+
+const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha)
+{
+ switch (alpha) {
+{% for mode in vkalphas %}
+ case {{ mode }}: return "{{ mode }}";
+{% endfor %}
+
+ default: return "unknown alpha mode";
+ }
+}
+
+const char *vk_surface_transform(VkSurfaceTransformFlagsKHR tf)
+{
+ switch (tf) {
+{% for tf in vktransforms %}
+ case {{ tf }}: return "{{ tf }}";
+{% endfor %}
+
+ default: return "unknown surface transform";
+ }
+}
+
+
+const char *vk_obj_type(VkObjectType obj)
+{
+ switch (obj) {
+{% for obj in vkobjects %}
+ case {{ obj.enum }}: return "{{ obj.name }}";
+{% endfor %}
+
+ default: return "unknown object";
+ }
+}
+
+size_t vk_struct_size(VkStructureType stype)
+{
+ switch (stype) {
+{% for struct in vkstructs %}
+ case {{ struct.stype }}: return sizeof({{ struct.name }});
+{% endfor %}
+
+ default: return 0;
+ }
+}
+
+uint32_t vk_ext_promoted_ver(const char *extension)
+{
+{% for ext in vkexts %}
+{% if ext.promoted_ver %}
+ if (!strcmp(extension, "{{ ext.name }}"))
+ return {{ ext.promoted_ver }};
+{% endif %}
+{% endfor %}
+ return 0;
+}
+
+void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *fin,
+ uint32_t api_ver, VkPhysicalDeviceFeatures2 *out)
+{
+ for (const VkBaseInStructure *in = (void *) fin; in; in = in->pNext) {
+ switch (in->sType) {
+ default: break;
+{% for fs in vkfeatures %}
+ case {{ fs.stype }}: {
+ const {{ fs.name }} *i = (const void *) in;
+{% for f in fs.features %}
+ if (i->{{ f.name }}) {
+{% for r in f.replacements %}
+{% if r.core_ver %}
+ if (!api_ver || api_ver >= {{ r.core_ver }})
+{% elif r.max_ver %}
+ if (!api_ver || api_ver < {{ r.max_ver }})
+{% endif %}
+{% if fs.is_base %}
+ out->{{ f.name }} = true;
+{% else %}
+ (({{ r.name }} *) vk_chain_alloc(alloc, out, {{ r.stype }}))->{{ f.name }} = true;
+{% endif %}
+{% endfor %}
+ }
+{% endfor %}
+ break;
+ }
+{% endfor %}
+ }
+ }
+}
+
+const VkAccessFlags2 vk_access_read = {{ '0x%x' % vkaccess.read }}LLU;
+const VkAccessFlags2 vk_access_write = {{ '0x%x' % vkaccess.write }}LLU;
diff --git a/src/vulkan/utils_gen.py b/src/vulkan/utils_gen.py
new file mode 100644
index 0000000..a8652fd
--- /dev/null
+++ b/src/vulkan/utils_gen.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+#
+# This file is part of libplacebo.
+#
+# libplacebo is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# libplacebo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+
+import os.path
+import re
+import sys
+import xml.etree.ElementTree as ET
+
+try:
+ import jinja2
+except ModuleNotFoundError:
+ print('Module \'jinja2\' not found, please install \'python3-Jinja2\' or '
+ 'an equivalent package on your system! Alternatively, run '
+ '`git submodule update --init` followed by `meson --wipe`.',
+ file=sys.stderr)
+ sys.exit(1)
+
+TEMPLATE = jinja2.Environment(
+ loader = jinja2.FileSystemLoader(searchpath=os.path.dirname(__file__)),
+ trim_blocks=True,
+).get_template('utils_gen.c.j2')
+
+class Obj(object):
+ def __init__(self, **kwargs):
+ self.__dict__.update(kwargs)
+
+class VkXML(ET.ElementTree):
+ def blacklist_block(self, req):
+ for t in req.iterfind('type'):
+ self.blacklist_types.add(t.attrib['name'])
+ for e in req.iterfind('enum'):
+ self.blacklist_enums.add(e.attrib['name'])
+
+ def __init__(self, *args, **kwargs):
+
+ super().__init__(*args, **kwargs)
+ self.blacklist_types = set()
+ self.blacklist_enums = set()
+
+ for f in self.iterfind('feature'):
+ # Feature block for non-Vulkan API
+ if not 'vulkan' in f.attrib['api'].split(','):
+ for r in f.iterfind('require'):
+ self.blacklist_block(r)
+
+ for e in self.iterfind('extensions/extension'):
+ # Entire extension is unsupported on vulkan or platform-specifid
+ if not 'vulkan' in e.attrib['supported'].split(',') or 'platform' in e.attrib:
+ for r in e.iterfind('require'):
+ self.blacklist_block(r)
+ continue
+
+ # Only individual <require> blocks are API-specific
+ for r in e.iterfind('require[@api]'):
+ if not 'vulkan' in r.attrib['api'].split(','):
+ self.blacklist_block(r)
+
+ def findall_enum(self, name):
+ for e in self.iterfind('enums[@name="{0}"]/enum'.format(name)):
+ if not 'alias' in e.attrib:
+ if not e.attrib['name'] in self.blacklist_enums:
+ yield e
+ for e in self.iterfind('.//enum[@extends="{0}"]'.format(name)):
+ if not 'alias' in e.attrib:
+ if not e.attrib['name'] in self.blacklist_enums:
+ yield e
+
+ def findall_type(self, category):
+ for t in self.iterfind('types/type[@category="{0}"]'.format(category)):
+ name = t.attrib.get('name') or t.find('name').text
+ if name in self.blacklist_types:
+ continue
+ yield t
+
+
+def get_vkenum(registry, enum):
+ for e in registry.findall_enum(enum):
+ yield e.attrib['name']
+
+def get_vkobjects(registry):
+ for t in registry.findall_type('handle'):
+ if 'objtypeenum' in t.attrib:
+ yield Obj(enum = t.attrib['objtypeenum'],
+ name = t.find('name').text)
+
+def get_vkstructs(registry):
+ for t in registry.findall_type('struct'):
+ stype = None
+ for m in t.iterfind('member'):
+ if m.find('name').text == 'sType':
+ stype = m
+ break
+
+ if stype is not None and 'values' in stype.attrib:
+ yield Obj(stype = stype.attrib['values'],
+ name = t.attrib['name'])
+
+def get_vkaccess(registry):
+ access = Obj(read = 0, write = 0)
+ for e in registry.findall_enum('VkAccessFlagBits2'):
+ if '_READ_' in e.attrib['name']:
+ access.read |= 1 << int(e.attrib['bitpos'])
+ if '_WRITE_' in e.attrib['name']:
+ access.write |= 1 << int(e.attrib['bitpos'])
+ return access
+
+def get_vkexts(registry):
+ for e in registry.iterfind('extensions/extension'):
+ promoted_ver = None
+ if res := re.match(r'VK_VERSION_(\d)_(\d)', e.attrib.get('promotedto', '')):
+ promoted_ver = 'VK_API_VERSION_{0}_{1}'.format(res[1], res[2])
+ yield Obj(name = e.attrib['name'],
+ promoted_ver = promoted_ver)
+
+def get_vkfeatures(registry):
+ structs = [];
+ featuremap = {}; # features -> [struct]
+ for t in registry.findall_type('struct'):
+ sname = t.attrib['name']
+ is_base = sname == 'VkPhysicalDeviceFeatures'
+ extends = t.attrib.get('structextends', [])
+ if is_base:
+ sname = 'VkPhysicalDeviceFeatures2'
+ stype = 'VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2'
+ elif not 'VkPhysicalDeviceFeatures2' in extends:
+ continue
+
+ features = []
+ for f in t.iterfind('member'):
+ if f.find('type').text == 'VkStructureType':
+ stype = f.attrib['values']
+ elif f.find('type').text == 'VkBool32':
+ fname = f.find('name').text
+ if is_base:
+ fname = 'features.' + fname
+ features.append(Obj(name = fname))
+
+ core_ver = None
+ if res := re.match(r'VkPhysicalDeviceVulkan(\d)(\d)Features', sname):
+ core_ver = 'VK_API_VERSION_{0}_{1}'.format(res[1], res[2])
+
+ struct = Obj(name = sname,
+ stype = stype,
+ core_ver = core_ver,
+ is_base = is_base,
+ features = features)
+
+ structs.append(struct)
+ for f in features:
+ featuremap.setdefault(f.name, []).append(struct)
+
+ for s in structs:
+ for f in s.features:
+ f.replacements = featuremap[f.name]
+ core_ver = next(( r.core_ver for r in f.replacements if r.core_ver ), None)
+ for r in f.replacements:
+ if not r.core_ver:
+ r.max_ver = core_ver
+
+ yield from structs
+
+def find_registry_xml(datadir):
+ registry_paths = [
+ '{0}/vulkan/registry/vk.xml'.format(datadir),
+ '$MINGW_PREFIX/share/vulkan/registry/vk.xml',
+ '%VULKAN_SDK%/share/vulkan/registry/vk.xml',
+ '$VULKAN_SDK/share/vulkan/registry/vk.xml',
+ '/usr/share/vulkan/registry/vk.xml',
+ ]
+
+ for p in registry_paths:
+ path = os.path.expandvars(p)
+ if os.path.isfile(path):
+ print('Found vk.xml: {0}'.format(path))
+ return path
+
+ print('Could not find the vulkan registry (vk.xml), please specify its '
+ 'location manually using the -Dvulkan-registry=/path/to/vk.xml '
+ 'option!', file=sys.stderr)
+ sys.exit(1)
+
+if __name__ == '__main__':
+ assert len(sys.argv) == 4
+ datadir = sys.argv[1]
+ xmlfile = sys.argv[2]
+ outfile = sys.argv[3]
+
+ if not xmlfile or xmlfile == '':
+ xmlfile = find_registry_xml(datadir)
+
+ registry = VkXML(ET.parse(xmlfile))
+ with open(outfile, 'w') as f:
+ f.write(TEMPLATE.render(
+ vkresults = get_vkenum(registry, 'VkResult'),
+ vkformats = get_vkenum(registry, 'VkFormat'),
+ vkspaces = get_vkenum(registry, 'VkColorSpaceKHR'),
+ vkhandles = get_vkenum(registry, 'VkExternalMemoryHandleTypeFlagBits'),
+ vkalphas = get_vkenum(registry, 'VkCompositeAlphaFlagBitsKHR'),
+ vktransforms = get_vkenum(registry, 'VkSurfaceTransformFlagBitsKHR'),
+ vkobjects = get_vkobjects(registry),
+ vkstructs = get_vkstructs(registry),
+ vkaccess = get_vkaccess(registry),
+ vkexts = get_vkexts(registry),
+ vkfeatures = get_vkfeatures(registry),
+ ))