Diffstat (limited to 'src/vulkan')
-rw-r--r-- | src/vulkan/command.c | 571
-rw-r--r-- | src/vulkan/command.h | 142
-rw-r--r-- | src/vulkan/common.h | 234
-rw-r--r-- | src/vulkan/context.c | 1704
-rw-r--r-- | src/vulkan/formats.c | 616
-rw-r--r-- | src/vulkan/formats.h | 34
-rw-r--r-- | src/vulkan/gpu.c | 924
-rw-r--r-- | src/vulkan/gpu.h | 175
-rw-r--r-- | src/vulkan/gpu_buf.c | 470
-rw-r--r-- | src/vulkan/gpu_pass.c | 964
-rw-r--r-- | src/vulkan/gpu_tex.c | 1453
-rw-r--r-- | src/vulkan/malloc.c | 1058
-rw-r--r-- | src/vulkan/malloc.h | 72
-rw-r--r-- | src/vulkan/meson.build | 59
-rw-r--r-- | src/vulkan/stubs.c | 108
-rw-r--r-- | src/vulkan/swapchain.c | 911
-rw-r--r-- | src/vulkan/utils.c | 181
-rw-r--r-- | src/vulkan/utils.h | 136
-rw-r--r-- | src/vulkan/utils_gen.c.j2 | 137
-rw-r--r-- | src/vulkan/utils_gen.py | 219 |
20 files changed, 10168 insertions, 0 deletions
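For orientation before the patch body: a minimal sketch of how the command-buffer abstraction introduced by command.c/command.h below is typically driven. The types and functions used here (struct vk_ctx, struct vk_cmd, vk_cmd_begin, vk_cmd_callback, vk_cmd_submit, vk_poll_commands, vk_rotate_queues) and their signatures are taken from this patch; the caller itself, the already-initialized vk context, and the record_frame/on_done helpers are hypothetical.

    #include "command.h"

    // Fired from vk_poll_commands() once the GPU has finished executing the
    // command this callback was attached to.
    static void on_done(void *priv, void *arg)
    {
        (void) priv;
        (void) arg;
    }

    // Hypothetical per-frame driver; `vk` is an already-initialized vk_ctx.
    static bool record_frame(struct vk_ctx *vk)
    {
        struct vk_cmd *cmd = vk_cmd_begin(vk->pool_graphics, "example frame");
        if (!cmd)
            return false;

        // ... record work into cmd->buf, declaring semaphore dependencies
        // with vk_cmd_dep() and signals with vk_cmd_sig() as needed ...

        // Defer cleanup until the GPU is done with this command
        vk_cmd_callback(cmd, on_done, NULL, NULL);

        if (!vk_cmd_submit(&cmd)) // takes ownership, sets cmd = NULL
            return false;

        vk_poll_commands(vk, 0); // non-blocking: drain finished callbacks
        vk_rotate_queues(vk);    // once per frame, after all submissions
        return true;
    }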
diff --git a/src/vulkan/command.c b/src/vulkan/command.c new file mode 100644 index 0000000..5020aff --- /dev/null +++ b/src/vulkan/command.c @@ -0,0 +1,571 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "command.h" +#include "utils.h" + +// returns VK_SUCCESS (completed), VK_TIMEOUT (not yet completed) or an error +static VkResult vk_cmd_poll(struct vk_cmd *cmd, uint64_t timeout) +{ + struct vk_ctx *vk = cmd->pool->vk; + return vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, + .semaphoreCount = 1, + .pSemaphores = &cmd->sync.sem, + .pValues = &cmd->sync.value, + }, timeout); +} + +static void flush_callbacks(struct vk_ctx *vk) +{ + while (vk->num_pending_callbacks) { + const struct vk_callback *cb = vk->pending_callbacks++; + vk->num_pending_callbacks--; + cb->run(cb->priv, cb->arg); + } +} + +static void vk_cmd_reset(struct vk_cmd *cmd) +{ + struct vk_ctx *vk = cmd->pool->vk; + + // Flush possible callbacks left over from a previous command still in the + // process of being reset, whose callback triggered this command being + // reset. 
+ flush_callbacks(vk); + vk->pending_callbacks = cmd->callbacks.elem; + vk->num_pending_callbacks = cmd->callbacks.num; + flush_callbacks(vk); + + cmd->callbacks.num = 0; + cmd->deps.num = 0; + cmd->sigs.num = 0; +} + +static void vk_cmd_destroy(struct vk_cmd *cmd) +{ + if (!cmd) + return; + + struct vk_ctx *vk = cmd->pool->vk; + vk_cmd_poll(cmd, UINT64_MAX); + vk_cmd_reset(cmd); + vk->DestroySemaphore(vk->dev, cmd->sync.sem, PL_VK_ALLOC); + vk->FreeCommandBuffers(vk->dev, cmd->pool->pool, 1, &cmd->buf); + + pl_free(cmd); +} + +static struct vk_cmd *vk_cmd_create(struct vk_cmdpool *pool) +{ + struct vk_ctx *vk = pool->vk; + struct vk_cmd *cmd = pl_zalloc_ptr(NULL, cmd); + cmd->pool = pool; + + VkCommandBufferAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = pool->pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1, + }; + + VK(vk->AllocateCommandBuffers(vk->dev, &ainfo, &cmd->buf)); + + static const VkSemaphoreTypeCreateInfo stinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, + .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE, + .initialValue = 0, + }; + + static const VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &stinfo, + }; + + VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &cmd->sync.sem)); + PL_VK_NAME(SEMAPHORE, cmd->sync.sem, "cmd"); + + return cmd; + +error: + vk_cmd_destroy(cmd); + vk->failed = true; + return NULL; +} + +void vk_dev_callback(struct vk_ctx *vk, vk_cb callback, + const void *priv, const void *arg) +{ + pl_mutex_lock(&vk->lock); + if (vk->cmds_pending.num > 0) { + struct vk_cmd *last_cmd = vk->cmds_pending.elem[vk->cmds_pending.num - 1]; + vk_cmd_callback(last_cmd, callback, priv, arg); + } else { + // The device was already idle, so we can just immediately call it + callback((void *) priv, (void *) arg); + } + pl_mutex_unlock(&vk->lock); +} + +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, + const void *priv, const void *arg) +{ + PL_ARRAY_APPEND(cmd, cmd->callbacks, (struct vk_callback) { + .run = callback, + .priv = (void *) priv, + .arg = (void *) arg, + }); +} + +void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep) +{ + PL_ARRAY_APPEND(cmd, cmd->deps, (VkSemaphoreSubmitInfo) { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = dep.sem, + .value = dep.value, + .stageMask = stage, + }); +} + +void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig) +{ + VkSemaphoreSubmitInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = sig.sem, + .value = sig.value, + .stageMask = stage, + }; + + // Try updating existing semaphore signal operations in-place + for (int i = 0; i < cmd->sigs.num; i++) { + if (cmd->sigs.elem[i].semaphore == sig.sem) { + pl_assert(sig.value > cmd->sigs.elem[i].value); + cmd->sigs.elem[i] = sinfo; + return; + } + } + + PL_ARRAY_APPEND(cmd, cmd->sigs, sinfo); +} + +#define SET(FLAG, CHECK) \ + if (flags2 & (CHECK)) \ + flags |= FLAG + +static VkAccessFlags lower_access2(VkAccessFlags2 flags2) +{ + VkAccessFlags flags = flags2 & VK_ACCESS_FLAG_BITS_MAX_ENUM; + SET(VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_2_SHADER_SAMPLED_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_READ_BIT); + SET(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT); + return flags; +} + +static VkPipelineStageFlags lower_stage2(VkPipelineStageFlags2 flags2) +{ + VkPipelineStageFlags flags = flags2 & 
VK_PIPELINE_STAGE_FLAG_BITS_MAX_ENUM; + SET(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_2_COPY_BIT | + VK_PIPELINE_STAGE_2_RESOLVE_BIT | + VK_PIPELINE_STAGE_2_BLIT_BIT | + VK_PIPELINE_STAGE_2_CLEAR_BIT); + SET(VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT | + VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT); + return flags; +} + +#undef SET + +void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info) +{ + struct vk_ctx *vk = cmd->pool->vk; + if (vk->CmdPipelineBarrier2KHR) { + vk->CmdPipelineBarrier2KHR(cmd->buf, info); + return; + } + + pl_assert(!info->pNext); + pl_assert(info->memoryBarrierCount == 0); + pl_assert(info->bufferMemoryBarrierCount + info->imageMemoryBarrierCount == 1); + + if (info->bufferMemoryBarrierCount) { + + const VkBufferMemoryBarrier2 *barr2 = info->pBufferMemoryBarriers; + const VkBufferMemoryBarrier barr = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .pNext = barr2->pNext, + .srcAccessMask = lower_access2(barr2->srcAccessMask), + .dstAccessMask = lower_access2(barr2->dstAccessMask), + .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex, + .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex, + .buffer = barr2->buffer, + .offset = barr2->offset, + .size = barr2->size, + }; + + vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask), + lower_stage2(barr2->dstStageMask), + info->dependencyFlags, + 0, NULL, 1, &barr, 0, NULL); + + } else { + + const VkImageMemoryBarrier2 *barr2 = info->pImageMemoryBarriers; + const VkImageMemoryBarrier barr = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = barr2->pNext, + .srcAccessMask = lower_access2(barr2->srcAccessMask), + .dstAccessMask = lower_access2(barr2->dstAccessMask), + .oldLayout = barr2->oldLayout, + .newLayout = barr2->newLayout, + .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex, + .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex, + .image = barr2->image, + .subresourceRange = barr2->subresourceRange, + }; + + vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask), + lower_stage2(barr2->dstStageMask), + info->dependencyFlags, + 0, NULL, 0, NULL, 1, &barr); + } +} + +struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem, + VkPipelineStageFlags2 stage, + VkAccessFlags2 access, bool is_trans) +{ + bool is_write = (access & vk_access_write) || is_trans; + + // Writes need to be synchronized against the last *read* (which is + // transitively synchronized against the last write), reads only + // need to be synchronized against the last write. 
+ struct vk_sync_scope last = sem->write; + if (is_write && sem->read.access) + last = sem->read; + + if (last.queue != cmd->queue) { + if (!is_write && sem->read.queue == cmd->queue) { + // No semaphore needed in this case because the implicit submission + // order execution dependencies already transitively imply a wait + // for the previous write + } else if (last.sync.sem) { + // Image barrier still needs to depend on this stage for implicit + // ordering guarantees to apply properly + vk_cmd_dep(cmd, stage, last.sync); + last.stage = stage; + } + + // Last access is on different queue, so no pipeline barrier needed + last.access = 0; + } + + if (!is_write && sem->read.queue == cmd->queue && + (sem->read.stage & stage) == stage && + (sem->read.access & access) == access) + { + // A past pipeline barrier already covers this access transitively, so + // we don't need to emit another pipeline barrier at all + last.access = 0; + } + + if (is_write) { + sem->write = (struct vk_sync_scope) { + .sync = cmd->sync, + .queue = cmd->queue, + .stage = stage, + .access = access, + }; + + sem->read = (struct vk_sync_scope) { + .sync = cmd->sync, + .queue = cmd->queue, + // no stage or access scope, because no reads happened yet + }; + } else if (sem->read.queue == cmd->queue) { + // Coalesce multiple same-queue reads into a single access scope + sem->read.sync = cmd->sync; + sem->read.stage |= stage; + sem->read.access |= access; + } else { + sem->read = (struct vk_sync_scope) { + .sync = cmd->sync, + .queue = cmd->queue, + .stage = stage, + .access = access, + }; + } + + // We never need to include pipeline barriers for reads, only writes + last.access &= vk_access_write; + return last; +} + +struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum, + VkQueueFamilyProperties props) +{ + struct vk_cmdpool *pool = pl_alloc_ptr(NULL, pool); + *pool = (struct vk_cmdpool) { + .vk = vk, + .props = props, + .qf = qf, + .queues = pl_calloc(pool, qnum, sizeof(VkQueue)), + .num_queues = qnum, + }; + + for (int n = 0; n < qnum; n++) + vk->GetDeviceQueue(vk->dev, qf, n, &pool->queues[n]); + + VkCommandPoolCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | + VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = qf, + }; + + VK(vk->CreateCommandPool(vk->dev, &cinfo, PL_VK_ALLOC, &pool->pool)); + return pool; + +error: + vk_cmdpool_destroy(pool); + vk->failed = true; + return NULL; +} + +void vk_cmdpool_destroy(struct vk_cmdpool *pool) +{ + if (!pool) + return; + + for (int i = 0; i < pool->cmds.num; i++) + vk_cmd_destroy(pool->cmds.elem[i]); + + struct vk_ctx *vk = pool->vk; + vk->DestroyCommandPool(vk->dev, pool->pool, PL_VK_ALLOC); + pl_free(pool); +} + +struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag) +{ + struct vk_ctx *vk = pool->vk; + + // Garbage collect the cmdpool first, to increase the chances of getting + // an already-available command buffer. 
+ vk_poll_commands(vk, 0); + + struct vk_cmd *cmd = NULL; + pl_mutex_lock(&vk->lock); + if (!PL_ARRAY_POP(pool->cmds, &cmd)) { + cmd = vk_cmd_create(pool); + if (!cmd) { + pl_mutex_unlock(&vk->lock); + goto error; + } + } + + cmd->qindex = pool->idx_queues; + cmd->queue = pool->queues[cmd->qindex]; + pl_mutex_unlock(&vk->lock); + + VkCommandBufferBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + }; + + VK(vk->BeginCommandBuffer(cmd->buf, &binfo)); + + debug_tag = PL_DEF(debug_tag, "vk_cmd"); + PL_VK_NAME_HANDLE(COMMAND_BUFFER, cmd->buf, debug_tag); + PL_VK_NAME(SEMAPHORE, cmd->sync.sem, debug_tag); + + cmd->sync.value++; + vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, cmd->sync); + return cmd; + +error: + // Something has to be seriously messed up if we get to this point + vk_cmd_destroy(cmd); + vk->failed = true; + return NULL; +} + +static VkResult vk_queue_submit2(struct vk_ctx *vk, VkQueue queue, + const VkSubmitInfo2 *info2, VkFence fence) +{ + if (vk->QueueSubmit2KHR) + return vk->QueueSubmit2KHR(queue, 1, info2, fence); + + const uint32_t num_deps = info2->waitSemaphoreInfoCount; + const uint32_t num_sigs = info2->signalSemaphoreInfoCount; + const uint32_t num_cmds = info2->commandBufferInfoCount; + + void *tmp = pl_tmp(NULL); + VkSemaphore *deps = pl_calloc_ptr(tmp, num_deps, deps); + VkPipelineStageFlags *masks = pl_calloc_ptr(tmp, num_deps, masks); + uint64_t *depvals = pl_calloc_ptr(tmp, num_deps, depvals); + VkSemaphore *sigs = pl_calloc_ptr(tmp, num_sigs, sigs); + uint64_t *sigvals = pl_calloc_ptr(tmp, num_sigs, sigvals); + VkCommandBuffer *cmds = pl_calloc_ptr(tmp, num_cmds, cmds); + + for (int i = 0; i < num_deps; i++) { + deps[i] = info2->pWaitSemaphoreInfos[i].semaphore; + masks[i] = info2->pWaitSemaphoreInfos[i].stageMask; + depvals[i] = info2->pWaitSemaphoreInfos[i].value; + } + for (int i = 0; i < num_sigs; i++) { + sigs[i] = info2->pSignalSemaphoreInfos[i].semaphore; + sigvals[i] = info2->pSignalSemaphoreInfos[i].value; + } + for (int i = 0; i < num_cmds; i++) + cmds[i] = info2->pCommandBufferInfos[i].commandBuffer; + + const VkTimelineSemaphoreSubmitInfo tinfo = { + .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO, + .pNext = info2->pNext, + .waitSemaphoreValueCount = num_deps, + .pWaitSemaphoreValues = depvals, + .signalSemaphoreValueCount = num_sigs, + .pSignalSemaphoreValues = sigvals, + }; + + const VkSubmitInfo info = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .pNext = &tinfo, + .waitSemaphoreCount = num_deps, + .pWaitSemaphores = deps, + .pWaitDstStageMask = masks, + .commandBufferCount = num_cmds, + .pCommandBuffers = cmds, + .signalSemaphoreCount = num_sigs, + .pSignalSemaphores = sigs, + }; + + VkResult res = vk->QueueSubmit(queue, 1, &info, fence); + pl_free(tmp); + return res; +} + +bool vk_cmd_submit(struct vk_cmd **pcmd) +{ + struct vk_cmd *cmd = *pcmd; + if (!cmd) + return true; + + *pcmd = NULL; + struct vk_cmdpool *pool = cmd->pool; + struct vk_ctx *vk = pool->vk; + + VK(vk->EndCommandBuffer(cmd->buf)); + + VkSubmitInfo2 sinfo = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2, + .waitSemaphoreInfoCount = cmd->deps.num, + .pWaitSemaphoreInfos = cmd->deps.elem, + .signalSemaphoreInfoCount = cmd->sigs.num, + .pSignalSemaphoreInfos = cmd->sigs.elem, + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &(VkCommandBufferSubmitInfo) { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = cmd->buf, + }, + }; + + if (pl_msg_test(vk->log, 
PL_LOG_TRACE)) { + PL_TRACE(vk, "Submitting command %p on queue %p (QF %d):", + (void *) cmd->buf, (void *) cmd->queue, pool->qf); + for (int n = 0; n < cmd->deps.num; n++) { + PL_TRACE(vk, " waits on semaphore 0x%"PRIx64" = %"PRIu64, + (uint64_t) cmd->deps.elem[n].semaphore, cmd->deps.elem[n].value); + } + for (int n = 0; n < cmd->sigs.num; n++) { + PL_TRACE(vk, " signals semaphore 0x%"PRIx64" = %"PRIu64, + (uint64_t) cmd->sigs.elem[n].semaphore, cmd->sigs.elem[n].value); + } + if (cmd->callbacks.num) + PL_TRACE(vk, " signals %d callbacks", cmd->callbacks.num); + } + + vk->lock_queue(vk->queue_ctx, pool->qf, cmd->qindex); + VkResult res = vk_queue_submit2(vk, cmd->queue, &sinfo, VK_NULL_HANDLE); + vk->unlock_queue(vk->queue_ctx, pool->qf, cmd->qindex); + PL_VK_ASSERT(res, "vkQueueSubmit2"); + + pl_mutex_lock(&vk->lock); + PL_ARRAY_APPEND(vk->alloc, vk->cmds_pending, cmd); + pl_mutex_unlock(&vk->lock); + return true; + +error: + vk_cmd_reset(cmd); + pl_mutex_lock(&vk->lock); + PL_ARRAY_APPEND(pool, pool->cmds, cmd); + pl_mutex_unlock(&vk->lock); + vk->failed = true; + return false; +} + +bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout) +{ + bool ret = false; + pl_mutex_lock(&vk->lock); + + while (vk->cmds_pending.num) { + struct vk_cmd *cmd = vk->cmds_pending.elem[0]; + struct vk_cmdpool *pool = cmd->pool; + pl_mutex_unlock(&vk->lock); // don't hold mutex while blocking + if (vk_cmd_poll(cmd, timeout) == VK_TIMEOUT) + return ret; + pl_mutex_lock(&vk->lock); + if (!vk->cmds_pending.num || vk->cmds_pending.elem[0] != cmd) + continue; // another thread modified this state while blocking + + PL_TRACE(vk, "VkSemaphore signalled: 0x%"PRIx64" = %"PRIu64, + (uint64_t) cmd->sync.sem, cmd->sync.value); + PL_ARRAY_REMOVE_AT(vk->cmds_pending, 0); // remove before callbacks + vk_cmd_reset(cmd); + PL_ARRAY_APPEND(pool, pool->cmds, cmd); + ret = true; + + // If we've successfully spent some time waiting for at least one + // command, disable the timeout. This has the dual purpose of both + // making sure we don't over-wait due to repeat timeout application, + // but also makes sure we don't block on future commands if we've + // already spend time waiting for one. + timeout = 0; + } + + pl_mutex_unlock(&vk->lock); + return ret; +} + +void vk_rotate_queues(struct vk_ctx *vk) +{ + pl_mutex_lock(&vk->lock); + + // Rotate the queues to ensure good parallelism across frames + for (int i = 0; i < vk->pools.num; i++) { + struct vk_cmdpool *pool = vk->pools.elem[i]; + pool->idx_queues = (pool->idx_queues + 1) % pool->num_queues; + PL_TRACE(vk, "QF %d: %d/%d", pool->qf, pool->idx_queues, pool->num_queues); + } + + pl_mutex_unlock(&vk->lock); +} + +void vk_wait_idle(struct vk_ctx *vk) +{ + while (vk_poll_commands(vk, UINT64_MAX)) ; +} diff --git a/src/vulkan/command.h b/src/vulkan/command.h new file mode 100644 index 0000000..4c70482 --- /dev/null +++ b/src/vulkan/command.h @@ -0,0 +1,142 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once +#include "common.h" + +// Since lots of vulkan operations need to be done lazily once the affected +// resources are no longer in use, provide an abstraction for tracking these. +// In practice, these are only checked and run when submitting new commands, so +// the actual execution may be delayed by a frame. +typedef void (*vk_cb)(void *p, void *arg); + +struct vk_callback { + vk_cb run; + void *priv; + void *arg; +}; + +// Associate a callback with the completion of all currently pending commands. +// This will essentially run once the device is completely idle. +void vk_dev_callback(struct vk_ctx *vk, vk_cb callback, + const void *priv, const void *arg); + +// Helper wrapper around command buffers that also track dependencies, +// callbacks and synchronization primitives +// +// Thread-safety: Unsafe +struct vk_cmd { + struct vk_cmdpool *pool; // pool it was allocated from + pl_vulkan_sem sync; // pending execution, tied to lifetime of device + VkQueue queue; // the submission queue (for recording/pending) + int qindex; // the index of `queue` in `pool` + VkCommandBuffer buf; // the command buffer itself + // Command dependencies and signals. Not owned by the vk_cmd. + PL_ARRAY(VkSemaphoreSubmitInfo) deps; + PL_ARRAY(VkSemaphoreSubmitInfo) sigs; + // "Callbacks" to fire once a command completes. These are used for + // multiple purposes, ranging from resource deallocation to fencing. + PL_ARRAY(struct vk_callback) callbacks; +}; + +// Associate a callback with the completion of the current command. This +// function will be run once the command completes, or shortly thereafter. +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, + const void *priv, const void *arg); + +// Associate a raw dependency for the current command. This semaphore must +// signal by the corresponding stage before the command may execute. +void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem dep); + +// Associate a raw signal with the current command. This semaphore will signal +// after the given stage completes. +void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage, pl_vulkan_sem sig); + +// Compatibility wrappers for vkCmdPipelineBarrier2 (works with pre-1.3) +void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info); + +// Synchronization scope +struct vk_sync_scope { + pl_vulkan_sem sync; // semaphore of last access + VkQueue queue; // source queue of last access + VkPipelineStageFlags2 stage;// stage bitmask of last access + VkAccessFlags2 access; // access type bitmask +}; + +// Synchronization primitive +struct vk_sem { + struct vk_sync_scope read, write; +}; + +// Updates the `vk_sem` state for a given access. If `is_trans` is set, this +// access is treated as a write (since it alters the resource's state). +// +// Returns a struct describing the previous access to a resource. A pipeline +// barrier is only required if the previous access scope is nonzero. 
+struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem, + VkPipelineStageFlags2 stage, + VkAccessFlags2 access, bool is_trans); + +// Command pool / queue family hybrid abstraction +struct vk_cmdpool { + struct vk_ctx *vk; + VkQueueFamilyProperties props; + int qf; // queue family index + VkCommandPool pool; + VkQueue *queues; + int num_queues; + int idx_queues; + // Command buffers associated with this queue. These are available for + // re-recording + PL_ARRAY(struct vk_cmd *) cmds; +}; + +// Set up a vk_cmdpool corresponding to a queue family. `qnum` may be less than +// `props.queueCount`, to restrict the number of queues in this queue family. +struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum, + VkQueueFamilyProperties props); + +void vk_cmdpool_destroy(struct vk_cmdpool *pool); + +// Fetch a command buffer from a command pool and begin recording to it. +// Returns NULL on failure. +struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag); + +// Finish recording a command buffer and submit it for execution. This function +// takes over ownership of **cmd, and sets *cmd to NULL in doing so. +bool vk_cmd_submit(struct vk_cmd **cmd); + +// Block until some commands complete executing. This is the only function that +// actually processes the callbacks. Will wait at most `timeout` nanoseconds +// for the completion of any command. The timeout may also be passed as 0, in +// which case this function will not block, but only poll for completed +// commands. Returns whether any forward progress was made. +// +// This does *not* flush any queued commands, forgetting to do so may result +// in infinite loops if waiting for the completion of callbacks that were +// never flushed! +bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout); + +// Rotate through queues in each command pool. Call this once per frame, after +// submitting all of the command buffers for that frame. Calling this more +// often than that is possible but bad for performance. +void vk_rotate_queues(struct vk_ctx *vk); + +// Wait until all commands are complete, i.e. the device is idle. This is +// basically equivalent to calling `vk_poll_commands` with a timeout of +// UINT64_MAX until it returns `false`. +void vk_wait_idle(struct vk_ctx *vk); diff --git a/src/vulkan/common.h b/src/vulkan/common.h new file mode 100644 index 0000000..31b309e --- /dev/null +++ b/src/vulkan/common.h @@ -0,0 +1,234 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#define VK_NO_PROTOTYPES +#define VK_ENABLE_BETA_EXTENSIONS // for VK_KHR_portability_subset +#define VK_USE_PLATFORM_METAL_EXT + +#include "../common.h" +#include "../log.h" +#include "../pl_thread.h" + +#include <libplacebo/vulkan.h> + +#ifdef PL_HAVE_WIN32 +#include <windows.h> +#include <vulkan/vulkan_win32.h> +#endif + +// Vulkan allows the optional use of a custom allocator. We don't need one but +// mark this parameter with a better name in case we ever decide to change this +// in the future. (And to make the code more readable) +#define PL_VK_ALLOC NULL + +// Type of a vulkan function that needs to be loaded +#define PL_VK_FUN(name) PFN_vk##name name + +// Load a vulkan instance-level extension function directly (on the stack) +#define PL_VK_LOAD_FUN(inst, name, get_addr) \ + PL_VK_FUN(name) = (PFN_vk##name) get_addr(inst, "vk" #name); + +#ifndef VK_VENDOR_ID_NVIDIA +#define VK_VENDOR_ID_NVIDIA 0x10DE +#endif + +// Shared struct used to hold vulkan context information +struct vk_ctx { + pl_mutex lock; + pl_vulkan vulkan; + void *alloc; // host allocations bound to the lifetime of this vk_ctx + struct vk_malloc *ma; // VRAM malloc layer + pl_vk_inst internal_instance; + pl_log log; + VkInstance inst; + VkPhysicalDevice physd; + VkPhysicalDeviceProperties props; + VkPhysicalDeviceFeatures2 features; + uint32_t api_ver; // device API version + VkDevice dev; + bool imported; // device was not created by us + + // Generic error flag for catching "failed" devices + bool failed; + + // Enabled extensions + PL_ARRAY(const char *) exts; + + // Command pools (one per queue family) + PL_ARRAY(struct vk_cmdpool *) pools; + + // Pointers into `pools` (always set) + struct vk_cmdpool *pool_graphics; + struct vk_cmdpool *pool_compute; + struct vk_cmdpool *pool_transfer; + + // Queue locking functions + PL_ARRAY(PL_ARRAY(pl_mutex)) queue_locks; + void (*lock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx); + void (*unlock_queue)(void *queue_ctx, uint32_t qf, uint32_t idx); + void *queue_ctx; + + // Pending commands. 
These are shared for the entire mpvk_ctx to ensure + // submission and callbacks are FIFO + PL_ARRAY(struct vk_cmd *) cmds_pending; // submitted but not completed + + // Pending callbacks that still need to be drained before processing + // callbacks for the next command (in case commands are recursively being + // polled from another callback) + const struct vk_callback *pending_callbacks; + int num_pending_callbacks; + + // Instance-level function pointers + PL_VK_FUN(CreateDevice); + PL_VK_FUN(EnumerateDeviceExtensionProperties); + PL_VK_FUN(GetDeviceProcAddr); + PL_VK_FUN(GetInstanceProcAddr); + PL_VK_FUN(GetPhysicalDeviceExternalBufferProperties); + PL_VK_FUN(GetPhysicalDeviceExternalSemaphoreProperties); + PL_VK_FUN(GetPhysicalDeviceFeatures2KHR); + PL_VK_FUN(GetPhysicalDeviceFormatProperties); + PL_VK_FUN(GetPhysicalDeviceFormatProperties2KHR); + PL_VK_FUN(GetPhysicalDeviceImageFormatProperties2KHR); + PL_VK_FUN(GetPhysicalDeviceMemoryProperties); + PL_VK_FUN(GetPhysicalDeviceProperties); + PL_VK_FUN(GetPhysicalDeviceProperties2); + PL_VK_FUN(GetPhysicalDeviceQueueFamilyProperties); + PL_VK_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR); + PL_VK_FUN(GetPhysicalDeviceSurfaceFormatsKHR); + PL_VK_FUN(GetPhysicalDeviceSurfacePresentModesKHR); + PL_VK_FUN(GetPhysicalDeviceSurfaceSupportKHR); + + // Device-level function pointers + PL_VK_FUN(AcquireNextImageKHR); + PL_VK_FUN(AllocateCommandBuffers); + PL_VK_FUN(AllocateDescriptorSets); + PL_VK_FUN(AllocateMemory); + PL_VK_FUN(BeginCommandBuffer); + PL_VK_FUN(BindBufferMemory); + PL_VK_FUN(BindImageMemory); + PL_VK_FUN(CmdBeginDebugUtilsLabelEXT); + PL_VK_FUN(CmdBeginRenderPass); + PL_VK_FUN(CmdBindDescriptorSets); + PL_VK_FUN(CmdBindIndexBuffer); + PL_VK_FUN(CmdBindPipeline); + PL_VK_FUN(CmdBindVertexBuffers); + PL_VK_FUN(CmdBlitImage); + PL_VK_FUN(CmdClearColorImage); + PL_VK_FUN(CmdCopyBuffer); + PL_VK_FUN(CmdCopyBufferToImage); + PL_VK_FUN(CmdCopyImage); + PL_VK_FUN(CmdCopyImageToBuffer); + PL_VK_FUN(CmdDispatch); + PL_VK_FUN(CmdDraw); + PL_VK_FUN(CmdDrawIndexed); + PL_VK_FUN(CmdEndDebugUtilsLabelEXT); + PL_VK_FUN(CmdEndRenderPass); + PL_VK_FUN(CmdPipelineBarrier); + PL_VK_FUN(CmdPipelineBarrier2KHR); + PL_VK_FUN(CmdPushConstants); + PL_VK_FUN(CmdPushDescriptorSetKHR); + PL_VK_FUN(CmdResetQueryPool); + PL_VK_FUN(CmdSetScissor); + PL_VK_FUN(CmdSetViewport); + PL_VK_FUN(CmdUpdateBuffer); + PL_VK_FUN(CmdWriteTimestamp); + PL_VK_FUN(CreateBuffer); + PL_VK_FUN(CreateBufferView); + PL_VK_FUN(CreateCommandPool); + PL_VK_FUN(CreateComputePipelines); + PL_VK_FUN(CreateDebugReportCallbackEXT); + PL_VK_FUN(CreateDescriptorPool); + PL_VK_FUN(CreateDescriptorSetLayout); + PL_VK_FUN(CreateFence); + PL_VK_FUN(CreateFramebuffer); + PL_VK_FUN(CreateGraphicsPipelines); + PL_VK_FUN(CreateImage); + PL_VK_FUN(CreateImageView); + PL_VK_FUN(CreatePipelineCache); + PL_VK_FUN(CreatePipelineLayout); + PL_VK_FUN(CreateQueryPool); + PL_VK_FUN(CreateRenderPass); + PL_VK_FUN(CreateSampler); + PL_VK_FUN(CreateSemaphore); + PL_VK_FUN(CreateShaderModule); + PL_VK_FUN(CreateSwapchainKHR); + PL_VK_FUN(DestroyBuffer); + PL_VK_FUN(DestroyBufferView); + PL_VK_FUN(DestroyCommandPool); + PL_VK_FUN(DestroyDebugReportCallbackEXT); + PL_VK_FUN(DestroyDescriptorPool); + PL_VK_FUN(DestroyDescriptorSetLayout); + PL_VK_FUN(DestroyDevice); + PL_VK_FUN(DestroyFence); + PL_VK_FUN(DestroyFramebuffer); + PL_VK_FUN(DestroyImage); + PL_VK_FUN(DestroyImageView); + PL_VK_FUN(DestroyInstance); + PL_VK_FUN(DestroyPipeline); + PL_VK_FUN(DestroyPipelineCache); + PL_VK_FUN(DestroyPipelineLayout); 
+ PL_VK_FUN(DestroyQueryPool); + PL_VK_FUN(DestroyRenderPass); + PL_VK_FUN(DestroySampler); + PL_VK_FUN(DestroySemaphore); + PL_VK_FUN(DestroyShaderModule); + PL_VK_FUN(DestroySwapchainKHR); + PL_VK_FUN(DeviceWaitIdle); + PL_VK_FUN(EndCommandBuffer); + PL_VK_FUN(FlushMappedMemoryRanges); + PL_VK_FUN(FreeCommandBuffers); + PL_VK_FUN(FreeMemory); + PL_VK_FUN(GetBufferMemoryRequirements); + PL_VK_FUN(GetDeviceQueue); + PL_VK_FUN(GetImageDrmFormatModifierPropertiesEXT); + PL_VK_FUN(GetImageMemoryRequirements2); + PL_VK_FUN(GetImageSubresourceLayout); + PL_VK_FUN(GetMemoryFdKHR); + PL_VK_FUN(GetMemoryFdPropertiesKHR); + PL_VK_FUN(GetMemoryHostPointerPropertiesEXT); + PL_VK_FUN(GetPipelineCacheData); + PL_VK_FUN(GetQueryPoolResults); + PL_VK_FUN(GetSemaphoreFdKHR); + PL_VK_FUN(GetSwapchainImagesKHR); + PL_VK_FUN(InvalidateMappedMemoryRanges); + PL_VK_FUN(MapMemory); + PL_VK_FUN(QueuePresentKHR); + PL_VK_FUN(QueueSubmit); + PL_VK_FUN(QueueSubmit2KHR); + PL_VK_FUN(QueueWaitIdle); + PL_VK_FUN(ResetFences); + PL_VK_FUN(ResetQueryPool); + PL_VK_FUN(SetDebugUtilsObjectNameEXT); + PL_VK_FUN(SetHdrMetadataEXT); + PL_VK_FUN(UpdateDescriptorSets); + PL_VK_FUN(WaitForFences); + PL_VK_FUN(WaitSemaphores); + +#ifdef PL_HAVE_WIN32 + PL_VK_FUN(GetMemoryWin32HandleKHR); + PL_VK_FUN(GetSemaphoreWin32HandleKHR); +#endif + +#ifdef VK_EXT_metal_objects + PL_VK_FUN(ExportMetalObjectsEXT); +#endif +#ifdef VK_EXT_full_screen_exclusive + PL_VK_FUN(AcquireFullScreenExclusiveModeEXT); +#endif +}; diff --git a/src/vulkan/context.c b/src/vulkan/context.c new file mode 100644 index 0000000..ad8a859 --- /dev/null +++ b/src/vulkan/context.c @@ -0,0 +1,1704 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" +#include "command.h" +#include "utils.h" +#include "gpu.h" + +#ifdef PL_HAVE_VK_PROC_ADDR +VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetInstanceProcAddr( + VkInstance instance, + const char* pName); +#endif + +const struct pl_vk_inst_params pl_vk_inst_default_params = {0}; + +struct vk_fun { + const char *name; + size_t offset; + bool device_level; +}; + +struct vk_ext { + const char *name; + const struct vk_fun *funs; +}; + +#define PL_VK_INST_FUN(N) \ + { .name = "vk" #N, \ + .offset = offsetof(struct vk_ctx, N), \ + } + +#define PL_VK_DEV_FUN(N) \ + { .name = "vk" #N, \ + .offset = offsetof(struct vk_ctx, N), \ + .device_level = true, \ + } + +// Table of optional vulkan instance extensions +static const char *vk_instance_extensions[] = { + VK_KHR_SURFACE_EXTENSION_NAME, + VK_EXT_SWAPCHAIN_COLOR_SPACE_EXTENSION_NAME, + VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME, + VK_KHR_GET_SURFACE_CAPABILITIES_2_EXTENSION_NAME, +}; + +// List of mandatory instance-level function pointers, including functions +// associated with mandatory instance extensions +static const struct vk_fun vk_inst_funs[] = { + PL_VK_INST_FUN(CreateDevice), + PL_VK_INST_FUN(EnumerateDeviceExtensionProperties), + PL_VK_INST_FUN(GetDeviceProcAddr), + PL_VK_INST_FUN(GetPhysicalDeviceExternalBufferProperties), + PL_VK_INST_FUN(GetPhysicalDeviceExternalSemaphoreProperties), + PL_VK_INST_FUN(GetPhysicalDeviceFeatures2KHR), + PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties), + PL_VK_INST_FUN(GetPhysicalDeviceFormatProperties2KHR), + PL_VK_INST_FUN(GetPhysicalDeviceImageFormatProperties2KHR), + PL_VK_INST_FUN(GetPhysicalDeviceMemoryProperties), + PL_VK_INST_FUN(GetPhysicalDeviceProperties), + PL_VK_INST_FUN(GetPhysicalDeviceProperties2), + PL_VK_INST_FUN(GetPhysicalDeviceQueueFamilyProperties), + + // These are not actually mandatory, but they're universal enough that we + // just load them unconditionally (in lieu of not having proper support for + // loading arbitrary instance extensions). Their use is generally guarded + // behind various VkSurfaceKHR values already being provided by the API + // user (implying this extension is loaded). 
+ PL_VK_INST_FUN(GetPhysicalDeviceSurfaceCapabilitiesKHR), + PL_VK_INST_FUN(GetPhysicalDeviceSurfaceFormatsKHR), + PL_VK_INST_FUN(GetPhysicalDeviceSurfacePresentModesKHR), + PL_VK_INST_FUN(GetPhysicalDeviceSurfaceSupportKHR), +}; + +// Table of vulkan device extensions and functions they load, including +// functions exported by dependent instance-level extensions +static const struct vk_ext vk_device_extensions[] = { + { + .name = VK_KHR_SWAPCHAIN_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(AcquireNextImageKHR), + PL_VK_DEV_FUN(CreateSwapchainKHR), + PL_VK_DEV_FUN(DestroySwapchainKHR), + PL_VK_DEV_FUN(GetSwapchainImagesKHR), + PL_VK_DEV_FUN(QueuePresentKHR), + {0} + }, + }, { + .name = VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(CmdPushDescriptorSetKHR), + {0} + }, + }, { + .name = VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryFdKHR), + {0} + }, + }, { + .name = VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryFdPropertiesKHR), + {0} + }, +#ifdef PL_HAVE_WIN32 + }, { + .name = VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryWin32HandleKHR), + {0} + }, +#endif + }, { + .name = VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetMemoryHostPointerPropertiesEXT), + {0} + }, + }, { + .name = VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetSemaphoreFdKHR), + {0} + }, +#ifdef PL_HAVE_WIN32 + }, { + .name = VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetSemaphoreWin32HandleKHR), + {0} + }, +#endif + }, { + .name = VK_EXT_PCI_BUS_INFO_EXTENSION_NAME, + }, { + .name = VK_EXT_HDR_METADATA_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(SetHdrMetadataEXT), + {0} + }, + }, { + .name = VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(GetImageDrmFormatModifierPropertiesEXT), + {0} + }, +#ifdef VK_KHR_portability_subset + }, { + .name = VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, +#endif +#ifdef VK_EXT_metal_objects + }, { + .name = VK_EXT_METAL_OBJECTS_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(ExportMetalObjectsEXT), + {0} + }, +#endif +#ifdef VK_EXT_full_screen_exclusive + }, { + .name = VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(AcquireFullScreenExclusiveModeEXT), + {0} + }, +#endif + }, { + .name = VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME, + .funs = (const struct vk_fun[]) { + PL_VK_DEV_FUN(CmdPipelineBarrier2KHR), + PL_VK_DEV_FUN(QueueSubmit2KHR), + {0} + }, + }, +}; + +// Make sure to keep this in sync with the above! 
+const char * const pl_vulkan_recommended_extensions[] = { + VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, + VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME, + VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME, + VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME, +#ifdef PL_HAVE_WIN32 + VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME, +#endif + VK_EXT_PCI_BUS_INFO_EXTENSION_NAME, + VK_EXT_HDR_METADATA_EXTENSION_NAME, + VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME, +#ifdef VK_KHR_portability_subset + VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, +#endif +#ifdef VK_EXT_metal_objects + VK_EXT_METAL_OBJECTS_EXTENSION_NAME, +#endif +#ifdef VK_EXT_full_screen_exclusive + VK_EXT_FULL_SCREEN_EXCLUSIVE_EXTENSION_NAME, +#endif + VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME, +}; + +const int pl_vulkan_num_recommended_extensions = + PL_ARRAY_SIZE(pl_vulkan_recommended_extensions); + +// +1 because VK_KHR_swapchain is not automatically pulled in +static_assert(PL_ARRAY_SIZE(pl_vulkan_recommended_extensions) + 1 == + PL_ARRAY_SIZE(vk_device_extensions), + "pl_vulkan_recommended_extensions out of sync with " + "vk_device_extensions?"); + +// Recommended features; keep in sync with libavutil vulkan hwcontext +static const VkPhysicalDeviceVulkan13Features recommended_vk13 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES, + .computeFullSubgroups = true, + .maintenance4 = true, + .shaderZeroInitializeWorkgroupMemory = true, + .synchronization2 = true, +}; + +static const VkPhysicalDeviceVulkan12Features recommended_vk12 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + .pNext = (void *) &recommended_vk13, + .bufferDeviceAddress = true, + .storagePushConstant8 = true, + .shaderInt8 = true, + .shaderFloat16 = true, + .shaderSharedInt64Atomics = true, + .storageBuffer8BitAccess = true, + .uniformAndStorageBuffer8BitAccess = true, + .vulkanMemoryModel = true, + .vulkanMemoryModelDeviceScope = true, +}; + +static const VkPhysicalDeviceVulkan11Features recommended_vk11 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, + .pNext = (void *) &recommended_vk12, + .samplerYcbcrConversion = true, + .storagePushConstant16 = true, +}; + +const VkPhysicalDeviceFeatures2 pl_vulkan_recommended_features = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, + .pNext = (void *) &recommended_vk11, + .features = { + .shaderImageGatherExtended = true, + .shaderStorageImageReadWithoutFormat = true, + .shaderStorageImageWriteWithoutFormat = true, + + // Needed for GPU-assisted validation, but not harmful to enable + .fragmentStoresAndAtomics = true, + .vertexPipelineStoresAndAtomics = true, + .shaderInt64 = true, + } +}; + +// Required features +static const VkPhysicalDeviceVulkan12Features required_vk12 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + .hostQueryReset = true, + .timelineSemaphore = true, +}; + +static const VkPhysicalDeviceVulkan11Features required_vk11 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, + .pNext = (void *) &required_vk12, +}; + +const VkPhysicalDeviceFeatures2 pl_vulkan_required_features = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, + .pNext = (void *) &required_vk11, +}; + +static bool check_required_features(struct vk_ctx *vk) +{ + #define CHECK_FEATURE(maj, min, feat) do { \ + const VkPhysicalDeviceVulkan##maj##min##Features *f; \ + f = vk_find_struct(&vk->features, \ + 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_##maj##_##min##_FEATURES); \ + if (!f || !f->feat) { \ + PL_ERR(vk, "Missing device feature: " #feat); \ + return false; \ + } \ + } while (0) + + CHECK_FEATURE(1, 2, hostQueryReset); + CHECK_FEATURE(1, 2, timelineSemaphore); + + #undef CHECK_FEATURE + return true; +} + + +// List of mandatory device-level functions +// +// Note: Also includes VK_EXT_debug_utils functions, even though they aren't +// mandatory, simply because we load that extension in a special way. +static const struct vk_fun vk_dev_funs[] = { + PL_VK_DEV_FUN(AllocateCommandBuffers), + PL_VK_DEV_FUN(AllocateDescriptorSets), + PL_VK_DEV_FUN(AllocateMemory), + PL_VK_DEV_FUN(BeginCommandBuffer), + PL_VK_DEV_FUN(BindBufferMemory), + PL_VK_DEV_FUN(BindImageMemory), + PL_VK_DEV_FUN(CmdBeginDebugUtilsLabelEXT), + PL_VK_DEV_FUN(CmdBeginRenderPass), + PL_VK_DEV_FUN(CmdBindDescriptorSets), + PL_VK_DEV_FUN(CmdBindIndexBuffer), + PL_VK_DEV_FUN(CmdBindPipeline), + PL_VK_DEV_FUN(CmdBindVertexBuffers), + PL_VK_DEV_FUN(CmdBlitImage), + PL_VK_DEV_FUN(CmdClearColorImage), + PL_VK_DEV_FUN(CmdCopyBuffer), + PL_VK_DEV_FUN(CmdCopyBufferToImage), + PL_VK_DEV_FUN(CmdCopyImage), + PL_VK_DEV_FUN(CmdCopyImageToBuffer), + PL_VK_DEV_FUN(CmdDispatch), + PL_VK_DEV_FUN(CmdDraw), + PL_VK_DEV_FUN(CmdDrawIndexed), + PL_VK_DEV_FUN(CmdEndDebugUtilsLabelEXT), + PL_VK_DEV_FUN(CmdEndRenderPass), + PL_VK_DEV_FUN(CmdPipelineBarrier), + PL_VK_DEV_FUN(CmdPushConstants), + PL_VK_DEV_FUN(CmdResetQueryPool), + PL_VK_DEV_FUN(CmdSetScissor), + PL_VK_DEV_FUN(CmdSetViewport), + PL_VK_DEV_FUN(CmdUpdateBuffer), + PL_VK_DEV_FUN(CmdWriteTimestamp), + PL_VK_DEV_FUN(CreateBuffer), + PL_VK_DEV_FUN(CreateBufferView), + PL_VK_DEV_FUN(CreateCommandPool), + PL_VK_DEV_FUN(CreateComputePipelines), + PL_VK_DEV_FUN(CreateDescriptorPool), + PL_VK_DEV_FUN(CreateDescriptorSetLayout), + PL_VK_DEV_FUN(CreateFence), + PL_VK_DEV_FUN(CreateFramebuffer), + PL_VK_DEV_FUN(CreateGraphicsPipelines), + PL_VK_DEV_FUN(CreateImage), + PL_VK_DEV_FUN(CreateImageView), + PL_VK_DEV_FUN(CreatePipelineCache), + PL_VK_DEV_FUN(CreatePipelineLayout), + PL_VK_DEV_FUN(CreateQueryPool), + PL_VK_DEV_FUN(CreateRenderPass), + PL_VK_DEV_FUN(CreateSampler), + PL_VK_DEV_FUN(CreateSemaphore), + PL_VK_DEV_FUN(CreateShaderModule), + PL_VK_DEV_FUN(DestroyBuffer), + PL_VK_DEV_FUN(DestroyBufferView), + PL_VK_DEV_FUN(DestroyCommandPool), + PL_VK_DEV_FUN(DestroyDescriptorPool), + PL_VK_DEV_FUN(DestroyDescriptorSetLayout), + PL_VK_DEV_FUN(DestroyDevice), + PL_VK_DEV_FUN(DestroyFence), + PL_VK_DEV_FUN(DestroyFramebuffer), + PL_VK_DEV_FUN(DestroyImage), + PL_VK_DEV_FUN(DestroyImageView), + PL_VK_DEV_FUN(DestroyInstance), + PL_VK_DEV_FUN(DestroyPipeline), + PL_VK_DEV_FUN(DestroyPipelineCache), + PL_VK_DEV_FUN(DestroyPipelineLayout), + PL_VK_DEV_FUN(DestroyQueryPool), + PL_VK_DEV_FUN(DestroyRenderPass), + PL_VK_DEV_FUN(DestroySampler), + PL_VK_DEV_FUN(DestroySemaphore), + PL_VK_DEV_FUN(DestroyShaderModule), + PL_VK_DEV_FUN(DeviceWaitIdle), + PL_VK_DEV_FUN(EndCommandBuffer), + PL_VK_DEV_FUN(FlushMappedMemoryRanges), + PL_VK_DEV_FUN(FreeCommandBuffers), + PL_VK_DEV_FUN(FreeMemory), + PL_VK_DEV_FUN(GetBufferMemoryRequirements), + PL_VK_DEV_FUN(GetDeviceQueue), + PL_VK_DEV_FUN(GetImageMemoryRequirements2), + PL_VK_DEV_FUN(GetImageSubresourceLayout), + PL_VK_DEV_FUN(GetPipelineCacheData), + PL_VK_DEV_FUN(GetQueryPoolResults), + PL_VK_DEV_FUN(InvalidateMappedMemoryRanges), + PL_VK_DEV_FUN(MapMemory), + PL_VK_DEV_FUN(QueueSubmit), + PL_VK_DEV_FUN(QueueWaitIdle), + PL_VK_DEV_FUN(ResetFences), + 
PL_VK_DEV_FUN(ResetQueryPool), + PL_VK_DEV_FUN(SetDebugUtilsObjectNameEXT), + PL_VK_DEV_FUN(UpdateDescriptorSets), + PL_VK_DEV_FUN(WaitForFences), + PL_VK_DEV_FUN(WaitSemaphores), +}; + +static void load_vk_fun(struct vk_ctx *vk, const struct vk_fun *fun) +{ + PFN_vkVoidFunction *pfn = (void *) ((uintptr_t) vk + (ptrdiff_t) fun->offset); + + if (fun->device_level) { + *pfn = vk->GetDeviceProcAddr(vk->dev, fun->name); + } else { + *pfn = vk->GetInstanceProcAddr(vk->inst, fun->name); + }; + + if (!*pfn) { + // Some functions get their extension suffix stripped when promoted + // to core. As a very simple work-around to this, try loading the + // function a second time with the reserved suffixes stripped. + static const char *ext_suffixes[] = { "KHR", "EXT" }; + pl_str fun_name = pl_str0(fun->name); + char buf[64]; + + for (int i = 0; i < PL_ARRAY_SIZE(ext_suffixes); i++) { + if (!pl_str_eatend0(&fun_name, ext_suffixes[i])) + continue; + + pl_assert(sizeof(buf) > fun_name.len); + snprintf(buf, sizeof(buf), "%.*s", PL_STR_FMT(fun_name)); + if (fun->device_level) { + *pfn = vk->GetDeviceProcAddr(vk->dev, buf); + } else { + *pfn = vk->GetInstanceProcAddr(vk->inst, buf); + } + return; + } + } +} + +// Private struct for pl_vk_inst +struct priv { + VkDebugUtilsMessengerEXT debug_utils_cb; +}; + +void pl_vk_inst_destroy(pl_vk_inst *inst_ptr) +{ + pl_vk_inst inst = *inst_ptr; + if (!inst) + return; + + struct priv *p = PL_PRIV(inst); + if (p->debug_utils_cb) { + PL_VK_LOAD_FUN(inst->instance, DestroyDebugUtilsMessengerEXT, inst->get_proc_addr); + DestroyDebugUtilsMessengerEXT(inst->instance, p->debug_utils_cb, PL_VK_ALLOC); + } + + PL_VK_LOAD_FUN(inst->instance, DestroyInstance, inst->get_proc_addr); + DestroyInstance(inst->instance, PL_VK_ALLOC); + pl_free_ptr((void **) inst_ptr); +} + +static VkBool32 VKAPI_PTR vk_dbg_utils_cb(VkDebugUtilsMessageSeverityFlagBitsEXT sev, + VkDebugUtilsMessageTypeFlagsEXT msgType, + const VkDebugUtilsMessengerCallbackDataEXT *data, + void *priv) +{ + pl_log log = priv; + + // Ignore errors for messages that we consider false positives + switch (data->messageIdNumber) { + case 0x7cd0911d: // VUID-VkSwapchainCreateInfoKHR-imageExtent-01274 + case 0x8928392f: // UNASSIGNED-BestPractices-NonSuccess-Result + case 0xdc18ad6b: // UNASSIGNED-BestPractices-vkAllocateMemory-small-allocation + case 0xb3d4346b: // UNASSIGNED-BestPractices-vkBindMemory-small-dedicated-allocation + case 0x6cfe18a5: // UNASSIGNED-BestPractices-SemaphoreCount + case 0x48a09f6c: // UNASSIGNED-BestPractices-pipeline-stage-flags + // profile chain expectations + case 0x30f4ac70: // VUID-VkImageCreateInfo-pNext-06811 + return false; + + case 0x5f379b89: // UNASSIGNED-BestPractices-Error-Result + if (strstr(data->pMessage, "VK_ERROR_FORMAT_NOT_SUPPORTED")) + return false; + break; + + case 0xf6a37cfa: // VUID-vkGetImageSubresourceLayout-format-04461 + // Work around https://github.com/KhronosGroup/Vulkan-Docs/issues/2109 + return false; + } + + enum pl_log_level lev; + switch (sev) { + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT: lev = PL_LOG_ERR; break; + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT: lev = PL_LOG_WARN; break; + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT: lev = PL_LOG_DEBUG; break; + case VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT: lev = PL_LOG_TRACE; break; + default: lev = PL_LOG_INFO; break; + } + + pl_msg(log, lev, "vk %s", data->pMessage); + + for (int i = 0; i < data->queueLabelCount; i++) + pl_msg(log, lev, " during %s", 
data->pQueueLabels[i].pLabelName); + for (int i = 0; i < data->cmdBufLabelCount; i++) + pl_msg(log, lev, " inside %s", data->pCmdBufLabels[i].pLabelName); + for (int i = 0; i < data->objectCount; i++) { + const VkDebugUtilsObjectNameInfoEXT *obj = &data->pObjects[i]; + pl_msg(log, lev, " using %s: %s (0x%llx)", + vk_obj_type(obj->objectType), + obj->pObjectName ? obj->pObjectName : "anon", + (unsigned long long) obj->objectHandle); + } + + // The return value of this function determines whether the call will + // be explicitly aborted (to prevent GPU errors) or not. In this case, + // we generally want this to be on for the validation errors, but nothing + // else (e.g. performance warnings) + bool is_error = (sev & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) && + (msgType & VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT); + + if (is_error) { + pl_log_stack_trace(log, lev); + pl_debug_abort(); + return true; + } + + return false; +} + +static PFN_vkGetInstanceProcAddr get_proc_addr_fallback(pl_log log, + PFN_vkGetInstanceProcAddr get_proc_addr) +{ + if (get_proc_addr) + return get_proc_addr; + +#ifdef PL_HAVE_VK_PROC_ADDR + return vkGetInstanceProcAddr; +#else + pl_fatal(log, "No `vkGetInstanceProcAddr` function provided, and " + "libplacebo built without linking against this function!"); + return NULL; +#endif +} + +#define PRINTF_VER(ver) \ + (int) VK_API_VERSION_MAJOR(ver), \ + (int) VK_API_VERSION_MINOR(ver), \ + (int) VK_API_VERSION_PATCH(ver) + +pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params) +{ + void *tmp = pl_tmp(NULL); + params = PL_DEF(params, &pl_vk_inst_default_params); + VkInstance inst = NULL; + pl_clock_t start; + + PL_ARRAY(const char *) exts = {0}; + + PFN_vkGetInstanceProcAddr get_addr; + if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr))) + goto error; + + // Query instance version support + uint32_t api_ver = VK_API_VERSION_1_0; + PL_VK_LOAD_FUN(NULL, EnumerateInstanceVersion, get_addr); + if (EnumerateInstanceVersion && EnumerateInstanceVersion(&api_ver) != VK_SUCCESS) + goto error; + + pl_debug(log, "Available instance version: %d.%d.%d", PRINTF_VER(api_ver)); + + if (params->max_api_version) { + api_ver = PL_MIN(api_ver, params->max_api_version); + pl_info(log, "Restricting API version to %d.%d.%d... 
new version %d.%d.%d", + PRINTF_VER(params->max_api_version), PRINTF_VER(api_ver)); + } + + if (api_ver < PL_VK_MIN_VERSION) { + pl_fatal(log, "Instance API version %d.%d.%d is lower than the minimum " + "required version of %d.%d.%d, cannot proceed!", + PRINTF_VER(api_ver), PRINTF_VER(PL_VK_MIN_VERSION)); + goto error; + } + + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + .pApplicationInfo = &(VkApplicationInfo) { + .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, + .apiVersion = api_ver, + }, + }; + + // Enumerate all supported layers + start = pl_clock_now(); + PL_VK_LOAD_FUN(NULL, EnumerateInstanceLayerProperties, get_addr); + uint32_t num_layers_avail = 0; + EnumerateInstanceLayerProperties(&num_layers_avail, NULL); + VkLayerProperties *layers_avail = pl_calloc_ptr(tmp, num_layers_avail, layers_avail); + EnumerateInstanceLayerProperties(&num_layers_avail, layers_avail); + pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance layers"); + + pl_debug(log, "Available layers:"); + for (int i = 0; i < num_layers_avail; i++) { + pl_debug(log, " %s (v%d.%d.%d)", layers_avail[i].layerName, + PRINTF_VER(layers_avail[i].specVersion)); + } + + PL_ARRAY(const char *) layers = {0}; + + // Sorted by priority + static const char *debug_layers[] = { + "VK_LAYER_KHRONOS_validation", + "VK_LAYER_LUNARG_standard_validation", + }; + + // This layer has to be initialized first, otherwise all sorts of weirdness + // happens (random segfaults, yum) + bool debug = params->debug; + uint32_t debug_layer = 0; // layer idx of debug layer + uint32_t debug_layer_version = 0; + if (debug) { + for (int i = 0; i < PL_ARRAY_SIZE(debug_layers); i++) { + for (int n = 0; n < num_layers_avail; n++) { + if (strcmp(debug_layers[i], layers_avail[n].layerName) != 0) + continue; + + debug_layer = n; + debug_layer_version = layers_avail[n].specVersion; + pl_info(log, "Enabling debug meta layer: %s (v%d.%d.%d)", + debug_layers[i], PRINTF_VER(debug_layer_version)); + PL_ARRAY_APPEND(tmp, layers, debug_layers[i]); + goto debug_layers_done; + } + } + + // No layer found.. + pl_warn(log, "API debugging requested but no debug meta layers present... 
ignoring"); + debug = false; + } + +debug_layers_done: ; + + for (int i = 0; i < params->num_layers; i++) + PL_ARRAY_APPEND(tmp, layers, params->layers[i]); + + for (int i = 0; i < params->num_opt_layers; i++) { + const char *layer = params->opt_layers[i]; + for (int n = 0; n < num_layers_avail; n++) { + if (strcmp(layer, layers_avail[n].layerName) == 0) { + PL_ARRAY_APPEND(tmp, layers, layer); + break; + } + } + } + + // Enumerate all supported extensions + start = pl_clock_now(); + PL_VK_LOAD_FUN(NULL, EnumerateInstanceExtensionProperties, get_addr); + uint32_t num_exts_avail = 0; + EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, NULL); + VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail); + EnumerateInstanceExtensionProperties(NULL, &num_exts_avail, exts_avail); + + struct { + VkExtensionProperties *exts; + uint32_t num_exts; + } *layer_exts = pl_calloc_ptr(tmp, num_layers_avail, layer_exts); + + // Enumerate extensions from layers + for (int i = 0; i < num_layers_avail; i++) { + VkExtensionProperties **lexts = &layer_exts[i].exts; + uint32_t *num = &layer_exts[i].num_exts; + + EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, NULL); + *lexts = pl_calloc_ptr(tmp, *num, *lexts); + EnumerateInstanceExtensionProperties(layers_avail[i].layerName, num, *lexts); + + // Replace all extensions that are already available globally by {0} + for (int j = 0; j < *num; j++) { + for (int k = 0; k < num_exts_avail; k++) { + if (strcmp((*lexts)[j].extensionName, exts_avail[k].extensionName) == 0) + (*lexts)[j] = (VkExtensionProperties) {0}; + } + } + } + + pl_log_cpu_time(log, start, pl_clock_now(), "enumerating instance extensions"); + pl_debug(log, "Available instance extensions:"); + for (int i = 0; i < num_exts_avail; i++) + pl_debug(log, " %s", exts_avail[i].extensionName); + for (int i = 0; i < num_layers_avail; i++) { + for (int j = 0; j < layer_exts[i].num_exts; j++) { + if (!layer_exts[i].exts[j].extensionName[0]) + continue; + + pl_debug(log, " %s (via %s)", + layer_exts[i].exts[j].extensionName, + layers_avail[i].layerName); + } + } + + // Add mandatory extensions + PL_ARRAY_APPEND(tmp, exts, VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); + + // Add optional extensions + for (int i = 0; i < PL_ARRAY_SIZE(vk_instance_extensions); i++) { + const char *ext = vk_instance_extensions[i]; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + break; + } + } + } + +#ifdef VK_KHR_portability_enumeration + // Required for macOS ( MoltenVK ) compatibility + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); + info.flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR; + break; + } + } +#endif + + // Add extra user extensions + for (int i = 0; i < params->num_extensions; i++) { + const char *ext = params->extensions[i]; + PL_ARRAY_APPEND(tmp, exts, ext); + + // Enable any additional layers that are required for this extension + for (int n = 0; n < num_layers_avail; n++) { + for (int j = 0; j < layer_exts[n].num_exts; j++) { + if (!layer_exts[n].exts[j].extensionName[0]) + continue; + if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName); + goto next_user_ext; + } + } + } + +next_user_ext: ; + } + + // Add extra optional 
user extensions + for (int i = 0; i < params->num_opt_extensions; i++) { + const char *ext = params->opt_extensions[i]; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + goto next_opt_user_ext; + } + } + + for (int n = 0; n < num_layers_avail; n++) { + for (int j = 0; j < layer_exts[n].num_exts; j++) { + if (!layer_exts[n].exts[j].extensionName[0]) + continue; + if (strcmp(ext, layer_exts[n].exts[j].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + PL_ARRAY_APPEND(tmp, layers, layers_avail[n].layerName); + goto next_opt_user_ext; + } + } + } + +next_opt_user_ext: ; + } + + // If debugging is enabled, load the necessary debug utils extension + if (debug) { + const char * const ext = VK_EXT_DEBUG_UTILS_EXTENSION_NAME; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + goto debug_ext_done; + } + } + + for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) { + if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + goto debug_ext_done; + } + } + + // No extension found + pl_warn(log, "API debug layers enabled but no debug report extension " + "found... ignoring. Debug messages may be spilling to " + "stdout/stderr!"); + debug = false; + } + +debug_ext_done: ; + + // Limit this to 1.3.250+ because of bugs in older versions. + if (debug && params->debug_extra && + debug_layer_version >= VK_MAKE_API_VERSION(0, 1, 3, 259)) + { + // Try enabling as many validation features as possible + static const VkValidationFeatureEnableEXT validation_features[] = { + VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT, + VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT, + VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT, + VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT, + }; + + static const VkValidationFeaturesEXT vinfo = { + .sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT, + .pEnabledValidationFeatures = validation_features, + .enabledValidationFeatureCount = PL_ARRAY_SIZE(validation_features), + }; + + const char * const ext = VK_EXT_VALIDATION_FEATURES_EXTENSION_NAME; + for (int n = 0; n < num_exts_avail; n++) { + if (strcmp(ext, exts_avail[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + vk_link_struct(&info, &vinfo); + goto debug_extra_ext_done; + } + } + + for (int n = 0; n < layer_exts[debug_layer].num_exts; n++) { + if (strcmp(ext, layer_exts[debug_layer].exts[n].extensionName) == 0) { + PL_ARRAY_APPEND(tmp, exts, ext); + vk_link_struct(&info, &vinfo); + goto debug_extra_ext_done; + } + } + + pl_warn(log, "GPU-assisted validation enabled but not supported by " + "instance, disabling..."); + } + +debug_extra_ext_done: ; + + info.ppEnabledExtensionNames = exts.elem; + info.enabledExtensionCount = exts.num; + info.ppEnabledLayerNames = layers.elem; + info.enabledLayerCount = layers.num; + + pl_info(log, "Creating vulkan instance%s", exts.num ? 
" with extensions:" : ""); + for (int i = 0; i < exts.num; i++) + pl_info(log, " %s", exts.elem[i]); + + if (layers.num) { + pl_info(log, " and layers:"); + for (int i = 0; i < layers.num; i++) + pl_info(log, " %s", layers.elem[i]); + } + + start = pl_clock_now(); + PL_VK_LOAD_FUN(NULL, CreateInstance, get_addr); + VkResult res = CreateInstance(&info, PL_VK_ALLOC, &inst); + pl_log_cpu_time(log, start, pl_clock_now(), "creating vulkan instance"); + if (res != VK_SUCCESS) { + pl_fatal(log, "Failed creating instance: %s", vk_res_str(res)); + goto error; + } + + struct pl_vk_inst_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct priv); + struct priv *p = PL_PRIV(pl_vk); + *pl_vk = (struct pl_vk_inst_t) { + .instance = inst, + .api_version = api_ver, + .get_proc_addr = get_addr, + .extensions = pl_steal(pl_vk, exts.elem), + .num_extensions = exts.num, + .layers = pl_steal(pl_vk, layers.elem), + .num_layers = layers.num, + }; + + // Set up a debug callback to catch validation messages + if (debug) { + VkDebugUtilsMessengerCreateInfoEXT dinfo = { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, + .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT, + .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, + .pfnUserCallback = vk_dbg_utils_cb, + .pUserData = (void *) log, + }; + + PL_VK_LOAD_FUN(inst, CreateDebugUtilsMessengerEXT, get_addr); + CreateDebugUtilsMessengerEXT(inst, &dinfo, PL_VK_ALLOC, &p->debug_utils_cb); + } + + pl_free(tmp); + return pl_vk; + +error: + pl_fatal(log, "Failed initializing vulkan instance"); + if (inst) { + PL_VK_LOAD_FUN(inst, DestroyInstance, get_addr); + DestroyInstance(inst, PL_VK_ALLOC); + } + pl_free(tmp); + return NULL; +} + +const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS }; + +void pl_vulkan_destroy(pl_vulkan *pl_vk) +{ + if (!*pl_vk) + return; + + struct vk_ctx *vk = PL_PRIV(*pl_vk); + if (vk->dev) { + if ((*pl_vk)->gpu) { + PL_DEBUG(vk, "Waiting for remaining commands..."); + pl_gpu_finish((*pl_vk)->gpu); + pl_assert(vk->cmds_pending.num == 0); + + pl_gpu_destroy((*pl_vk)->gpu); + } + vk_malloc_destroy(&vk->ma); + for (int i = 0; i < vk->pools.num; i++) + vk_cmdpool_destroy(vk->pools.elem[i]); + + if (!vk->imported) + vk->DestroyDevice(vk->dev, PL_VK_ALLOC); + } + + for (int i = 0; i < vk->queue_locks.num; i++) { + for (int n = 0; n < vk->queue_locks.elem[i].num; n++) + pl_mutex_destroy(&vk->queue_locks.elem[i].elem[n]); + } + + pl_vk_inst_destroy(&vk->internal_instance); + pl_mutex_destroy(&vk->lock); + pl_free_ptr((void **) pl_vk); +} + +static bool supports_surf(pl_log log, VkInstance inst, + PFN_vkGetInstanceProcAddr get_addr, + VkPhysicalDevice physd, VkSurfaceKHR surf) +{ + // Hack for the VK macro's logging to work + struct { pl_log log; } *vk = (void *) &log; + + PL_VK_LOAD_FUN(inst, GetPhysicalDeviceQueueFamilyProperties, get_addr); + PL_VK_LOAD_FUN(inst, GetPhysicalDeviceSurfaceSupportKHR, get_addr); + uint32_t qfnum = 0; + GetPhysicalDeviceQueueFamilyProperties(physd, &qfnum, NULL); + + for (int i = 0; i < qfnum; i++) { + VkBool32 sup = false; + VK(GetPhysicalDeviceSurfaceSupportKHR(physd, i, surf, &sup)); + if (sup) + return true; + } + +error: + return false; +} + +VkPhysicalDevice pl_vulkan_choose_device(pl_log log, + const struct 
pl_vulkan_device_params *params) +{ + // Hack for the VK macro's logging to work + struct { pl_log log; } *vk = (void *) &log; + PL_INFO(vk, "Probing for vulkan devices:"); + + pl_assert(params->instance); + VkInstance inst = params->instance; + VkPhysicalDevice dev = VK_NULL_HANDLE; + + PFN_vkGetInstanceProcAddr get_addr; + if (!(get_addr = get_proc_addr_fallback(log, params->get_proc_addr))) + return NULL; + + PL_VK_LOAD_FUN(inst, EnumeratePhysicalDevices, get_addr); + PL_VK_LOAD_FUN(inst, GetPhysicalDeviceProperties2, get_addr); + pl_assert(GetPhysicalDeviceProperties2); + + pl_clock_t start = pl_clock_now(); + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + VK(EnumeratePhysicalDevices(inst, &num, NULL)); + devices = pl_calloc_ptr(NULL, num, devices); + VK(EnumeratePhysicalDevices(inst, &num, devices)); + pl_log_cpu_time(log, start, pl_clock_now(), "enumerating physical devices"); + + static const struct { const char *name; int priority; } types[] = { + [VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU] = {"discrete", 5}, + [VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU] = {"integrated", 4}, + [VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU] = {"virtual", 3}, + [VK_PHYSICAL_DEVICE_TYPE_CPU] = {"software", 2}, + [VK_PHYSICAL_DEVICE_TYPE_OTHER] = {"other", 1}, + }; + + static const uint8_t nil[VK_UUID_SIZE] = {0}; + bool uuid_set = memcmp(params->device_uuid, nil, VK_UUID_SIZE) != 0; + + int best = -1; + for (int i = 0; i < num; i++) { + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + }; + + VkPhysicalDeviceProperties2 prop = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &id_props, + }; + + GetPhysicalDeviceProperties2(devices[i], &prop); + VkPhysicalDeviceType t = prop.properties.deviceType; + const char *dtype = t < PL_ARRAY_SIZE(types) ? types[t].name : "unknown?"; + PL_INFO(vk, " GPU %d: %s v%d.%d.%d (%s)", i, prop.properties.deviceName, + PRINTF_VER(prop.properties.apiVersion), dtype); + PL_INFO(vk, " uuid: %s", PRINT_UUID(id_props.deviceUUID)); + + if (params->surface) { + if (!supports_surf(log, inst, get_addr, devices[i], params->surface)) { + PL_DEBUG(vk, " -> excluding due to lack of surface support"); + continue; + } + } + + if (uuid_set) { + if (memcmp(id_props.deviceUUID, params->device_uuid, VK_UUID_SIZE) == 0) { + dev = devices[i]; + continue; + } else { + PL_DEBUG(vk, " -> excluding due to UUID mismatch"); + continue; + } + } else if (params->device_name && params->device_name[0] != '\0') { + if (strcmp(params->device_name, prop.properties.deviceName) == 0) { + dev = devices[i]; + continue; + } else { + PL_DEBUG(vk, " -> excluding due to name mismatch"); + continue; + } + } + + if (!params->allow_software && t == VK_PHYSICAL_DEVICE_TYPE_CPU) { + PL_DEBUG(vk, " -> excluding due to !params->allow_software"); + continue; + } + + if (prop.properties.apiVersion < PL_VK_MIN_VERSION) { + PL_DEBUG(vk, " -> excluding due to too low API version"); + continue; + } + + int priority = t < PL_ARRAY_SIZE(types) ? 
types[t].priority : 0;
+        if (priority > best) {
+            dev = devices[i];
+            best = priority;
+        }
+    }
+
+error:
+    pl_free(devices);
+    return dev;
+}
+
+static void lock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+    struct vk_ctx *vk = priv;
+    pl_mutex_lock(&vk->queue_locks.elem[qf].elem[qidx]);
+}
+
+static void unlock_queue_internal(void *priv, uint32_t qf, uint32_t qidx)
+{
+    struct vk_ctx *vk = priv;
+    pl_mutex_unlock(&vk->queue_locks.elem[qf].elem[qidx]);
+}
+
+static void init_queue_locks(struct vk_ctx *vk, uint32_t qfnum,
+                             const VkQueueFamilyProperties *qfs)
+{
+    vk->queue_locks.elem = pl_calloc_ptr(vk->alloc, qfnum, vk->queue_locks.elem);
+    vk->queue_locks.num = qfnum;
+    for (int i = 0; i < qfnum; i++) {
+        const uint32_t qnum = qfs[i].queueCount;
+        vk->queue_locks.elem[i].elem = pl_calloc(vk->alloc, qnum, sizeof(pl_mutex));
+        vk->queue_locks.elem[i].num = qnum;
+        for (int n = 0; n < qnum; n++)
+            pl_mutex_init(&vk->queue_locks.elem[i].elem[n]);
+    }
+
+    vk->lock_queue = lock_queue_internal;
+    vk->unlock_queue = unlock_queue_internal;
+    vk->queue_ctx = vk;
+}
+
+// Find the most specialized queue supporting a combination of flags. In cases
+// where there are multiple queue families at the same specialization level,
+// this finds the one with the most queues. Returns -1 if no queue was found.
+static int find_qf(VkQueueFamilyProperties *qfs, int qfnum, VkQueueFlags flags)
+{
+    int idx = -1;
+    for (int i = 0; i < qfnum; i++) {
+        if ((qfs[i].queueFlags & flags) != flags)
+            continue;
+
+        // QF is more specialized. Since we don't care about other bits like
+        // SPARSE_BIT, mask the ones we're interested in
+        const VkQueueFlags mask = VK_QUEUE_GRAPHICS_BIT |
+                                  VK_QUEUE_TRANSFER_BIT |
+                                  VK_QUEUE_COMPUTE_BIT;
+
+        if (idx < 0 || (qfs[i].queueFlags & mask) < (qfs[idx].queueFlags & mask))
+            idx = i;
+
+        // QF has more queues (at the same specialization level)
+        if (qfs[i].queueFlags == qfs[idx].queueFlags &&
+            qfs[i].queueCount > qfs[idx].queueCount)
+            idx = i;
+    }
+
+    return idx;
+}
+
+static bool device_init(struct vk_ctx *vk, const struct pl_vulkan_params *params)
+{
+    pl_assert(vk->physd);
+    void *tmp = pl_tmp(NULL);
+
+    // Enumerate the queue families and find suitable families for each task
+    uint32_t qfnum = 0;
+    vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL);
+    VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs);
+    vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs);
+    init_queue_locks(vk, qfnum, qfs);
+
+    PL_DEBUG(vk, "Queue families supported by device:");
+    for (int i = 0; i < qfnum; i++) {
+        PL_DEBUG(vk, " %d: flags 0x%"PRIx32" num %"PRIu32, i,
+                 qfs[i].queueFlags, qfs[i].queueCount);
+    }
+
+    VkQueueFlagBits gfx_flags = VK_QUEUE_GRAPHICS_BIT;
+    if (!params->async_compute)
+        gfx_flags |= VK_QUEUE_COMPUTE_BIT;
+
+    int idx_gfx = find_qf(qfs, qfnum, gfx_flags);
+    int idx_comp = find_qf(qfs, qfnum, VK_QUEUE_COMPUTE_BIT);
+    int idx_tf = find_qf(qfs, qfnum, VK_QUEUE_TRANSFER_BIT);
+    if (idx_tf < 0)
+        idx_tf = idx_comp;
+
+    if (!params->async_compute)
+        idx_comp = idx_gfx;
+    if (!params->async_transfer)
+        idx_tf = idx_gfx;
+
+    PL_DEBUG(vk, "Using graphics queue %d", idx_gfx);
+    if (idx_tf != idx_gfx)
+        PL_INFO(vk, "Using async transfer (queue %d)", idx_tf);
+    if (idx_comp != idx_gfx)
+        PL_INFO(vk, "Using async compute (queue %d)", idx_comp);
+
+    // Vulkan requires at least one GRAPHICS+COMPUTE queue, so if this fails
+    // something is horribly wrong.
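+    // As an illustration (hypothetical queue family layout, not tied to any
+    // particular driver): a device exposing QF 0 = GRAPHICS|COMPUTE|TRANSFER,
+    // QF 1 = TRANSFER and QF 2 = COMPUTE|TRANSFER resolves to idx_gfx = 0,
+    // idx_comp = 2 and idx_tf = 1 when both params->async_compute and
+    // params->async_transfer are set, since find_qf() prefers the most
+    // specialized family for each capability.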
+    pl_assert(idx_gfx >= 0 && idx_comp >= 0 && idx_tf >= 0);
+
+    // If needed, ensure we can actually present to the surface using this queue
+    if (params->surface) {
+        VkBool32 sup = false;
+        VK(vk->GetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx_gfx,
+                                                  params->surface, &sup));
+        if (!sup) {
+            PL_FATAL(vk, "Queue family does not support surface presentation!");
+            goto error;
+        }
+    }
+
+    // Enumerate all supported extensions
+    pl_clock_t start = pl_clock_now();
+    uint32_t num_exts_avail = 0;
+    VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, NULL));
+    VkExtensionProperties *exts_avail = pl_calloc_ptr(tmp, num_exts_avail, exts_avail);
+    VK(vk->EnumerateDeviceExtensionProperties(vk->physd, NULL, &num_exts_avail, exts_avail));
+    pl_log_cpu_time(vk->log, start, pl_clock_now(), "enumerating device extensions");
+
+    PL_DEBUG(vk, "Available device extensions:");
+    for (int i = 0; i < num_exts_avail; i++)
+        PL_DEBUG(vk, " %s", exts_avail[i].extensionName);
+
+    // Add all extensions we need
+    if (params->surface)
+        PL_ARRAY_APPEND(vk->alloc, vk->exts, VK_KHR_SWAPCHAIN_EXTENSION_NAME);
+
+    // Keep track of all optional function pointers associated with extensions
+    PL_ARRAY(const struct vk_fun *) ext_funs = {0};
+
+    // Add all optional device-level extensions
+    for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) {
+        const struct vk_ext *ext = &vk_device_extensions[i];
+        uint32_t core_ver = vk_ext_promoted_ver(ext->name);
+        if (core_ver && vk->api_ver >= core_ver) {
+            // Extension is already implicitly enabled by the API version
+            for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+                PL_ARRAY_APPEND(tmp, ext_funs, f);
+            continue;
+        }
+
+        for (int n = 0; n < num_exts_avail; n++) {
+            if (strcmp(ext->name, exts_avail[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(vk->alloc, vk->exts, ext->name);
+                for (const struct vk_fun *f = ext->funs; f && f->name; f++)
+                    PL_ARRAY_APPEND(tmp, ext_funs, f);
+                break;
+            }
+        }
+    }
+
+    // Add extra user extensions
+    for (int i = 0; i < params->num_extensions; i++)
+        PL_ARRAY_APPEND(vk->alloc, vk->exts, params->extensions[i]);
+
+    // Add optional extra user extensions
+    for (int i = 0; i < params->num_opt_extensions; i++) {
+        const char *ext = params->opt_extensions[i];
+        for (int n = 0; n < num_exts_avail; n++) {
+            if (strcmp(ext, exts_avail[n].extensionName) == 0) {
+                PL_ARRAY_APPEND(vk->alloc, vk->exts, ext);
+                break;
+            }
+        }
+    }
+
+    VkPhysicalDeviceFeatures2 features = {
+        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR
+    };
+
+    vk_features_normalize(tmp, &pl_vulkan_required_features, vk->api_ver, &features);
+    vk_features_normalize(tmp, &pl_vulkan_recommended_features, vk->api_ver, &features);
+    vk_features_normalize(tmp, params->features, vk->api_ver, &features);
+
+    // Explicitly clear the features struct before querying feature support
+    // from the driver. This way, we don't mistakenly mark as supported
+    // features coming from structs the driver doesn't have support for.
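+    // Note on the memset below: `&out[1]` points just past the sType/pNext
+    // header of each struct in the chain, so clearing
+    // `vk_struct_size(sType) - sizeof(VkBaseOutStructure)` bytes zeroes every
+    // VkBool32 feature flag while leaving the chain links intact.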
+    VkPhysicalDeviceFeatures2 *features_sup = vk_chain_memdup(tmp, &features);
+    for (VkBaseOutStructure *out = (void *) features_sup; out; out = out->pNext) {
+        const size_t size = vk_struct_size(out->sType);
+        memset(&out[1], 0, size - sizeof(out[0]));
+    }
+
+    vk->GetPhysicalDeviceFeatures2KHR(vk->physd, features_sup);
+
+    // Filter out unsupported features
+    for (VkBaseOutStructure *f = (VkBaseOutStructure *) &features; f; f = f->pNext) {
+        const VkBaseInStructure *sup = vk_find_struct(features_sup, f->sType);
+        VkBool32 *flags = (VkBool32 *) &f[1];
+        const VkBool32 *flags_sup = (const VkBool32 *) &sup[1];
+        const size_t size = vk_struct_size(f->sType) - sizeof(VkBaseOutStructure);
+        for (int i = 0; i < size / sizeof(VkBool32); i++)
+            flags[i] &= flags_sup[i];
+    }
+
+    // Construct normalized output chain
+    vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+    vk_features_normalize(vk->alloc, &features, 0, &vk->features);
+    if (!check_required_features(vk)) {
+        PL_FATAL(vk, "Vulkan device does not support all required features!");
+        goto error;
+    }
+
+    // Enable all queues at device creation time, to maximize compatibility
+    // with other API users (e.g. FFmpeg)
+    PL_ARRAY(VkDeviceQueueCreateInfo) qinfos = {0};
+    for (int i = 0; i < qfnum; i++) {
+        bool use_qf = i == idx_gfx || i == idx_comp || i == idx_tf;
+        use_qf |= qfs[i].queueFlags & params->extra_queues;
+        if (!use_qf)
+            continue;
+        PL_ARRAY_APPEND(tmp, qinfos, (VkDeviceQueueCreateInfo) {
+            .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+            .queueFamilyIndex = i,
+            .queueCount = qfs[i].queueCount,
+            .pQueuePriorities = pl_calloc(tmp, qfs[i].queueCount, sizeof(float)),
+        });
+    }
+
+    VkDeviceCreateInfo dinfo = {
+        .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
+        .pNext = &features,
+        .pQueueCreateInfos = qinfos.elem,
+        .queueCreateInfoCount = qinfos.num,
+        .ppEnabledExtensionNames = vk->exts.elem,
+        .enabledExtensionCount = vk->exts.num,
+    };
+
+    PL_INFO(vk, "Creating vulkan device%s", vk->exts.num ?
" with extensions:" : ""); + for (int i = 0; i < vk->exts.num; i++) + PL_INFO(vk, " %s", vk->exts.elem[i]); + + start = pl_clock_now(); + VK(vk->CreateDevice(vk->physd, &dinfo, PL_VK_ALLOC, &vk->dev)); + pl_log_cpu_time(vk->log, start, pl_clock_now(), "creating vulkan device"); + + // Load all mandatory device-level functions + for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++) + load_vk_fun(vk, &vk_dev_funs[i]); + + // Load all of the optional functions from the extensions we enabled + for (int i = 0; i < ext_funs.num; i++) + load_vk_fun(vk, ext_funs.elem[i]); + + // Create the command pools for the queues we care about + const uint32_t qmax = PL_DEF(params->queue_count, UINT32_MAX); + for (int i = 0; i < qfnum; i++) { + if (i != idx_gfx && i != idx_tf && i != idx_comp) + continue; // ignore QFs not used internally + + int qnum = qfs[i].queueCount; + if (qmax < qnum) { + PL_DEBUG(vk, "Restricting QF %d from %d queues to %d", i, qnum, qmax); + qnum = qmax; + } + + struct vk_cmdpool *pool = vk_cmdpool_create(vk, i, qnum, qfs[i]); + if (!pool) + goto error; + PL_ARRAY_APPEND(vk->alloc, vk->pools, pool); + + // Update the pool_* pointers based on the corresponding index + const char *qf_name = NULL; + if (i == idx_tf) { + vk->pool_transfer = pool; + qf_name = "transfer"; + } + if (i == idx_comp) { + vk->pool_compute = pool; + qf_name = "compute"; + } + if (i == idx_gfx) { + vk->pool_graphics = pool; + qf_name = "graphics"; + } + + for (int n = 0; n < pool->num_queues; n++) + PL_VK_NAME_HANDLE(QUEUE, pool->queues[n], qf_name); + } + + pl_free(tmp); + return true; + +error: + PL_FATAL(vk, "Failed creating logical device!"); + pl_free(tmp); + vk->failed = true; + return false; +} + +static void lock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx) +{ + struct vk_ctx *vk = PL_PRIV(pl_vk); + vk->lock_queue(vk->queue_ctx, qf, qidx); +} + +static void unlock_queue(pl_vulkan pl_vk, uint32_t qf, uint32_t qidx) +{ + struct vk_ctx *vk = PL_PRIV(pl_vk); + vk->unlock_queue(vk->queue_ctx, qf, qidx); +} + +static bool finalize_context(struct pl_vulkan_t *pl_vk, int max_glsl_version) +{ + struct vk_ctx *vk = PL_PRIV(pl_vk); + + pl_assert(vk->pool_graphics); + pl_assert(vk->pool_compute); + pl_assert(vk->pool_transfer); + + vk->ma = vk_malloc_create(vk); + if (!vk->ma) + return false; + + pl_vk->gpu = pl_gpu_create_vk(vk); + if (!pl_vk->gpu) + return false; + + // Blacklist / restrict features + if (max_glsl_version) { + struct pl_glsl_version *glsl = (struct pl_glsl_version *) &pl_vk->gpu->glsl; + glsl->version = PL_MIN(glsl->version, max_glsl_version); + glsl->version = PL_MAX(glsl->version, 140); // required for GL_KHR_vulkan_glsl + PL_INFO(vk, "Restricting GLSL version to %d... 
new version is %d", + max_glsl_version, glsl->version); + } + + // Expose the resulting vulkan objects + pl_vk->instance = vk->inst; + pl_vk->phys_device = vk->physd; + pl_vk->device = vk->dev; + pl_vk->get_proc_addr = vk->GetInstanceProcAddr; + pl_vk->api_version = vk->api_ver; + pl_vk->extensions = vk->exts.elem; + pl_vk->num_extensions = vk->exts.num; + pl_vk->features = &vk->features; + pl_vk->num_queues = vk->pools.num; + pl_vk->queues = pl_calloc_ptr(vk->alloc, vk->pools.num, pl_vk->queues); + pl_vk->lock_queue = lock_queue; + pl_vk->unlock_queue = unlock_queue; + + for (int i = 0; i < vk->pools.num; i++) { + struct pl_vulkan_queue *queues = (struct pl_vulkan_queue *) pl_vk->queues; + queues[i] = (struct pl_vulkan_queue) { + .index = vk->pools.elem[i]->qf, + .count = vk->pools.elem[i]->num_queues, + }; + + if (vk->pools.elem[i] == vk->pool_graphics) + pl_vk->queue_graphics = queues[i]; + if (vk->pools.elem[i] == vk->pool_compute) + pl_vk->queue_compute = queues[i]; + if (vk->pools.elem[i] == vk->pool_transfer) + pl_vk->queue_transfer = queues[i]; + } + + pl_assert(vk->lock_queue); + pl_assert(vk->unlock_queue); + return true; +} + +pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params) +{ + params = PL_DEF(params, &pl_vulkan_default_params); + struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx); + struct vk_ctx *vk = PL_PRIV(pl_vk); + *vk = (struct vk_ctx) { + .vulkan = pl_vk, + .alloc = pl_vk, + .log = log, + .inst = params->instance, + .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr), + }; + + pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE); + if (!vk->GetInstanceProcAddr) + goto error; + + if (!vk->inst) { + pl_assert(!params->surface); + pl_assert(!params->device); + PL_DEBUG(vk, "No VkInstance provided, creating one..."); + + // Mirror the instance params here to set `get_proc_addr` correctly + struct pl_vk_inst_params iparams; + iparams = *PL_DEF(params->instance_params, &pl_vk_inst_default_params); + iparams.get_proc_addr = params->get_proc_addr; + vk->internal_instance = pl_vk_inst_create(log, &iparams); + if (!vk->internal_instance) + goto error; + vk->inst = vk->internal_instance->instance; + } + + // Directly load all mandatory instance-level function pointers, since + // these will be required for all further device creation logic + for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++) + load_vk_fun(vk, &vk_inst_funs[i]); + + // Choose the physical device + if (params->device) { + PL_DEBUG(vk, "Using specified VkPhysicalDevice"); + vk->physd = params->device; + } else { + struct pl_vulkan_device_params dparams = { + .instance = vk->inst, + .get_proc_addr = params->get_proc_addr, + .surface = params->surface, + .device_name = params->device_name, + .allow_software = params->allow_software, + }; + memcpy(dparams.device_uuid, params->device_uuid, VK_UUID_SIZE); + + vk->physd = pl_vulkan_choose_device(log, &dparams); + if (!vk->physd) { + PL_FATAL(vk, "Found no suitable device, giving up."); + goto error; + } + } + + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + }; + + VkPhysicalDeviceProperties2KHR prop = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &id_props, + }; + + vk->GetPhysicalDeviceProperties2(vk->physd, &prop); + vk->props = prop.properties; + + PL_INFO(vk, "Vulkan device properties:"); + PL_INFO(vk, " Device Name: %s", prop.properties.deviceName); + PL_INFO(vk, " Device ID: %"PRIx32":%"PRIx32, 
prop.properties.vendorID, + prop.properties.deviceID); + PL_INFO(vk, " Device UUID: %s", PRINT_UUID(id_props.deviceUUID)); + PL_INFO(vk, " Driver version: %"PRIx32, prop.properties.driverVersion); + PL_INFO(vk, " API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion)); + + // Needed by device_init + vk->api_ver = prop.properties.apiVersion; + if (params->max_api_version) { + vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version); + PL_INFO(vk, "Restricting API version to %d.%d.%d... new version %d.%d.%d", + PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver)); + } + + if (vk->api_ver < PL_VK_MIN_VERSION) { + PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum " + "required version of %d.%d.%d, cannot proceed!", + PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION)); + goto error; + } + + // Finally, initialize the logical device and the rest of the vk_ctx + if (!device_init(vk, params)) + goto error; + + if (!finalize_context(pl_vk, params->max_glsl_version)) + goto error; + + return pl_vk; + +error: + PL_FATAL(vk, "Failed initializing vulkan device"); + pl_vulkan_destroy((pl_vulkan *) &pl_vk); + return NULL; +} + +pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params) +{ + void *tmp = pl_tmp(NULL); + + struct pl_vulkan_t *pl_vk = pl_zalloc_obj(NULL, pl_vk, struct vk_ctx); + struct vk_ctx *vk = PL_PRIV(pl_vk); + *vk = (struct vk_ctx) { + .vulkan = pl_vk, + .alloc = pl_vk, + .log = log, + .imported = true, + .inst = params->instance, + .physd = params->phys_device, + .dev = params->device, + .GetInstanceProcAddr = get_proc_addr_fallback(log, params->get_proc_addr), + .lock_queue = params->lock_queue, + .unlock_queue = params->unlock_queue, + .queue_ctx = params->queue_ctx, + }; + + pl_mutex_init_type(&vk->lock, PL_MUTEX_RECURSIVE); + if (!vk->GetInstanceProcAddr) + goto error; + + for (int i = 0; i < PL_ARRAY_SIZE(vk_inst_funs); i++) + load_vk_fun(vk, &vk_inst_funs[i]); + + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + }; + + VkPhysicalDeviceProperties2KHR prop = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &id_props, + }; + + pl_assert(vk->GetPhysicalDeviceProperties2); + vk->GetPhysicalDeviceProperties2(vk->physd, &prop); + vk->props = prop.properties; + + PL_INFO(vk, "Imported vulkan device properties:"); + PL_INFO(vk, " Device Name: %s", prop.properties.deviceName); + PL_INFO(vk, " Device ID: %"PRIx32":%"PRIx32, prop.properties.vendorID, + prop.properties.deviceID); + PL_INFO(vk, " Device UUID: %s", PRINT_UUID(id_props.deviceUUID)); + PL_INFO(vk, " Driver version: %"PRIx32, prop.properties.driverVersion); + PL_INFO(vk, " API version: %d.%d.%d", PRINTF_VER(prop.properties.apiVersion)); + + vk->api_ver = prop.properties.apiVersion; + if (params->max_api_version) { + vk->api_ver = PL_MIN(vk->api_ver, params->max_api_version); + PL_INFO(vk, "Restricting API version to %d.%d.%d... 
new version %d.%d.%d", + PRINTF_VER(params->max_api_version), PRINTF_VER(vk->api_ver)); + } + + if (vk->api_ver < PL_VK_MIN_VERSION) { + PL_FATAL(vk, "Device API version %d.%d.%d is lower than the minimum " + "required version of %d.%d.%d, cannot proceed!", + PRINTF_VER(vk->api_ver), PRINTF_VER(PL_VK_MIN_VERSION)); + goto error; + } + + vk->features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + vk_features_normalize(vk->alloc, params->features, 0, &vk->features); + if (!check_required_features(vk)) { + PL_FATAL(vk, "Imported Vulkan device was not created with all required " + "features!"); + goto error; + } + + // Load all mandatory device-level functions + for (int i = 0; i < PL_ARRAY_SIZE(vk_dev_funs); i++) + load_vk_fun(vk, &vk_dev_funs[i]); + + // Load all of the optional functions from the extensions enabled + for (int i = 0; i < PL_ARRAY_SIZE(vk_device_extensions); i++) { + const struct vk_ext *ext = &vk_device_extensions[i]; + uint32_t core_ver = vk_ext_promoted_ver(ext->name); + if (core_ver && vk->api_ver >= core_ver) { + for (const struct vk_fun *f = ext->funs; f && f->name; f++) + load_vk_fun(vk, f); + continue; + } + for (int n = 0; n < params->num_extensions; n++) { + if (strcmp(ext->name, params->extensions[n]) == 0) { + for (const struct vk_fun *f = ext->funs; f && f->name; f++) + load_vk_fun(vk, f); + break; + } + } + } + + uint32_t qfnum = 0; + vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL); + VkQueueFamilyProperties *qfs = pl_calloc_ptr(tmp, qfnum, qfs); + vk->GetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs); + if (!params->lock_queue) + init_queue_locks(vk, qfnum, qfs); + + // Create the command pools for each unique qf that exists + struct { + const struct pl_vulkan_queue *info; + struct vk_cmdpool **pool; + VkQueueFlagBits flags; // *any* of these flags provide the cap + } qinfos[] = { + { + .info = ¶ms->queue_graphics, + .pool = &vk->pool_graphics, + .flags = VK_QUEUE_GRAPHICS_BIT, + }, { + .info = ¶ms->queue_compute, + .pool = &vk->pool_compute, + .flags = VK_QUEUE_COMPUTE_BIT, + }, { + .info = ¶ms->queue_transfer, + .pool = &vk->pool_transfer, + .flags = VK_QUEUE_TRANSFER_BIT | + VK_QUEUE_GRAPHICS_BIT | + VK_QUEUE_COMPUTE_BIT, + } + }; + + for (int i = 0; i < PL_ARRAY_SIZE(qinfos); i++) { + int qf = qinfos[i].info->index; + struct vk_cmdpool **pool = qinfos[i].pool; + if (!qinfos[i].info->count) + continue; + + // API sanity check + pl_assert(qfs[qf].queueFlags & qinfos[i].flags); + + // See if we already created a pool for this queue family + for (int j = 0; j < i; j++) { + if (qinfos[j].info->count && qinfos[j].info->index == qf) { + *pool = *qinfos[j].pool; + goto next_qf; + } + } + + *pool = vk_cmdpool_create(vk, qf, qinfos[i].info->count, qfs[qf]); + if (!*pool) + goto error; + PL_ARRAY_APPEND(vk->alloc, vk->pools, *pool); + + // Pre-emptively set "lower priority" pools as well + for (int j = i+1; j < PL_ARRAY_SIZE(qinfos); j++) { + if (qfs[qf].queueFlags & qinfos[j].flags) + *qinfos[j].pool = *pool; + } + +next_qf: ; + } + + if (!vk->pool_graphics) { + PL_ERR(vk, "No valid queues provided?"); + goto error; + } + + if (!finalize_context(pl_vk, params->max_glsl_version)) + goto error; + + pl_free(tmp); + return pl_vk; + +error: + PL_FATAL(vk, "Failed importing vulkan device"); + pl_vulkan_destroy((pl_vulkan *) &pl_vk); + pl_free(tmp); + return NULL; +} diff --git a/src/vulkan/formats.c b/src/vulkan/formats.c new file mode 100644 index 0000000..f0eb0fb --- /dev/null +++ b/src/vulkan/formats.c @@ -0,0 +1,616 @@ +/* + * 
This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "formats.h" + +#define FMT(_name, num, size, ftype, bits, idx) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_##ftype, \ + .num_components = num, \ + .component_depth = bits, \ + .internal_size = size, \ + .opaque = false, \ + .texel_size = size, \ + .texel_align = size, \ + .host_bits = bits, \ + .sample_order = idx, \ + } + +#define IDX(...) {__VA_ARGS__} +#define BITS(...) {__VA_ARGS__} + +#define REGFMT(name, num, bits, type) \ + FMT(name, num, (num) * (bits) / 8, type, \ + BITS(bits, bits, bits, bits), \ + IDX(0, 1, 2, 3)) + +#define EMUFMT(_name, in, en, ib, eb, ftype) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_##ftype, \ + .num_components = en, \ + .component_depth = BITS(ib, ib, ib, ib),\ + .internal_size = (in) * (ib) / 8, \ + .opaque = false, \ + .emulated = true, \ + .texel_size = (en) * (eb) / 8, \ + .texel_align = (eb) / 8, \ + .host_bits = BITS(eb, eb, eb, eb),\ + .sample_order = IDX(0, 1, 2, 3), \ + } + +#define PACKED16FMT(_name, num, b) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_UNORM, \ + .num_components = num, \ + .component_depth = BITS(b, b, b, b), \ + .internal_size = (num) * 2, \ + .texel_size = (num) * 2, \ + .texel_align = (num) * 2, \ + .host_bits = BITS(16, 16, 16, 16),\ + .sample_order = IDX(0, 1, 2, 3), \ + } + +#define PLANARFMT(_name, planes, size, bits) \ + (struct pl_fmt_t) { \ + .name = _name, \ + .type = PL_FMT_UNORM, \ + .num_planes = planes, \ + .num_components = 3, \ + .component_depth = {bits, bits, bits}, \ + .internal_size = size, \ + .opaque = true, \ + } + +static const struct vk_format rgb8e = { + .tfmt = VK_FORMAT_R8G8B8A8_UNORM, + .bfmt = VK_FORMAT_R8G8B8_UNORM, + .icomps = 4, + .fmt = EMUFMT("rgb8", 4, 3, 8, 8, UNORM), +}; + +static const struct vk_format rgb16e = { + .tfmt = VK_FORMAT_R16G16B16A16_UNORM, + .bfmt = VK_FORMAT_R16G16B16_UNORM, + .icomps = 4, + .fmt = EMUFMT("rgb16", 4, 3, 16, 16, UNORM), +}; + +static const struct vk_format vk_formats[] = { + // Regular, byte-aligned integer formats + {VK_FORMAT_R8_UNORM, REGFMT("r8", 1, 8, UNORM)}, + {VK_FORMAT_R8G8_UNORM, REGFMT("rg8", 2, 8, UNORM)}, + {VK_FORMAT_R8G8B8_UNORM, REGFMT("rgb8", 3, 8, UNORM), .emufmt = &rgb8e}, + {VK_FORMAT_R8G8B8A8_UNORM, REGFMT("rgba8", 4, 8, UNORM)}, + {VK_FORMAT_R16_UNORM, REGFMT("r16", 1, 16, UNORM)}, + {VK_FORMAT_R16G16_UNORM, REGFMT("rg16", 2, 16, UNORM)}, + {VK_FORMAT_R16G16B16_UNORM, REGFMT("rgb16", 3, 16, UNORM), .emufmt = &rgb16e}, + {VK_FORMAT_R16G16B16A16_UNORM, REGFMT("rgba16", 4, 16, UNORM)}, + + {VK_FORMAT_R8_SNORM, REGFMT("r8s", 1, 8, SNORM)}, + {VK_FORMAT_R8G8_SNORM, REGFMT("rg8s", 2, 8, SNORM)}, + {VK_FORMAT_R8G8B8_SNORM, REGFMT("rgb8s", 3, 8, SNORM)}, + {VK_FORMAT_R8G8B8A8_SNORM, REGFMT("rgba8s", 4, 8, SNORM)}, + {VK_FORMAT_R16_SNORM, REGFMT("r16s", 1, 16, SNORM)}, + 
{VK_FORMAT_R16G16_SNORM, REGFMT("rg16s", 2, 16, SNORM)}, + {VK_FORMAT_R16G16B16_SNORM, REGFMT("rgb16s", 3, 16, SNORM)}, + {VK_FORMAT_R16G16B16A16_SNORM, REGFMT("rgba16s", 4, 16, SNORM)}, + + // Float formats (native formats: hf = half float, df = double float) + {VK_FORMAT_R16_SFLOAT, REGFMT("r16hf", 1, 16, FLOAT)}, + {VK_FORMAT_R16G16_SFLOAT, REGFMT("rg16hf", 2, 16, FLOAT)}, + {VK_FORMAT_R16G16B16_SFLOAT, REGFMT("rgb16hf", 3, 16, FLOAT)}, + {VK_FORMAT_R16G16B16A16_SFLOAT, REGFMT("rgba16hf", 4, 16, FLOAT)}, + {VK_FORMAT_R32_SFLOAT, REGFMT("r32f", 1, 32, FLOAT)}, + {VK_FORMAT_R32G32_SFLOAT, REGFMT("rg32f", 2, 32, FLOAT)}, + {VK_FORMAT_R32G32B32_SFLOAT, REGFMT("rgb32f", 3, 32, FLOAT)}, + {VK_FORMAT_R32G32B32A32_SFLOAT, REGFMT("rgba32f", 4, 32, FLOAT)}, + + // Float formats (emulated upload/download) + {VK_FORMAT_R16_SFLOAT, EMUFMT("r16f", 1, 1, 16, 32, FLOAT)}, + {VK_FORMAT_R16G16_SFLOAT, EMUFMT("rg16f", 2, 2, 16, 32, FLOAT)}, + {VK_FORMAT_R16G16B16_SFLOAT, EMUFMT("rgb16f", 3, 3, 16, 32, FLOAT)}, + {VK_FORMAT_R16G16B16A16_SFLOAT, EMUFMT("rgba16f", 4, 4, 16, 32, FLOAT)}, + + // Integer-sampled formats + {VK_FORMAT_R8_UINT, REGFMT("r8u", 1, 8, UINT)}, + {VK_FORMAT_R8G8_UINT, REGFMT("rg8u", 2, 8, UINT)}, + {VK_FORMAT_R8G8B8_UINT, REGFMT("rgb8u", 3, 8, UINT)}, + {VK_FORMAT_R8G8B8A8_UINT, REGFMT("rgba8u", 4, 8, UINT)}, + {VK_FORMAT_R16_UINT, REGFMT("r16u", 1, 16, UINT)}, + {VK_FORMAT_R16G16_UINT, REGFMT("rg16u", 2, 16, UINT)}, + {VK_FORMAT_R16G16B16_UINT, REGFMT("rgb16u", 3, 16, UINT)}, + {VK_FORMAT_R16G16B16A16_UINT, REGFMT("rgba16u", 4, 16, UINT)}, + {VK_FORMAT_R32_UINT, REGFMT("r32u", 1, 32, UINT)}, + {VK_FORMAT_R32G32_UINT, REGFMT("rg32u", 2, 32, UINT)}, + {VK_FORMAT_R32G32B32_UINT, REGFMT("rgb32u", 3, 32, UINT)}, + {VK_FORMAT_R32G32B32A32_UINT, REGFMT("rgba32u", 4, 32, UINT)}, + + {VK_FORMAT_R8_SINT, REGFMT("r8i", 1, 8, SINT)}, + {VK_FORMAT_R8G8_SINT, REGFMT("rg8i", 2, 8, SINT)}, + {VK_FORMAT_R8G8B8_SINT, REGFMT("rgb8i", 3, 8, SINT)}, + {VK_FORMAT_R8G8B8A8_SINT, REGFMT("rgba8i", 4, 8, SINT)}, + {VK_FORMAT_R16_SINT, REGFMT("r16i", 1, 16, SINT)}, + {VK_FORMAT_R16G16_SINT, REGFMT("rg16i", 2, 16, SINT)}, + {VK_FORMAT_R16G16B16_SINT, REGFMT("rgb16i", 3, 16, SINT)}, + {VK_FORMAT_R16G16B16A16_SINT, REGFMT("rgba16i", 4, 16, SINT)}, + {VK_FORMAT_R32_SINT, REGFMT("r32i", 1, 32, SINT)}, + {VK_FORMAT_R32G32_SINT, REGFMT("rg32i", 2, 32, SINT)}, + {VK_FORMAT_R32G32B32_SINT, REGFMT("rgb32i", 3, 32, SINT)}, + {VK_FORMAT_R32G32B32A32_SINT, REGFMT("rgba32i", 4, 32, SINT)}, + + // "Swapped" component order formats + {VK_FORMAT_B8G8R8_UNORM, FMT("bgr8", 3, 3, UNORM, BITS(8, 8, 8), IDX(2, 1, 0))}, + {VK_FORMAT_B8G8R8A8_UNORM, FMT("bgra8", 4, 4, UNORM, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))}, + + {VK_FORMAT_B8G8R8_UINT, FMT("bgr8u", 3, 3, UINT, BITS(8, 8, 8), IDX(2, 1, 0))}, + {VK_FORMAT_B8G8R8A8_UINT, FMT("bgra8u", 4, 4, UINT, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))}, + + {VK_FORMAT_B8G8R8_SINT, FMT("bgr8i", 3, 3, SINT, BITS(8, 8, 8), IDX(2, 1, 0))}, + {VK_FORMAT_B8G8R8A8_SINT, FMT("bgra8i", 4, 4, SINT, BITS(8, 8, 8, 8), IDX(2, 1, 0, 3))}, + + // "Packed" integer formats + // + // Note: These have the component order reversed from what the vulkan name + // implies, because we order our IDX from LSB to MSB (consistent with the + // usual ordering from lowest byte to highest byte, on little endian + // platforms), but Vulkan names them from MSB to LSB. 
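+    // For instance, VK_FORMAT_A2B10G10R10_UNORM_PACK32 stores A in the two
+    // most significant bits, so reading components from the LSB upwards
+    // yields R, G, B, A; hence the corresponding entry below is named
+    // "rgb10a2" with IDX(0, 1, 2, 3).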
+ {VK_FORMAT_R4G4_UNORM_PACK8, FMT("gr4", 2, 1, UNORM, BITS(4, 4), IDX(1, 0))}, + {VK_FORMAT_B4G4R4A4_UNORM_PACK16, FMT("argb4", 4, 2, UNORM, BITS(4, 4, 4, 4), IDX(3, 0, 1, 2))}, + {VK_FORMAT_R4G4B4A4_UNORM_PACK16, FMT("abgr4", 4, 2, UNORM, BITS(4, 4, 4, 4), IDX(3, 2, 1, 0))}, + + {VK_FORMAT_R5G6B5_UNORM_PACK16, FMT("bgr565", 3, 2, UNORM, BITS(5, 6, 5), IDX(2, 1, 0))}, + {VK_FORMAT_B5G6R5_UNORM_PACK16, FMT("rgb565", 3, 2, UNORM, BITS(5, 6, 5), IDX(0, 1, 2))}, + + {VK_FORMAT_R5G5B5A1_UNORM_PACK16, FMT("a1bgr5", 4, 2, UNORM, BITS(1, 5, 5, 5), IDX(3, 2, 1, 0))}, + {VK_FORMAT_B5G5R5A1_UNORM_PACK16, FMT("a1rgb5", 4, 2, UNORM, BITS(1, 5, 5, 5), IDX(3, 0, 1, 2))}, + {VK_FORMAT_A1R5G5B5_UNORM_PACK16, FMT("bgr5a1", 4, 2, UNORM, BITS(5, 5, 5, 1), IDX(2, 1, 0, 3))}, + + {VK_FORMAT_A2B10G10R10_UNORM_PACK32, FMT("rgb10a2", 4, 4, UNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_UNORM_PACK32, FMT("bgr10a2", 4, 4, UNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + {VK_FORMAT_A2B10G10R10_SNORM_PACK32, FMT("rgb10a2s", 4, 4, SNORM, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_SNORM_PACK32, FMT("bgr10a2s", 4, 4, SNORM, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + {VK_FORMAT_A2B10G10R10_UINT_PACK32, FMT("rgb10a2u", 4, 4, UINT, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_UINT_PACK32, FMT("bgr10a2u", 4, 4, UINT, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + {VK_FORMAT_A2B10G10R10_SINT_PACK32, FMT("rgb10a2i", 4, 4, SINT, BITS(10, 10, 10, 2), IDX(0, 1, 2, 3))}, + {VK_FORMAT_A2R10G10B10_SINT_PACK32, FMT("bgr10a2i", 4, 4, SINT, BITS(10, 10, 10, 2), IDX(2, 1, 0, 3))}, + + + // Packed 16 bit formats + {VK_FORMAT_R10X6_UNORM_PACK16, PACKED16FMT("rx10", 1, 10)}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, PACKED16FMT("rxgx10", 2, 10)}, + {VK_FORMAT_R12X4_UNORM_PACK16, PACKED16FMT("rx12", 1, 12)}, + {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, PACKED16FMT("rxgx12", 2, 12)}, + + // FIXME: enabling these requires VK_EXT_rgba10x6_formats or equivalent + // {VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16, PACKED16FMT("rxgxbxax10", 4, 10)}, + // {VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16, PACKED16FMT("rxgxbxax12", 4, 12)}, + + // Planar formats + {VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, PLANARFMT("g8_b8_r8_420", 3, 12, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1}, + {VK_FORMAT_R8_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, PLANARFMT("g8_b8_r8_422", 3, 16, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM, .sx = 1}, + {VK_FORMAT_R8_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, PLANARFMT("g8_b8_r8_444", 3, 24, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8_UNORM}, + }, + }, + + {VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, PLANARFMT("g16_b16_r16_420", 3, 24, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1}, + {VK_FORMAT_R16_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, PLANARFMT("g16_b16_r16_422", 3, 32, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM, .sx = 1}, + {VK_FORMAT_R16_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, PLANARFMT("g16_b16_r16_444", 3, 48, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16_UNORM}, + }, + }, + + {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_420", 3, 24, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 
1}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_422", 3, 32, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1}, + {VK_FORMAT_R10X6_UNORM_PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bx10_rx10_444", 3, 48, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6_UNORM_PACK16}, + }, + }, + + {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_420", 3, 24, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_422", 3, 32, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1}, + {VK_FORMAT_R12X4_UNORM_PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bx12_rx12_444", 3, 48, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4_UNORM_PACK16}, + }, + }, + + {VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, PLANARFMT("g8_br8_420", 2, 12, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8G8_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, PLANARFMT("g8_br8_422", 2, 16, 8), + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8G8_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G8_B8R8_2PLANE_444_UNORM, PLANARFMT("g8_br8_444", 2, 24, 8), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R8_UNORM}, + {VK_FORMAT_R8G8_UNORM}, + }, + }, + + {VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, PLANARFMT("g16_br16_420", 2, 24, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16G16_UNORM, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, PLANARFMT("g16_br16_422", 2, 32, 16), + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16G16_UNORM, .sx = 1}, + }, + }, + {VK_FORMAT_G16_B16R16_2PLANE_444_UNORM, PLANARFMT("g16_br16_444", 2, 48, 16), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R16_UNORM}, + {VK_FORMAT_R16G16_UNORM}, + }, + }, + + {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_420", 2, 24, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_422", 2, 32, 10), + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx10_bxrx10_444", 2, 48, 10), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R10X6_UNORM_PACK16}, + {VK_FORMAT_R10X6G10X6_UNORM_2PACK16}, + }, + }, + + {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_420", 2, 24, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1, .sy = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_422", 2, 32, 12), + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, .sx = 1}, + }, + }, + {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_444_UNORM_3PACK16, PLANARFMT("gx12_bxrx12_444", 2, 48, 12), + .min_ver = VK_API_VERSION_1_3, + .pfmt = { + {VK_FORMAT_R12X4_UNORM_PACK16}, + 
{VK_FORMAT_R12X4G12X4_UNORM_2PACK16}, + }, + }, + + {0} +}; + +#undef BITS +#undef IDX +#undef REGFMT +#undef FMT + +void vk_setup_formats(struct pl_gpu_t *gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + PL_ARRAY(pl_fmt) formats = {0}; + + // Texture format emulation requires at least support for texel buffers + bool has_emu = gpu->glsl.compute && gpu->limits.max_buffer_texels; + + for (const struct vk_format *pvk_fmt = vk_formats; pvk_fmt->tfmt; pvk_fmt++) { + const struct vk_format *vk_fmt = pvk_fmt; + + // Skip formats that require a too new version of Vulkan + if (vk_fmt->min_ver > vk->api_ver) + continue; + + // Skip formats with innately emulated representation if unsupported + if (vk_fmt->fmt.emulated && !has_emu) + continue; + + // Suppress some errors/warnings spit out by the format probing code + pl_log_level_cap(vk->log, PL_LOG_INFO); + + bool has_drm_mods = vk->GetImageDrmFormatModifierPropertiesEXT; + VkDrmFormatModifierPropertiesEXT modifiers[16] = {0}; + VkDrmFormatModifierPropertiesListEXT drm_props = { + .sType = VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT, + .drmFormatModifierCount = PL_ARRAY_SIZE(modifiers), + .pDrmFormatModifierProperties = modifiers, + }; + + VkFormatProperties2KHR prop2 = { + .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2, + .pNext = has_drm_mods ? &drm_props : NULL, + }; + + vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2); + + // If wholly unsupported, try falling back to the emulation formats + // for texture operations + VkFormatProperties *prop = &prop2.formatProperties; + while (has_emu && !prop->optimalTilingFeatures && vk_fmt->emufmt) { + vk_fmt = vk_fmt->emufmt; + vk->GetPhysicalDeviceFormatProperties2KHR(vk->physd, vk_fmt->tfmt, &prop2); + } + + VkFormatFeatureFlags texflags = prop->optimalTilingFeatures; + VkFormatFeatureFlags bufflags = prop->bufferFeatures; + if (vk_fmt->fmt.emulated) { + // Emulated formats might have a different buffer representation + // than their texture representation. If they don't, assume their + // buffer representation is nonsensical (e.g. r16f) + if (vk_fmt->bfmt) { + vk->GetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->bfmt, prop); + bufflags = prop->bufferFeatures; + } else { + bufflags = 0; + } + } else if (vk_fmt->fmt.num_planes) { + // Planar textures cannot be used directly + texflags = bufflags = 0; + } + + pl_log_level_cap(vk->log, PL_LOG_NONE); + + struct pl_fmt_t *fmt = pl_alloc_obj(gpu, fmt, struct pl_fmt_vk); + struct pl_fmt_vk *fmtp = PL_PRIV(fmt); + *fmt = vk_fmt->fmt; + *fmtp = (struct pl_fmt_vk) { + .vk_fmt = vk_fmt + }; + + // Always set the signature to the actual texture format, so we can use + // it to guarantee renderpass compatibility. 
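+        // (For example, when the emulated "rgb8" fallback is in use, it is
+        // backed by VK_FORMAT_R8G8B8A8_UNORM and therefore ends up with the
+        // same signature as the native "rgba8" format.)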
+ fmt->signature = (uint64_t) vk_fmt->tfmt; + + // For sanity, clear the superfluous fields + for (int i = fmt->num_components; i < 4; i++) { + fmt->component_depth[i] = 0; + fmt->sample_order[i] = 0; + fmt->host_bits[i] = 0; + } + + // We can set this universally + fmt->fourcc = pl_fmt_fourcc(fmt); + + if (has_drm_mods) { + + if (drm_props.drmFormatModifierCount == PL_ARRAY_SIZE(modifiers)) { + PL_WARN(gpu, "DRM modifier list for format %s possibly truncated", + fmt->name); + } + + // Query the list of supported DRM modifiers from the driver + PL_ARRAY(uint64_t) modlist = {0}; + for (int i = 0; i < drm_props.drmFormatModifierCount; i++) { + if (modifiers[i].drmFormatModifierPlaneCount > 1) { + PL_TRACE(gpu, "Ignoring format modifier %s of " + "format %s because its plane count %d > 1", + PRINT_DRM_MOD(modifiers[i].drmFormatModifier), + fmt->name, modifiers[i].drmFormatModifierPlaneCount); + continue; + } + + // Only warn about texture format features relevant to us + const VkFormatFeatureFlags flag_mask = + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT | + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT | + VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | + VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT | + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_BLIT_SRC_BIT | + VK_FORMAT_FEATURE_BLIT_DST_BIT; + + + VkFormatFeatureFlags flags = modifiers[i].drmFormatModifierTilingFeatures; + if ((flags & flag_mask) != (texflags & flag_mask)) { + PL_DEBUG(gpu, "DRM format modifier %s of format %s " + "supports fewer caps (0x%"PRIx32") than optimal tiling " + "(0x%"PRIx32"), may result in limited capability!", + PRINT_DRM_MOD(modifiers[i].drmFormatModifier), + fmt->name, flags, texflags); + } + + PL_ARRAY_APPEND(fmt, modlist, modifiers[i].drmFormatModifier); + } + + fmt->num_modifiers = modlist.num; + fmt->modifiers = modlist.elem; + + } else if (gpu->export_caps.tex & PL_HANDLE_DMA_BUF) { + + // Hard-code a list of static mods that we're likely to support + static const uint64_t static_mods[2] = { + DRM_FORMAT_MOD_INVALID, + DRM_FORMAT_MOD_LINEAR, + }; + + fmt->num_modifiers = PL_ARRAY_SIZE(static_mods); + fmt->modifiers = static_mods; + + } + + struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bufbits[] = { + {VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT, PL_FMT_CAP_VERTEX}, + {VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_UNIFORM}, + {VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT, PL_FMT_CAP_TEXEL_STORAGE}, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(bufbits); i++) { + if ((bufflags & bufbits[i].flags) == bufbits[i].flags) + fmt->caps |= bufbits[i].caps; + } + + if (fmt->caps) { + fmt->glsl_type = pl_var_glsl_type_name(pl_var_from_fmt(fmt, "")); + pl_assert(fmt->glsl_type); + } + + struct { VkFormatFeatureFlags flags; enum pl_fmt_caps caps; } bits[] = { + {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT, PL_FMT_CAP_BLENDABLE}, + {VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, PL_FMT_CAP_LINEAR}, + {VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT, PL_FMT_CAP_SAMPLEABLE}, + {VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT, PL_FMT_CAP_STORABLE}, + {VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT, PL_FMT_CAP_RENDERABLE}, + + // We don't distinguish between the two blit modes for pl_fmt_caps + {VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT, + PL_FMT_CAP_BLITTABLE}, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(bits); i++) { + if ((texflags & bits[i].flags) == bits[i].flags) + fmt->caps |= bits[i].caps; + } + + // For blit emulation via compute shaders + if (!(fmt->caps & PL_FMT_CAP_BLITTABLE) && (fmt->caps & 
PL_FMT_CAP_STORABLE)) { + fmt->caps |= PL_FMT_CAP_BLITTABLE; + fmtp->blit_emulated = true; + } + + // This is technically supported for all textures, but the semantics + // of pl_gpu require it only be listed for non-opaque ones + if (!fmt->opaque) + fmt->caps |= PL_FMT_CAP_HOST_READABLE; + + // Vulkan requires a minimum GLSL version that supports textureGather() + if (fmt->caps & PL_FMT_CAP_SAMPLEABLE) + fmt->gatherable = true; + + // Disable implied capabilities where the dependencies are unavailable + enum pl_fmt_caps storable = PL_FMT_CAP_STORABLE | PL_FMT_CAP_TEXEL_STORAGE; + if (!(fmt->caps & PL_FMT_CAP_SAMPLEABLE)) + fmt->caps &= ~PL_FMT_CAP_LINEAR; + if (!gpu->glsl.compute) + fmt->caps &= ~storable; + + bool has_nofmt = vk->features.features.shaderStorageImageReadWithoutFormat && + vk->features.features.shaderStorageImageWriteWithoutFormat; + + if (fmt->caps & storable) { + int real_comps = PL_DEF(vk_fmt->icomps, fmt->num_components); + fmt->glsl_format = pl_fmt_glsl_format(fmt, real_comps); + if (!fmt->glsl_format && !has_nofmt) { + PL_DEBUG(gpu, "Storable format '%s' has no matching GLSL " + "format qualifier but read/write without format " + "is not supported.. disabling", fmt->name); + fmt->caps &= ~storable; + } + } + + if (fmt->caps & storable) + fmt->caps |= PL_FMT_CAP_READWRITE; + + // Pick sub-plane formats for planar formats + for (int n = 0; n < fmt->num_planes; n++) { + for (int i = 0; i < formats.num; i++) { + if (formats.elem[i]->signature == vk_fmt->pfmt[n].fmt) { + fmt->planes[n].format = formats.elem[i]; + fmt->planes[n].shift_x = vk_fmt->pfmt[n].sx; + fmt->planes[n].shift_y = vk_fmt->pfmt[n].sy; + break; + } + } + + pl_assert(fmt->planes[n].format); + } + + PL_ARRAY_APPEND(gpu, formats, fmt); + } + + gpu->formats = formats.elem; + gpu->num_formats = formats.num; +} diff --git a/src/vulkan/formats.h b/src/vulkan/formats.h new file mode 100644 index 0000000..b1408fd --- /dev/null +++ b/src/vulkan/formats.h @@ -0,0 +1,34 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#pragma once + +#include "common.h" +#include "gpu.h" + +struct vk_format { + VkFormat tfmt; // internal vulkan format enum (textures) + struct pl_fmt_t fmt;// pl_fmt template (features will be auto-detected) + int icomps; // internal component count (or 0 to infer from `fmt`) + VkFormat bfmt; // vulkan format for use as buffers (or 0 to use `tfmt`) + const struct vk_format *emufmt; // alternate format for emulation + uint32_t min_ver; // minimum vulkan API version for this format to exist + struct { VkFormat fmt; int sx, sy; } pfmt[4]; // plane formats (for planar textures) +}; + +// Add all supported formats to the `pl_gpu` format list +void vk_setup_formats(struct pl_gpu_t *gpu); diff --git a/src/vulkan/gpu.c b/src/vulkan/gpu.c new file mode 100644 index 0000000..69aca67 --- /dev/null +++ b/src/vulkan/gpu.c @@ -0,0 +1,924 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "formats.h" +#include "glsl/spirv.h" + +#ifdef PL_HAVE_UNIX +#include <unistd.h> +#endif + +// Gives us enough queries for 8 results +#define QUERY_POOL_SIZE 16 + +struct pl_timer_t { + VkQueryPool qpool; // even=start, odd=stop + int index_write; // next index to write to + int index_read; // next index to read from + uint64_t pending; // bitmask of queries that are still running +}; + +static inline uint64_t timer_bit(int index) +{ + return 1llu << (index / 2); +} + +static void timer_destroy_cb(pl_gpu gpu, pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_assert(!timer->pending); + vk->DestroyQueryPool(vk->dev, timer->qpool, PL_VK_ALLOC); + pl_free(timer); +} + +static pl_timer vk_timer_create(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_timer timer = pl_alloc_ptr(NULL, timer); + *timer = (struct pl_timer_t) {0}; + + struct VkQueryPoolCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = QUERY_POOL_SIZE, + }; + + VK(vk->CreateQueryPool(vk->dev, &qinfo, PL_VK_ALLOC, &timer->qpool)); + return timer; + +error: + timer_destroy_cb(gpu, timer); + return NULL; +} + +static void vk_timer_destroy(pl_gpu gpu, pl_timer timer) +{ + vk_gpu_idle_callback(gpu, (vk_cb) timer_destroy_cb, gpu, timer); +} + +static uint64_t vk_timer_query(pl_gpu gpu, pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + if (timer->index_read == timer->index_write) + return 0; // no more unprocessed results + + vk_poll_commands(vk, 0); + if (timer->pending & timer_bit(timer->index_read)) + return 0; // still waiting for results + + VkResult res; + uint64_t ts[2] = {0}; + res = vk->GetQueryPoolResults(vk->dev, timer->qpool, timer->index_read, 2, + sizeof(ts), &ts[0], sizeof(uint64_t), + VK_QUERY_RESULT_64_BIT); + + switch (res) { + case VK_SUCCESS: + timer->index_read = 
(timer->index_read + 2) % QUERY_POOL_SIZE; + return (ts[1] - ts[0]) * vk->props.limits.timestampPeriod; + case VK_NOT_READY: + return 0; + default: + PL_VK_ASSERT(res, "Retrieving query pool results"); + } + +error: + return 0; +} + +static void timer_begin(pl_gpu gpu, struct vk_cmd *cmd, pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + if (!timer) + return; + + if (!cmd->pool->props.timestampValidBits) { + PL_TRACE(gpu, "QF %d does not support timestamp queries", cmd->pool->qf); + return; + } + + vk_poll_commands(vk, 0); + if (timer->pending & timer_bit(timer->index_write)) + return; // next query is still running, skip this timer + + VkQueueFlags reset_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT; + if (cmd->pool->props.queueFlags & reset_flags) { + // Use direct command buffer resets + vk->CmdResetQueryPool(cmd->buf, timer->qpool, timer->index_write, 2); + } else { + // Use host query reset + vk->ResetQueryPool(vk->dev, timer->qpool, timer->index_write, 2); + } + + vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + timer->qpool, timer->index_write); + + p->cmd_timer = timer; +} + +static inline bool supports_marks(struct vk_cmd *cmd) { + // Spec says debug markers are only available on graphics/compute queues + VkQueueFlags flags = cmd->pool->props.queueFlags; + return flags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT); +} + +struct vk_cmd *_begin_cmd(pl_gpu gpu, enum queue_type type, const char *label, + pl_timer timer) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_mutex_lock(&p->recording); + + struct vk_cmdpool *pool; + switch (type) { + case ANY: pool = p->cmd ? p->cmd->pool : vk->pool_graphics; break; + case GRAPHICS: pool = vk->pool_graphics; break; + case COMPUTE: pool = vk->pool_compute; break; + case TRANSFER: pool = vk->pool_transfer; break; + default: pl_unreachable(); + } + + if (!p->cmd || p->cmd->pool != pool) { + vk_cmd_submit(&p->cmd); + p->cmd = vk_cmd_begin(pool, label); + if (!p->cmd) { + pl_mutex_unlock(&p->recording); + return NULL; + } + } + + if (vk->CmdBeginDebugUtilsLabelEXT && supports_marks(p->cmd)) { + vk->CmdBeginDebugUtilsLabelEXT(p->cmd->buf, &(VkDebugUtilsLabelEXT) { + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, + .pLabelName = label, + }); + } + + timer_begin(gpu, p->cmd, timer); + return p->cmd; +} + +static void timer_end_cb(void *ptimer, void *pindex) +{ + pl_timer timer = ptimer; + int index = (uintptr_t) pindex; + timer->pending &= ~timer_bit(index); +} + +bool _end_cmd(pl_gpu gpu, struct vk_cmd **pcmd, bool submit) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + bool ret = true; + if (!pcmd) { + if (submit) { + pl_mutex_lock(&p->recording); + ret = vk_cmd_submit(&p->cmd); + pl_mutex_unlock(&p->recording); + } + return ret; + } + + struct vk_cmd *cmd = *pcmd; + pl_assert(p->cmd == cmd); + + if (p->cmd_timer) { + pl_timer timer = p->cmd_timer; + vk->CmdWriteTimestamp(cmd->buf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + timer->qpool, timer->index_write + 1); + + timer->pending |= timer_bit(timer->index_write); + vk_cmd_callback(cmd, (vk_cb) timer_end_cb, timer, + (void *) (uintptr_t) timer->index_write); + + timer->index_write = (timer->index_write + 2) % QUERY_POOL_SIZE; + if (timer->index_write == timer->index_read) { + // forcibly drop the least recent result to make space + timer->index_read = (timer->index_read + 2) % QUERY_POOL_SIZE; + } + + p->cmd_timer = NULL; + } + + if (vk->CmdEndDebugUtilsLabelEXT && supports_marks(cmd)) + 
vk->CmdEndDebugUtilsLabelEXT(cmd->buf); + + if (submit) + ret = vk_cmd_submit(&p->cmd); + + pl_mutex_unlock(&p->recording); + return ret; +} + +void vk_gpu_idle_callback(pl_gpu gpu, vk_cb cb, const void *priv, const void *arg) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_mutex_lock(&p->recording); + if (p->cmd) { + vk_cmd_callback(p->cmd, cb, priv, arg); + } else { + vk_dev_callback(vk, cb, priv, arg); + } + pl_mutex_unlock(&p->recording); +} + +static void vk_gpu_destroy(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + vk_cmd_submit(&p->cmd); + vk_wait_idle(vk); + + for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) { + for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++) + vk->DestroySampler(vk->dev, p->samplers[s][a], PL_VK_ALLOC); + } + + pl_spirv_destroy(&p->spirv); + pl_mutex_destroy(&p->recording); + pl_free((void *) gpu); +} + +pl_vulkan pl_vulkan_get(pl_gpu gpu) +{ + const struct pl_gpu_fns *impl = PL_PRIV(gpu); + if (impl->destroy == vk_gpu_destroy) { + struct pl_vk *p = (struct pl_vk *) impl; + return p->vk->vulkan; + } + + return NULL; +} + +static pl_handle_caps vk_sync_handle_caps(struct vk_ctx *vk) +{ + pl_handle_caps caps = 0; + + for (int i = 0; vk_sync_handle_list[i]; i++) { + enum pl_handle_type type = vk_sync_handle_list[i]; + + VkPhysicalDeviceExternalSemaphoreInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR, + .handleType = vk_sync_handle_type(type), + }; + + VkExternalSemaphoreProperties props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR, + }; + + vk->GetPhysicalDeviceExternalSemaphoreProperties(vk->physd, &info, &props); + VkExternalSemaphoreFeatureFlags flags = props.externalSemaphoreFeatures; + if ((props.compatibleHandleTypes & info.handleType) && + (flags & VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR)) + { + caps |= type; + } + } + + return caps; +} + +static pl_handle_caps vk_tex_handle_caps(struct vk_ctx *vk, bool import) +{ + pl_handle_caps caps = 0; + + for (int i = 0; vk_mem_handle_list[i]; i++) { + enum pl_handle_type handle_type = vk_mem_handle_list[i]; + if (handle_type == PL_HANDLE_DMA_BUF && !vk->GetImageDrmFormatModifierPropertiesEXT) { + PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: no DRM modifiers", + vk_handle_name(vk_mem_handle_type(PL_HANDLE_DMA_BUF)), + (unsigned int) PL_HANDLE_DMA_BUF); + continue; + } + + // Query whether creation of a "basic" dummy texture would work + VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT, + .drmFormatModifier = DRM_FORMAT_MOD_LINEAR, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + + VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR, + .handleType = vk_mem_handle_type(handle_type), + }; + + VkPhysicalDeviceImageFormatInfo2KHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR, + .pNext = &ext_pinfo, + .format = VK_FORMAT_R8_UNORM, + .type = VK_IMAGE_TYPE_2D, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, + }; + + if (handle_type == PL_HANDLE_DMA_BUF) { + vk_link_struct(&pinfo, &drm_pinfo); + pinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; + } + + VkExternalImageFormatPropertiesKHR ext_props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR, + }; + + VkImageFormatProperties2KHR props = 
{ + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR, + .pNext = &ext_props, + }; + + VkResult res; + res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props); + if (res != VK_SUCCESS) { + PL_DEBUG(vk, "Tex caps for %s (0x%x) unsupported: %s", + vk_handle_name(ext_pinfo.handleType), + (unsigned int) handle_type, + vk_res_str(res)); + continue; + } + + if (vk_external_mem_check(vk, &ext_props.externalMemoryProperties, + handle_type, import)) + { + caps |= handle_type; + } + } + +#ifdef VK_EXT_metal_objects + if (vk->ExportMetalObjectsEXT && import) + caps |= PL_HANDLE_MTL_TEX | PL_HANDLE_IOSURFACE; +#endif + + return caps; +} + +static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = { + [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST, + [PL_TEX_SAMPLE_LINEAR] = VK_FILTER_LINEAR, +}; + +static inline struct pl_spirv_version get_spirv_version(const struct vk_ctx *vk) +{ + if (vk->api_ver >= VK_API_VERSION_1_3) { + const VkPhysicalDeviceMaintenance4Features *device_maintenance4; + device_maintenance4 = vk_find_struct(&vk->features, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES); + + if (device_maintenance4 && device_maintenance4->maintenance4) { + return (struct pl_spirv_version) { + .env_version = VK_API_VERSION_1_3, + .spv_version = PL_SPV_VERSION(1, 6), + }; + } + } + + pl_assert(vk->api_ver >= VK_API_VERSION_1_2); + return (struct pl_spirv_version) { + .env_version = VK_API_VERSION_1_2, + .spv_version = PL_SPV_VERSION(1, 5), + }; +} + +static const struct pl_gpu_fns pl_fns_vk; + +pl_gpu pl_gpu_create_vk(struct vk_ctx *vk) +{ + pl_assert(vk->dev); + + struct pl_gpu_t *gpu = pl_zalloc_obj(NULL, gpu, struct pl_vk); + gpu->log = vk->log; + + struct pl_vk *p = PL_PRIV(gpu); + pl_mutex_init(&p->recording); + p->vk = vk; + p->impl = pl_fns_vk; + p->spirv = pl_spirv_create(vk->log, get_spirv_version(vk)); + if (!p->spirv) + goto error; + + // Query all device properties + VkPhysicalDevicePCIBusInfoPropertiesEXT pci_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT, + }; + + VkPhysicalDeviceIDPropertiesKHR id_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR, + .pNext = &pci_props, + }; + + VkPhysicalDevicePushDescriptorPropertiesKHR pushd_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR, + .pNext = &id_props, + }; + + VkPhysicalDeviceSubgroupProperties group_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES, + .pNext = &pushd_props, + }; + + VkPhysicalDeviceExternalMemoryHostPropertiesEXT host_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT, + .pNext = &group_props, + }; + + VkPhysicalDeviceProperties2KHR props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &host_props, + }; + + bool is_portability = false; + +#ifdef VK_KHR_portability_subset + VkPhysicalDevicePortabilitySubsetPropertiesKHR port_props = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PORTABILITY_SUBSET_PROPERTIES_KHR, + .minVertexInputBindingStrideAlignment = 1, + }; + + for (int i = 0; i < vk->exts.num; i++) { + if (!strcmp(vk->exts.elem[i], VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME)) { + vk_link_struct(&props, &port_props); + is_portability = true; + break; + } + } +#endif + + vk->GetPhysicalDeviceProperties2(vk->physd, &props); + VkPhysicalDeviceLimits limits = props.properties.limits; + + // Determine GLSL features and limits + gpu->glsl = (struct pl_glsl_version) { + .version = 450, + .vulkan = 
true, + .compute = true, + .max_shmem_size = limits.maxComputeSharedMemorySize, + .max_group_threads = limits.maxComputeWorkGroupInvocations, + .max_group_size = { + limits.maxComputeWorkGroupSize[0], + limits.maxComputeWorkGroupSize[1], + limits.maxComputeWorkGroupSize[2], + }, + }; + + VkShaderStageFlags req_stages = VK_SHADER_STAGE_FRAGMENT_BIT | + VK_SHADER_STAGE_COMPUTE_BIT; + VkSubgroupFeatureFlags req_flags = VK_SUBGROUP_FEATURE_BASIC_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | + VK_SUBGROUP_FEATURE_BALLOT_BIT | + VK_SUBGROUP_FEATURE_SHUFFLE_BIT; + + if ((group_props.supportedStages & req_stages) == req_stages && + (group_props.supportedOperations & req_flags) == req_flags) + { + gpu->glsl.subgroup_size = group_props.subgroupSize; + } + + if (vk->features.features.shaderImageGatherExtended) { + gpu->glsl.min_gather_offset = limits.minTexelGatherOffset; + gpu->glsl.max_gather_offset = limits.maxTexelGatherOffset; + } + + const size_t max_size = vk_malloc_avail(vk->ma, 0); + gpu->limits = (struct pl_gpu_limits) { + // pl_gpu + .thread_safe = true, + .callbacks = true, + // pl_buf + .max_buf_size = max_size, + .max_ubo_size = PL_MIN(limits.maxUniformBufferRange, max_size), + .max_ssbo_size = PL_MIN(limits.maxStorageBufferRange, max_size), + .max_vbo_size = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT), + .max_mapped_size = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT), + .max_buffer_texels = PL_MIN(limits.maxTexelBufferElements, max_size), + .align_host_ptr = host_props.minImportedHostPointerAlignment, + .host_cached = vk_malloc_avail(vk->ma, VK_MEMORY_PROPERTY_HOST_CACHED_BIT), + // pl_tex + .max_tex_1d_dim = limits.maxImageDimension1D, + .max_tex_2d_dim = limits.maxImageDimension2D, + .max_tex_3d_dim = limits.maxImageDimension3D, + .blittable_1d_3d = true, + .buf_transfer = true, + .align_tex_xfer_pitch = limits.optimalBufferCopyRowPitchAlignment, + .align_tex_xfer_offset = pl_lcm(limits.optimalBufferCopyOffsetAlignment, 4), + // pl_pass + .max_variable_comps = 0, // vulkan doesn't support these at all + .max_constants = SIZE_MAX, + .array_size_constants = !is_portability, + .max_pushc_size = limits.maxPushConstantsSize, +#ifdef VK_KHR_portability_subset + .align_vertex_stride = port_props.minVertexInputBindingStrideAlignment, +#else + .align_vertex_stride = 1, +#endif + .max_dispatch = { + limits.maxComputeWorkGroupCount[0], + limits.maxComputeWorkGroupCount[1], + limits.maxComputeWorkGroupCount[2], + }, + .fragment_queues = vk->pool_graphics->num_queues, + .compute_queues = vk->pool_compute->num_queues, + }; + + gpu->export_caps.buf = vk_malloc_handle_caps(vk->ma, false); + gpu->import_caps.buf = vk_malloc_handle_caps(vk->ma, true); + gpu->export_caps.tex = vk_tex_handle_caps(vk, false); + gpu->import_caps.tex = vk_tex_handle_caps(vk, true); + gpu->export_caps.sync = vk_sync_handle_caps(vk); + gpu->import_caps.sync = 0; // Not supported yet + + if (pl_gpu_supports_interop(gpu)) { + pl_static_assert(sizeof(gpu->uuid) == VK_UUID_SIZE); + memcpy(gpu->uuid, id_props.deviceUUID, sizeof(gpu->uuid)); + + gpu->pci.domain = pci_props.pciDomain; + gpu->pci.bus = pci_props.pciBus; + gpu->pci.device = pci_props.pciDevice; + gpu->pci.function = pci_props.pciFunction; + } + + if (vk->CmdPushDescriptorSetKHR) + p->max_push_descriptors = pushd_props.maxPushDescriptors; + + vk_setup_formats(gpu); + + // Compute the correct minimum texture alignment + p->min_texel_alignment = 1; + for (int i = 0; i < gpu->num_formats; i++) { + if 
(gpu->formats[i]->emulated || gpu->formats[i]->opaque) + continue; + size_t texel_size = gpu->formats[i]->texel_size; + p->min_texel_alignment = pl_lcm(p->min_texel_alignment, texel_size); + } + PL_DEBUG(gpu, "Minimum texel alignment: %zu", p->min_texel_alignment); + + // Initialize the samplers + for (enum pl_tex_sample_mode s = 0; s < PL_TEX_SAMPLE_MODE_COUNT; s++) { + for (enum pl_tex_address_mode a = 0; a < PL_TEX_ADDRESS_MODE_COUNT; a++) { + static const VkSamplerAddressMode modes[PL_TEX_ADDRESS_MODE_COUNT] = { + [PL_TEX_ADDRESS_CLAMP] = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE, + [PL_TEX_ADDRESS_REPEAT] = VK_SAMPLER_ADDRESS_MODE_REPEAT, + [PL_TEX_ADDRESS_MIRROR] = VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT, + }; + + VkSamplerCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .magFilter = filters[s], + .minFilter = filters[s], + .addressModeU = modes[a], + .addressModeV = modes[a], + .addressModeW = modes[a], + .maxAnisotropy = 1.0, + }; + + VK(vk->CreateSampler(vk->dev, &sinfo, PL_VK_ALLOC, &p->samplers[s][a])); + } + } + + return pl_gpu_finalize(gpu); + +error: + vk_gpu_destroy(gpu); + return NULL; +} + +static void vk_sync_destroy(pl_gpu gpu, pl_sync sync) +{ + if (!sync) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + +#ifdef PL_HAVE_UNIX + if (sync->handle_type == PL_HANDLE_FD) { + if (sync->wait_handle.fd > -1) + close(sync->wait_handle.fd); + if (sync->signal_handle.fd > -1) + close(sync->signal_handle.fd); + } +#endif +#ifdef PL_HAVE_WIN32 + if (sync->handle_type == PL_HANDLE_WIN32) { + if (sync->wait_handle.handle != NULL) + CloseHandle(sync->wait_handle.handle); + if (sync->signal_handle.handle != NULL) + CloseHandle(sync->signal_handle.handle); + } + // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed. 
+#endif + + vk->DestroySemaphore(vk->dev, sync_vk->wait, PL_VK_ALLOC); + vk->DestroySemaphore(vk->dev, sync_vk->signal, PL_VK_ALLOC); + + pl_free((void *) sync); +} + +void vk_sync_deref(pl_gpu gpu, pl_sync sync) +{ + if (!sync) + return; + + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + if (pl_rc_deref(&sync_vk->rc)) + vk_sync_destroy(gpu, sync); +} + +static pl_sync vk_sync_create(pl_gpu gpu, enum pl_handle_type handle_type) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + struct pl_sync_t *sync = pl_zalloc_obj(NULL, sync, struct pl_sync_vk); + sync->handle_type = handle_type; + + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + pl_rc_init(&sync_vk->rc); + + VkExportSemaphoreCreateInfoKHR einfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR, + .handleTypes = vk_sync_handle_type(handle_type), + }; + + switch (handle_type) { + case PL_HANDLE_FD: + sync->wait_handle.fd = -1; + sync->signal_handle.fd = -1; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + sync->wait_handle.handle = NULL; + sync->signal_handle.handle = NULL; + break; + case PL_HANDLE_DMA_BUF: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + pl_unreachable(); + } + + const VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &einfo, + }; + + VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->wait)); + VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sync_vk->signal)); + PL_VK_NAME(SEMAPHORE, sync_vk->wait, "sync wait"); + PL_VK_NAME(SEMAPHORE, sync_vk->signal, "sync signal"); + +#ifdef PL_HAVE_UNIX + if (handle_type == PL_HANDLE_FD) { + VkSemaphoreGetFdInfoKHR finfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, + .semaphore = sync_vk->wait, + .handleType = einfo.handleTypes, + }; + + VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->wait_handle.fd)); + + finfo.semaphore = sync_vk->signal; + VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, &sync->signal_handle.fd)); + } +#endif + +#ifdef PL_HAVE_WIN32 + if (handle_type == PL_HANDLE_WIN32 || + handle_type == PL_HANDLE_WIN32_KMT) + { + VkSemaphoreGetWin32HandleInfoKHR handle_info = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = sync_vk->wait, + .handleType = einfo.handleTypes, + }; + + VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info, + &sync->wait_handle.handle)); + + handle_info.semaphore = sync_vk->signal; + VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info, + &sync->signal_handle.handle)); + } +#endif + + return sync; + +error: + vk_sync_destroy(gpu, sync); + return NULL; +} + +void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore) +{ + VkSemaphore sem = *semaphore; + if (!sem) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + vk->DestroySemaphore(vk->dev, sem, PL_VK_ALLOC); + *semaphore = VK_NULL_HANDLE; +} + +VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_assert(PL_ISPOT(params->export_handle)); + if ((params->export_handle & gpu->export_caps.sync) != params->export_handle) { + PL_ERR(gpu, "Invalid handle type 0x%"PRIx64" specified for " + "`pl_vulkan_sem_create`!", (uint64_t) params->export_handle); + return VK_NULL_HANDLE; + } + + switch (params->export_handle) { + case PL_HANDLE_FD: + params->out_handle->fd = -1; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + params->out_handle->handle = NULL; + break; + case 
PL_HANDLE_DMA_BUF: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + pl_unreachable(); + } + + const VkExportSemaphoreCreateInfoKHR einfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR, + .handleTypes = vk_sync_handle_type(params->export_handle), + }; + + const VkSemaphoreTypeCreateInfo stinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, + .pNext = params->export_handle ? &einfo : NULL, + .semaphoreType = params->type, + .initialValue = params->initial_value, + }; + + const VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = &stinfo, + }; + + VkSemaphore sem = VK_NULL_HANDLE; + VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &sem)); + PL_VK_NAME(SEMAPHORE, sem, PL_DEF(params->debug_tag, "pl_vulkan_sem")); + +#ifdef PL_HAVE_UNIX + if (params->export_handle == PL_HANDLE_FD) { + VkSemaphoreGetFdInfoKHR finfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, + .handleType = einfo.handleTypes, + .semaphore = sem, + }; + + VK(vk->GetSemaphoreFdKHR(vk->dev, &finfo, ¶ms->out_handle->fd)); + } +#endif + +#ifdef PL_HAVE_WIN32 + if (params->export_handle == PL_HANDLE_WIN32 || + params->export_handle == PL_HANDLE_WIN32_KMT) + { + VkSemaphoreGetWin32HandleInfoKHR handle_info = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .handleType = einfo.handleTypes, + .semaphore = sem, + }; + + VK(vk->GetSemaphoreWin32HandleKHR(vk->dev, &handle_info, + ¶ms->out_handle->handle)); + } +#endif + + return sem; + +error: +#ifdef PL_HAVE_UNIX + if (params->export_handle == PL_HANDLE_FD) { + if (params->out_handle->fd > -1) + close(params->out_handle->fd); + } +#endif +#ifdef PL_HAVE_WIN32 + if (params->export_handle == PL_HANDLE_WIN32) { + if (params->out_handle->handle != NULL) + CloseHandle(params->out_handle->handle); + } + // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed. 
+#endif + vk->DestroySemaphore(vk->dev, sem, PL_VK_ALLOC); + return VK_NULL_HANDLE; +} + +static void vk_gpu_flush(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + CMD_SUBMIT(NULL); + vk_rotate_queues(vk); + vk_malloc_garbage_collect(vk->ma); +} + +static void vk_gpu_finish(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + CMD_SUBMIT(NULL); + vk_wait_idle(vk); +} + +static bool vk_gpu_is_failed(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + return vk->failed; +} + +struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + pl_mutex_lock(&p->recording); + struct vk_cmd *cmd = p->cmd; + p->cmd = NULL; + pl_mutex_unlock(&p->recording); + + struct vk_cmdpool *pool = vk->pool_graphics; + if (!cmd || cmd->pool != pool) { + vk_cmd_submit(&cmd); + cmd = vk_cmd_begin(pool, NULL); + } + + return cmd; +} + +void pl_vk_print_heap(pl_gpu gpu, enum pl_log_level lev) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + vk_malloc_print_stats(vk->ma, lev); +} + +static const struct pl_gpu_fns pl_fns_vk = { + .destroy = vk_gpu_destroy, + .tex_create = vk_tex_create, + .tex_destroy = vk_tex_deref, + .tex_invalidate = vk_tex_invalidate, + .tex_clear_ex = vk_tex_clear_ex, + .tex_blit = vk_tex_blit, + .tex_upload = vk_tex_upload, + .tex_download = vk_tex_download, + .tex_poll = vk_tex_poll, + .tex_export = vk_tex_export, + .buf_create = vk_buf_create, + .buf_destroy = vk_buf_deref, + .buf_write = vk_buf_write, + .buf_read = vk_buf_read, + .buf_copy = vk_buf_copy, + .buf_export = vk_buf_export, + .buf_poll = vk_buf_poll, + .desc_namespace = vk_desc_namespace, + .pass_create = vk_pass_create, + .pass_destroy = vk_pass_destroy, + .pass_run = vk_pass_run, + .sync_create = vk_sync_create, + .sync_destroy = vk_sync_deref, + .timer_create = vk_timer_create, + .timer_destroy = vk_timer_destroy, + .timer_query = vk_timer_query, + .gpu_flush = vk_gpu_flush, + .gpu_finish = vk_gpu_finish, + .gpu_is_failed = vk_gpu_is_failed, +}; diff --git a/src/vulkan/gpu.h b/src/vulkan/gpu.h new file mode 100644 index 0000000..041de13 --- /dev/null +++ b/src/vulkan/gpu.h @@ -0,0 +1,175 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" +#include "command.h" +#include "formats.h" +#include "malloc.h" +#include "utils.h" + +#include "../gpu.h" +#include "../glsl/spirv.h" +#include "../pl_thread.h" + +pl_gpu pl_gpu_create_vk(struct vk_ctx *vk); + +// This function takes the current graphics command and steals it from the +// GPU, so the caller can do custom vk_cmd_ calls on it. The caller should +// submit it as well. 
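+//
+// Illustrative sketch (not part of the original sources): a hypothetical
+// caller might record a custom signal operation on the stolen command and
+// then submit it itself. `sem` and `value` stand in for a caller-owned
+// timeline semaphore and its next counter value.
+//
+//   struct vk_cmd *cmd = pl_vk_steal_cmd(gpu);
+//   if (cmd) {
+//       vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+//                  (pl_vulkan_sem) { .sem = sem, .value = value });
+//       vk_cmd_submit(&cmd); // submission is the caller's responsibility
+//   }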
+struct vk_cmd *pl_vk_steal_cmd(pl_gpu gpu); + +// Print memory usage statistics +void pl_vk_print_heap(pl_gpu, enum pl_log_level); + +// --- pl_gpu internal structs and helpers + +struct pl_fmt_vk { + const struct vk_format *vk_fmt; + bool blit_emulated; +}; + +enum queue_type { + GRAPHICS, + COMPUTE, + TRANSFER, + ANY, +}; + +struct pl_vk { + struct pl_gpu_fns impl; + struct vk_ctx *vk; + pl_spirv spirv; + + // Some additional cached device limits and features checks + uint32_t max_push_descriptors; + size_t min_texel_alignment; + + // The "currently recording" command. This will be queued and replaced by + // a new command every time we need to "switch" between queue families. + pl_mutex recording; + struct vk_cmd *cmd; + pl_timer cmd_timer; + + // Array of VkSamplers for every combination of sample/address modes + VkSampler samplers[PL_TEX_SAMPLE_MODE_COUNT][PL_TEX_ADDRESS_MODE_COUNT]; + + // To avoid spamming warnings + bool warned_modless; +}; + +struct vk_cmd *_begin_cmd(pl_gpu, enum queue_type, const char *label, pl_timer); +bool _end_cmd(pl_gpu, struct vk_cmd **, bool submit); + +#define CMD_BEGIN(type) _begin_cmd(gpu, type, __func__, NULL) +#define CMD_BEGIN_TIMED(type, timer) _begin_cmd(gpu, type, __func__, timer) +#define CMD_FINISH(cmd) _end_cmd(gpu, cmd, false) +#define CMD_SUBMIT(cmd) _end_cmd(gpu, cmd, true) + +// Helper to fire a callback the next time the `pl_gpu` is in an idle state +// +// Use this instead of `vk_dev_callback` when you need to clean up after +// resources that might possibly still be in use by the `pl_gpu` at the time of +// creating the callback. +void vk_gpu_idle_callback(pl_gpu, vk_cb, const void *priv, const void *arg); + +struct pl_tex_vk { + pl_rc_t rc; + bool external_img; + enum queue_type transfer_queue; + VkImageType type; + VkImage img; + VkImageAspectFlags aspect; + struct vk_memslice mem; + // cached properties + VkFormat img_fmt; + VkImageUsageFlags usage_flags; + // for sampling + VkImageView view; + // for rendering + VkFramebuffer framebuffer; + // for vk_tex_upload/download fallback code + pl_fmt texel_fmt; + // for planar textures (as a convenience) + int num_planes; + struct pl_tex_vk *planes[4]; + + // synchronization and current state (planes only) + struct vk_sem sem; + VkImageLayout layout; + PL_ARRAY(pl_vulkan_sem) ext_deps; // external semaphore, not owned by the pl_tex + pl_sync ext_sync; // indicates an exported image + uint32_t qf; // last queue family to access this texture (for barriers) + bool may_invalidate; + bool held; +}; + +pl_tex vk_tex_create(pl_gpu, const struct pl_tex_params *); +void vk_tex_deref(pl_gpu, pl_tex); +void vk_tex_invalidate(pl_gpu, pl_tex); +void vk_tex_clear_ex(pl_gpu, pl_tex, const union pl_clear_color); +void vk_tex_blit(pl_gpu, const struct pl_tex_blit_params *); +bool vk_tex_upload(pl_gpu, const struct pl_tex_transfer_params *); +bool vk_tex_download(pl_gpu, const struct pl_tex_transfer_params *); +bool vk_tex_poll(pl_gpu, pl_tex, uint64_t timeout); +bool vk_tex_export(pl_gpu, pl_tex, pl_sync); +void vk_tex_barrier(pl_gpu, struct vk_cmd *, pl_tex, VkPipelineStageFlags2, + VkAccessFlags2, VkImageLayout, uint32_t qf); + +struct pl_buf_vk { + pl_rc_t rc; + struct vk_memslice mem; + enum queue_type update_queue; + VkBufferView view; // for texel buffers + + // synchronization and current state + struct vk_sem sem; + bool exported; + bool needs_flush; +}; + +pl_buf vk_buf_create(pl_gpu, const struct pl_buf_params *); +void vk_buf_deref(pl_gpu, pl_buf); +void vk_buf_write(pl_gpu, pl_buf, size_t 
offset, const void *src, size_t size); +bool vk_buf_read(pl_gpu, pl_buf, size_t offset, void *dst, size_t size); +void vk_buf_copy(pl_gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size); +bool vk_buf_export(pl_gpu, pl_buf); +bool vk_buf_poll(pl_gpu, pl_buf, uint64_t timeout); + +// Helper to ease buffer barrier creation. (`offset` is relative to pl_buf) +void vk_buf_barrier(pl_gpu, struct vk_cmd *, pl_buf, VkPipelineStageFlags2, + VkAccessFlags2, size_t offset, size_t size, bool export); + +// Flush visible writes to a buffer made by the API +void vk_buf_flush(pl_gpu, struct vk_cmd *, pl_buf, size_t offset, size_t size); + +struct pl_pass_vk; + +int vk_desc_namespace(pl_gpu, enum pl_desc_type); +pl_pass vk_pass_create(pl_gpu, const struct pl_pass_params *); +void vk_pass_destroy(pl_gpu, pl_pass); +void vk_pass_run(pl_gpu, const struct pl_pass_run_params *); + +struct pl_sync_vk { + pl_rc_t rc; + VkSemaphore wait; + VkSemaphore signal; +}; + +void vk_sync_deref(pl_gpu, pl_sync); diff --git a/src/vulkan/gpu_buf.c b/src/vulkan/gpu_buf.c new file mode 100644 index 0000000..2f317bc --- /dev/null +++ b/src/vulkan/gpu_buf.c @@ -0,0 +1,470 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" + +void vk_buf_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf, + VkPipelineStageFlags2 stage, VkAccessFlags2 access, + size_t offset, size_t size, bool export) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_assert(!export || !buf_vk->exported); // can't re-export exported buffers + pl_rc_ref(&buf_vk->rc); + + bool needs_flush = buf_vk->needs_flush || buf->params.host_mapped || + buf->params.import_handle == PL_HANDLE_HOST_PTR; + bool noncoherent = buf_vk->mem.data && !buf_vk->mem.coherent; + if (needs_flush && noncoherent) { + VK(vk->FlushMappedMemoryRanges(vk->dev, 1, &(struct VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = buf_vk->mem.vkmem, + .offset = buf_vk->mem.map_offset, + .size = buf_vk->mem.map_size, + })); + + // Just ignore errors, not much we can do about them other than + // logging them and moving on... + error: ; + } + + struct vk_sync_scope last; + last = vk_sem_barrier(cmd, &buf_vk->sem, stage, access, export); + + // CONCURRENT buffers require transitioning to/from IGNORED, EXCLUSIVE + // buffers require transitioning to/from the concrete QF index + uint32_t qf = vk->pools.num > 1 ? VK_QUEUE_FAMILY_IGNORED : cmd->pool->qf; + uint32_t src_qf = buf_vk->exported ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf; + uint32_t dst_qf = export ? 
VK_QUEUE_FAMILY_EXTERNAL_KHR : qf; + + if (last.access || src_qf != dst_qf) { + vk_cmd_barrier(cmd, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = last.stage, + .srcAccessMask = last.access, + .dstStageMask = stage, + .dstAccessMask = access, + .srcQueueFamilyIndex = src_qf, + .dstQueueFamilyIndex = dst_qf, + .buffer = buf_vk->mem.buf, + .offset = buf_vk->mem.offset + offset, + .size = size, + }, + }); + } + + buf_vk->needs_flush = false; + buf_vk->exported = export; + vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, gpu, buf); +} + +void vk_buf_deref(pl_gpu gpu, pl_buf buf) +{ + if (!buf) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + if (pl_rc_deref(&buf_vk->rc)) { + vk->DestroyBufferView(vk->dev, buf_vk->view, PL_VK_ALLOC); + vk_malloc_free(vk->ma, &buf_vk->mem); + pl_free((void *) buf); + } +} + +pl_buf vk_buf_create(pl_gpu gpu, const struct pl_buf_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_vk); + buf->params = *params; + buf->params.initial_data = NULL; + + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_rc_init(&buf_vk->rc); + + struct vk_malloc_params mparams = { + .reqs = { + .size = PL_ALIGN2(params->size, 4), // for vk_buf_write + .memoryTypeBits = UINT32_MAX, + .alignment = 1, + }, + // these are always set, because `vk_buf_copy` can always be used + .buf_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .export_handle = params->export_handle, + .import_handle = params->import_handle, + .shared_mem = params->shared_mem, + .debug_tag = params->debug_tag, + }; + + // Mandatory/optimal buffer offset alignment + VkDeviceSize *align = &mparams.reqs.alignment; + VkDeviceSize extra_align = vk->props.limits.optimalBufferCopyOffsetAlignment; + + // Try and align all buffers to the minimum texel alignment, to make sure + // tex_upload/tex_download always gets aligned buffer copies if possible + extra_align = pl_lcm(extra_align, p->min_texel_alignment); + + enum pl_buf_mem_type mem_type = params->memory_type; + bool is_texel = false; + + if (params->uniform) { + mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; + *align = pl_lcm(*align, vk->props.limits.minUniformBufferOffsetAlignment); + mem_type = PL_BUF_MEM_DEVICE; + if (params->format) { + mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT; + is_texel = true; + } + } + + if (params->storable) { + mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + *align = pl_lcm(*align, vk->props.limits.minStorageBufferOffsetAlignment); + buf_vk->update_queue = COMPUTE; + mem_type = PL_BUF_MEM_DEVICE; + if (params->format) { + mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT; + is_texel = true; + } + } + + if (is_texel) { + *align = pl_lcm(*align, vk->props.limits.minTexelBufferOffsetAlignment); + *align = pl_lcm(*align, params->format->texel_size); + } + + if (params->drawable) { + mparams.buf_usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | + VK_BUFFER_USAGE_INDEX_BUFFER_BIT; + mem_type = PL_BUF_MEM_DEVICE; + } + + if (params->host_writable || params->initial_data) { + // Buffers should be written using mapped memory if possible + mparams.optimal = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + // Use the transfer queue for updates on very large 
buffers (1 MB) + if (params->size > 1024*1024) + buf_vk->update_queue = TRANSFER; + } + + if (params->host_mapped || params->host_readable) { + mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + + if (params->size > 1024) { + // Prefer cached memory for large buffers (1 kB) which may be read + // from, because uncached reads are extremely slow + mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + } + } + + switch (mem_type) { + case PL_BUF_MEM_AUTO: + // We generally prefer VRAM since it's faster than RAM, but any number + // of other requirements could potentially exclude it, so just mark it + // as optimal by default. + if (!(mparams.optimal & VK_MEMORY_PROPERTY_HOST_CACHED_BIT)) + mparams.optimal |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + case PL_BUF_MEM_DEVICE: + // Force device local memory. + mparams.required |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + case PL_BUF_MEM_HOST: + // This isn't a true guarantee, but actually trying to restrict the + // device-local bit locks out all memory heaps on iGPUs. Requiring + // the memory be host-mapped is the easiest compromise. + mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + break; + case PL_BUF_MEM_TYPE_COUNT: + pl_unreachable(); + } + + if (params->import_handle) { + size_t offset = params->shared_mem.offset; + if (PL_ALIGN(offset, *align) != offset) { + PL_ERR(gpu, "Imported memory offset %zu violates minimum alignment " + "requirement of enabled usage flags (%zu)!", + offset, (size_t) *align); + goto error; + } + } else { + *align = pl_lcm(*align, extra_align); + } + + if (!vk_malloc_slice(vk->ma, &buf_vk->mem, &mparams)) + goto error; + + if (params->host_mapped) + buf->data = buf_vk->mem.data; + + if (params->export_handle) { + buf->shared_mem = buf_vk->mem.shared_mem; + buf->shared_mem.drm_format_mod = DRM_FORMAT_MOD_LINEAR; + buf_vk->exported = true; + } + + if (is_texel) { + struct pl_fmt_vk *fmtp = PL_PRIV(params->format); + VkBufferViewCreateInfo vinfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .buffer = buf_vk->mem.buf, + .format = PL_DEF(fmtp->vk_fmt->bfmt, fmtp->vk_fmt->tfmt), + .offset = buf_vk->mem.offset, + .range = buf_vk->mem.size, + }; + + VK(vk->CreateBufferView(vk->dev, &vinfo, PL_VK_ALLOC, &buf_vk->view)); + PL_VK_NAME(BUFFER_VIEW, buf_vk->view, PL_DEF(params->debug_tag, "texel")); + } + + if (params->initial_data) + vk_buf_write(gpu, buf, 0, params->initial_data, params->size); + + return buf; + +error: + vk_buf_deref(gpu, buf); + return NULL; +} + +static void invalidate_buf(pl_gpu gpu, pl_buf buf) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + if (buf_vk->mem.data && !buf_vk->mem.coherent) { + VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = buf_vk->mem.vkmem, + .offset = buf_vk->mem.map_offset, + .size = buf_vk->mem.map_size, + })); + } + + // Ignore errors (after logging), nothing useful we can do anyway +error: ; + vk_buf_deref(gpu, buf); +} + +void vk_buf_flush(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf, + size_t offset, size_t size) +{ + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + // We need to perform a flush if the host is capable of reading back from + // the buffer, or if we intend to overwrite it using mapped memory + bool can_read = buf->params.host_readable; + bool can_write = buf_vk->mem.data && buf->params.host_writable; + if 
(buf->params.host_mapped || buf->params.import_handle == PL_HANDLE_HOST_PTR) + can_read = can_write = true; + + if (!can_read && !can_write) + return; + + vk_cmd_barrier(cmd, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, + .srcStageMask = buf_vk->sem.write.stage, + .srcAccessMask = buf_vk->sem.write.access, + .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT, + .dstAccessMask = (can_read ? VK_ACCESS_2_HOST_READ_BIT : 0) + | (can_write ? VK_ACCESS_2_HOST_WRITE_BIT : 0), + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = buf_vk->mem.buf, + .offset = buf_vk->mem.offset + offset, + .size = size, + }, + }); + + // We need to hold on to the buffer until this barrier completes + vk_cmd_callback(cmd, (vk_cb) invalidate_buf, gpu, buf); + pl_rc_ref(&buf_vk->rc); +} + +bool vk_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + // Opportunistically check if we can re-use this buffer without flush + vk_poll_commands(vk, 0); + if (pl_rc_count(&buf_vk->rc) == 1) + return false; + + // Otherwise, we're force to submit any queued command so that the + // user is guaranteed to see progress eventually, even if they call + // this in a tight loop + CMD_SUBMIT(NULL); + vk_poll_commands(vk, timeout); + + return pl_rc_count(&buf_vk->rc) > 1; +} + +void vk_buf_write(pl_gpu gpu, pl_buf buf, size_t offset, + const void *data, size_t size) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + // For host-mapped buffers, we can just directly memcpy the buffer contents. + // Otherwise, we can update the buffer from the GPU using a command buffer. + if (buf_vk->mem.data) { + // ensure no queued operations + while (vk_buf_poll(gpu, buf, UINT64_MAX)) + ; // do nothing + + uintptr_t addr = (uintptr_t) buf_vk->mem.data + offset; + memcpy((void *) addr, data, size); + buf_vk->needs_flush = true; + } else { + struct vk_cmd *cmd = CMD_BEGIN(buf_vk->update_queue); + if (!cmd) { + PL_ERR(gpu, "Failed updating buffer!"); + return; + } + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, offset, size, false); + + // Vulkan requires `size` to be a multiple of 4, so we need to make + // sure to handle the end separately if the original data is not + const size_t max_transfer = 64 * 1024; + size_t size_rem = size % 4; + size_t size_base = size - size_rem; + VkDeviceSize buf_offset = buf_vk->mem.offset + offset; + + if (size_base > max_transfer) { + PL_TRACE(gpu, "Using multiple vkCmdUpdateBuffer calls to upload " + "large buffer. 
Consider using buffer-buffer transfers " + "instead!"); + } + + for (size_t xfer = 0; xfer < size_base; xfer += max_transfer) { + vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, + buf_offset + xfer, + PL_MIN(size_base, max_transfer), + (void *) ((uint8_t *) data + xfer)); + } + + if (size_rem) { + uint8_t tail[4] = {0}; + memcpy(tail, data, size_rem); + vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, buf_offset + size_base, + sizeof(tail), tail); + } + + pl_assert(!buf->params.host_readable); // no flush needed due to this + CMD_FINISH(&cmd); + } +} + +bool vk_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, size_t size) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_assert(buf_vk->mem.data); + + if (vk_buf_poll(gpu, buf, 0) && buf_vk->sem.write.sync.sem) { + // ensure no more queued writes + VK(vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, + .semaphoreCount = 1, + .pSemaphores = &buf_vk->sem.write.sync.sem, + .pValues = &buf_vk->sem.write.sync.value, + }, UINT64_MAX)); + + // process callbacks + vk_poll_commands(vk, 0); + } + + uintptr_t addr = (uintptr_t) buf_vk->mem.data + (size_t) offset; + memcpy(dest, (void *) addr, size); + return true; + +error: + return false; +} + +void vk_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset, + pl_buf src, size_t src_offset, size_t size) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_buf_vk *dst_vk = PL_PRIV(dst); + struct pl_buf_vk *src_vk = PL_PRIV(src); + + struct vk_cmd *cmd = CMD_BEGIN(dst_vk->update_queue); + if (!cmd) { + PL_ERR(gpu, "Failed copying buffer!"); + return; + } + + vk_buf_barrier(gpu, cmd, dst, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, dst_offset, size, false); + vk_buf_barrier(gpu, cmd, src, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, src_offset, size, false); + + VkBufferCopy region = { + .srcOffset = src_vk->mem.offset + src_offset, + .dstOffset = dst_vk->mem.offset + dst_offset, + .size = size, + }; + + vk->CmdCopyBuffer(cmd->buf, src_vk->mem.buf, dst_vk->mem.buf, + 1, ®ion); + + vk_buf_flush(gpu, cmd, dst, dst_offset, size); + CMD_FINISH(&cmd); +} + +bool vk_buf_export(pl_gpu gpu, pl_buf buf) +{ + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + if (buf_vk->exported) + return true; + + struct vk_cmd *cmd = CMD_BEGIN(ANY); + if (!cmd) { + PL_ERR(gpu, "Failed exporting buffer!"); + return false; + } + + // For the queue family ownership transfer, we can ignore all pipeline + // stages since the synchronization via fences/semaphores is required + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_NONE, 0, 0, + buf->params.size, true); + + + return CMD_SUBMIT(&cmd); +} diff --git a/src/vulkan/gpu_pass.c b/src/vulkan/gpu_pass.c new file mode 100644 index 0000000..5ffe77d --- /dev/null +++ b/src/vulkan/gpu_pass.c @@ -0,0 +1,964 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" +#include "cache.h" +#include "glsl/spirv.h" + +// For pl_pass.priv +struct pl_pass_vk { + // Pipeline / render pass + VkPipeline base; + VkPipeline pipe; + VkPipelineLayout pipeLayout; + VkRenderPass renderPass; + // Descriptor set (bindings) + bool use_pushd; + VkDescriptorSetLayout dsLayout; + VkDescriptorPool dsPool; + // To keep track of which descriptor sets are and aren't available, we + // allocate a fixed number and use a bitmask of all available sets. + VkDescriptorSet dss[16]; + uint16_t dmask; + + // For recompilation + VkVertexInputAttributeDescription *attrs; + VkPipelineCache cache; + VkShaderModule vert; + VkShaderModule shader; + + // For updating + VkWriteDescriptorSet *dswrite; + VkDescriptorImageInfo *dsiinfo; + VkDescriptorBufferInfo *dsbinfo; + VkSpecializationInfo specInfo; + size_t spec_size; +}; + +int vk_desc_namespace(pl_gpu gpu, enum pl_desc_type type) +{ + return 0; +} + +static void pass_destroy_cb(pl_gpu gpu, pl_pass pass) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + + vk->DestroyPipeline(vk->dev, pass_vk->pipe, PL_VK_ALLOC); + vk->DestroyPipeline(vk->dev, pass_vk->base, PL_VK_ALLOC); + vk->DestroyRenderPass(vk->dev, pass_vk->renderPass, PL_VK_ALLOC); + vk->DestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, PL_VK_ALLOC); + vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC); + vk->DestroyDescriptorPool(vk->dev, pass_vk->dsPool, PL_VK_ALLOC); + vk->DestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, PL_VK_ALLOC); + vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC); + vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC); + + pl_free((void *) pass); +} + +void vk_pass_destroy(pl_gpu gpu, pl_pass pass) +{ + vk_gpu_idle_callback(gpu, (vk_cb) pass_destroy_cb, gpu, pass); +} + +static const VkDescriptorType dsType[] = { + [PL_DESC_SAMPLED_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + [PL_DESC_STORAGE_IMG] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + [PL_DESC_BUF_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + [PL_DESC_BUF_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + [PL_DESC_BUF_TEXEL_UNIFORM] = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + [PL_DESC_BUF_TEXEL_STORAGE] = VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER, +}; + +static VkResult vk_compile_glsl(pl_gpu gpu, void *alloc, + enum glsl_shader_stage stage, + const char *shader, + pl_cache_obj *out_spirv) +{ + struct pl_vk *p = PL_PRIV(gpu); + pl_cache cache = pl_gpu_cache(gpu); + uint64_t key = CACHE_KEY_SPIRV; + if (cache) { // skip computing key if `cache + pl_hash_merge(&key, p->spirv->signature); + pl_hash_merge(&key, pl_str0_hash(shader)); + out_spirv->key = key; + if (pl_cache_get(cache, out_spirv)) { + PL_DEBUG(gpu, "Re-using cached SPIR-V object 0x%"PRIx64, key); + return VK_SUCCESS; + } + } + + pl_clock_t start = pl_clock_now(); + pl_str spirv = pl_spirv_compile_glsl(p->spirv, alloc, gpu->glsl, stage, shader); + pl_log_cpu_time(gpu->log, start, pl_clock_now(), "translating SPIR-V"); + out_spirv->data = spirv.buf; + out_spirv->size = spirv.len; + out_spirv->free = pl_free; + return spirv.len ? 
VK_SUCCESS : VK_ERROR_INITIALIZATION_FAILED; +} + +static const VkShaderStageFlags stageFlags[] = { + [PL_PASS_RASTER] = VK_SHADER_STAGE_FRAGMENT_BIT | + VK_SHADER_STAGE_VERTEX_BIT, + [PL_PASS_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT, +}; + +static void destroy_pipeline(struct vk_ctx *vk, void *pipeline) +{ + vk->DestroyPipeline(vk->dev, vk_unwrap_handle(pipeline), PL_VK_ALLOC); +} + +static VkResult vk_recreate_pipelines(struct vk_ctx *vk, pl_pass pass, + bool derivable, VkPipeline base, + VkPipeline *out_pipe) +{ + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + const struct pl_pass_params *params = &pass->params; + + // The old pipeline might still be in use, so we have to destroy it + // asynchronously with a device idle callback + if (*out_pipe) { + // We don't need to use `vk_gpu_idle_callback` because the only command + // that can access a VkPipeline, `vk_pass_run`, always flushes `p->cmd`. + vk_dev_callback(vk, (vk_cb) destroy_pipeline, vk, vk_wrap_handle(*out_pipe)); + *out_pipe = VK_NULL_HANDLE; + } + + VkPipelineCreateFlags flags = 0; + if (derivable) + flags |= VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT; + if (base) + flags |= VK_PIPELINE_CREATE_DERIVATIVE_BIT; + + const VkSpecializationInfo *specInfo = &pass_vk->specInfo; + if (!specInfo->dataSize) + specInfo = NULL; + + switch (params->type) { + case PL_PASS_RASTER: { + static const VkBlendFactor blendFactors[] = { + [PL_BLEND_ZERO] = VK_BLEND_FACTOR_ZERO, + [PL_BLEND_ONE] = VK_BLEND_FACTOR_ONE, + [PL_BLEND_SRC_ALPHA] = VK_BLEND_FACTOR_SRC_ALPHA, + [PL_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + }; + + VkPipelineColorBlendAttachmentState blendState = { + .colorBlendOp = VK_BLEND_OP_ADD, + .alphaBlendOp = VK_BLEND_OP_ADD, + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT, + }; + + const struct pl_blend_params *blend = params->blend_params; + if (blend) { + blendState.blendEnable = true; + blendState.srcColorBlendFactor = blendFactors[blend->src_rgb]; + blendState.dstColorBlendFactor = blendFactors[blend->dst_rgb]; + blendState.srcAlphaBlendFactor = blendFactors[blend->src_alpha]; + blendState.dstAlphaBlendFactor = blendFactors[blend->dst_alpha]; + } + + static const VkPrimitiveTopology topologies[PL_PRIM_TYPE_COUNT] = { + [PL_PRIM_TRIANGLE_LIST] = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, + [PL_PRIM_TRIANGLE_STRIP] = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, + }; + + VkGraphicsPipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .flags = flags, + .stageCount = 2, + .pStages = (VkPipelineShaderStageCreateInfo[]) { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = pass_vk->vert, + .pName = "main", + }, { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = pass_vk->shader, + .pName = "main", + .pSpecializationInfo = specInfo, + } + }, + .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &(VkVertexInputBindingDescription) { + .binding = 0, + .stride = params->vertex_stride, + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, + }, + .vertexAttributeDescriptionCount = params->num_vertex_attribs, + .pVertexAttributeDescriptions = pass_vk->attrs, + }, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = 
VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = topologies[params->vertex_type], + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 1, + .scissorCount = 1, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .polygonMode = VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .lineWidth = 1.0f, + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &blendState, + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 2, + .pDynamicStates = (VkDynamicState[]){ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }, + }, + .layout = pass_vk->pipeLayout, + .renderPass = pass_vk->renderPass, + .basePipelineHandle = base, + .basePipelineIndex = -1, + }; + + return vk->CreateGraphicsPipelines(vk->dev, pass_vk->cache, 1, &cinfo, + PL_VK_ALLOC, out_pipe); + } + + case PL_PASS_COMPUTE: { + VkComputePipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .flags = flags, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = pass_vk->shader, + .pName = "main", + .pSpecializationInfo = specInfo, + }, + .layout = pass_vk->pipeLayout, + .basePipelineHandle = base, + .basePipelineIndex = -1, + }; + + return vk->CreateComputePipelines(vk->dev, pass_vk->cache, 1, &cinfo, + PL_VK_ALLOC, out_pipe); + } + + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +pl_pass vk_pass_create(pl_gpu gpu, const struct pl_pass_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + bool success = false; + + struct pl_pass_t *pass = pl_zalloc_obj(NULL, pass, struct pl_pass_vk); + pass->params = pl_pass_params_copy(pass, params); + + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + pass_vk->dmask = -1; // all descriptors available + + // temporary allocations + void *tmp = pl_tmp(NULL); + + int num_desc = params->num_descriptors; + if (!num_desc) + goto no_descriptors; + if (num_desc > vk->props.limits.maxPerStageResources) { + PL_ERR(gpu, "Pass with %d descriptors exceeds the maximum number of " + "per-stage resources %" PRIu32"!", + num_desc, vk->props.limits.maxPerStageResources); + goto error; + } + + pass_vk->dswrite = pl_calloc(pass, num_desc, sizeof(VkWriteDescriptorSet)); + pass_vk->dsiinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorImageInfo)); + pass_vk->dsbinfo = pl_calloc(pass, num_desc, sizeof(VkDescriptorBufferInfo)); + +#define NUM_DS (PL_ARRAY_SIZE(pass_vk->dss)) + + int dsSize[PL_DESC_TYPE_COUNT] = {0}; + VkDescriptorSetLayoutBinding *bindings = pl_calloc_ptr(tmp, num_desc, bindings); + + uint32_t max_tex = vk->props.limits.maxPerStageDescriptorSampledImages, + max_img = vk->props.limits.maxPerStageDescriptorStorageImages, + max_ubo = vk->props.limits.maxPerStageDescriptorUniformBuffers, + max_ssbo = vk->props.limits.maxPerStageDescriptorStorageBuffers; + + uint32_t *dsLimits[PL_DESC_TYPE_COUNT] = { + 
[PL_DESC_SAMPLED_TEX] = &max_tex, + [PL_DESC_STORAGE_IMG] = &max_img, + [PL_DESC_BUF_UNIFORM] = &max_ubo, + [PL_DESC_BUF_STORAGE] = &max_ssbo, + [PL_DESC_BUF_TEXEL_UNIFORM] = &max_tex, + [PL_DESC_BUF_TEXEL_STORAGE] = &max_img, + }; + + for (int i = 0; i < num_desc; i++) { + struct pl_desc *desc = ¶ms->descriptors[i]; + if (!(*dsLimits[desc->type])--) { + PL_ERR(gpu, "Pass exceeds the maximum number of per-stage " + "descriptors of type %u!", (unsigned) desc->type); + goto error; + } + + dsSize[desc->type]++; + bindings[i] = (VkDescriptorSetLayoutBinding) { + .binding = desc->binding, + .descriptorType = dsType[desc->type], + .descriptorCount = 1, + .stageFlags = stageFlags[params->type], + }; + } + + VkDescriptorSetLayoutCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pBindings = bindings, + .bindingCount = num_desc, + }; + + if (p->max_push_descriptors && num_desc <= p->max_push_descriptors) { + dinfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR; + pass_vk->use_pushd = true; + } else if (p->max_push_descriptors) { + PL_INFO(gpu, "Pass with %d descriptors exceeds the maximum push " + "descriptor count (%d). Falling back to descriptor sets!", + num_desc, p->max_push_descriptors); + } + + VK(vk->CreateDescriptorSetLayout(vk->dev, &dinfo, PL_VK_ALLOC, + &pass_vk->dsLayout)); + + if (!pass_vk->use_pushd) { + PL_ARRAY(VkDescriptorPoolSize) dsPoolSizes = {0}; + + for (enum pl_desc_type t = 0; t < PL_DESC_TYPE_COUNT; t++) { + if (dsSize[t] > 0) { + PL_ARRAY_APPEND(tmp, dsPoolSizes, (VkDescriptorPoolSize) { + .type = dsType[t], + .descriptorCount = dsSize[t] * NUM_DS, + }); + } + } + + if (dsPoolSizes.num) { + VkDescriptorPoolCreateInfo pinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .maxSets = NUM_DS, + .pPoolSizes = dsPoolSizes.elem, + .poolSizeCount = dsPoolSizes.num, + }; + + VK(vk->CreateDescriptorPool(vk->dev, &pinfo, PL_VK_ALLOC, &pass_vk->dsPool)); + + VkDescriptorSetLayout layouts[NUM_DS]; + for (int i = 0; i < NUM_DS; i++) + layouts[i] = pass_vk->dsLayout; + + VkDescriptorSetAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = pass_vk->dsPool, + .descriptorSetCount = NUM_DS, + .pSetLayouts = layouts, + }; + + VK(vk->AllocateDescriptorSets(vk->dev, &ainfo, pass_vk->dss)); + } + } + +no_descriptors: ; + + bool has_spec = params->num_constants; + if (has_spec) { + PL_ARRAY(VkSpecializationMapEntry) entries = {0}; + PL_ARRAY_RESIZE(pass, entries, params->num_constants); + size_t spec_size = 0; + + for (int i = 0; i < params->num_constants; i++) { + const struct pl_constant *con = ¶ms->constants[i]; + size_t con_size = pl_var_type_size(con->type); + entries.elem[i] = (VkSpecializationMapEntry) { + .constantID = con->id, + .offset = con->offset, + .size = con_size, + }; + + size_t req_size = con->offset + con_size; + spec_size = PL_MAX(spec_size, req_size); + } + + pass_vk->spec_size = spec_size; + pass_vk->specInfo = (VkSpecializationInfo) { + .mapEntryCount = params->num_constants, + .pMapEntries = entries.elem, + }; + + if (params->constant_data) { + pass_vk->specInfo.pData = pl_memdup(pass, params->constant_data, spec_size); + pass_vk->specInfo.dataSize = spec_size; + } + } + + VkPipelineLayoutCreateInfo linfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = num_desc ? 1 : 0, + .pSetLayouts = &pass_vk->dsLayout, + .pushConstantRangeCount = params->push_constants_size ? 
1 : 0, + .pPushConstantRanges = &(VkPushConstantRange){ + .stageFlags = stageFlags[params->type], + .offset = 0, + .size = params->push_constants_size, + }, + }; + + VK(vk->CreatePipelineLayout(vk->dev, &linfo, PL_VK_ALLOC, + &pass_vk->pipeLayout)); + + pl_cache_obj vert = {0}, frag = {0}, comp = {0}; + switch (params->type) { + case PL_PASS_RASTER: ; + VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_VERTEX, params->vertex_shader, &vert)); + VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_FRAGMENT, params->glsl_shader, &frag)); + break; + case PL_PASS_COMPUTE: + VK(vk_compile_glsl(gpu, tmp, GLSL_SHADER_COMPUTE, params->glsl_shader, &comp)); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + // Use hash of generated SPIR-V as key for pipeline cache + const pl_cache cache = pl_gpu_cache(gpu); + pl_cache_obj pipecache = {0}; + if (cache) { + pipecache.key = CACHE_KEY_VK_PIPE; + pl_hash_merge(&pipecache.key, pl_var_hash(vk->props.pipelineCacheUUID)); + pl_hash_merge(&pipecache.key, pl_mem_hash(vert.data, vert.size)); + pl_hash_merge(&pipecache.key, pl_mem_hash(frag.data, frag.size)); + pl_hash_merge(&pipecache.key, pl_mem_hash(comp.data, comp.size)); + pl_cache_get(cache, &pipecache); + } + + if (cache || has_spec) { + // Don't create pipeline cache unless we either plan on caching the + // result of this shader to a pl_cache, or if we will possibly re-use + // it due to the presence of specialization constants + VkPipelineCacheCreateInfo pcinfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, + .pInitialData = pipecache.data, + .initialDataSize = pipecache.size, + }; + + VK(vk->CreatePipelineCache(vk->dev, &pcinfo, PL_VK_ALLOC, &pass_vk->cache)); + } + + VkShaderModuleCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + }; + + pl_clock_t start = pl_clock_now(); + switch (params->type) { + case PL_PASS_RASTER: { + sinfo.pCode = (uint32_t *) vert.data; + sinfo.codeSize = vert.size; + VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->vert)); + PL_VK_NAME(SHADER_MODULE, pass_vk->vert, "vertex"); + + sinfo.pCode = (uint32_t *) frag.data; + sinfo.codeSize = frag.size; + VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader)); + PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "fragment"); + + pass_vk->attrs = pl_calloc_ptr(pass, params->num_vertex_attribs, pass_vk->attrs); + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct pl_vertex_attrib *va = ¶ms->vertex_attribs[i]; + const struct vk_format **pfmt_vk = PL_PRIV(va->fmt); + + pass_vk->attrs[i] = (VkVertexInputAttributeDescription) { + .binding = 0, + .location = va->location, + .offset = va->offset, + .format = PL_DEF((*pfmt_vk)->bfmt, (*pfmt_vk)->tfmt), + }; + } + + VkRenderPassCreateInfo rinfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = (VkFormat) params->target_format->signature, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = pass->params.load_target + ? 
VK_ATTACHMENT_LOAD_OP_LOAD + : VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + }, + }; + + VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &pass_vk->renderPass)); + break; + } + case PL_PASS_COMPUTE: { + sinfo.pCode = (uint32_t *) comp.data; + sinfo.codeSize = comp.size; + VK(vk->CreateShaderModule(vk->dev, &sinfo, PL_VK_ALLOC, &pass_vk->shader)); + PL_VK_NAME(SHADER_MODULE, pass_vk->shader, "compute"); + break; + } + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + } + + pl_clock_t after_compilation = pl_clock_now(); + pl_log_cpu_time(gpu->log, start, after_compilation, "compiling shader"); + + // Update cache entries on successful compilation + pl_cache_steal(cache, &vert); + pl_cache_steal(cache, &frag); + pl_cache_steal(cache, &comp); + + // Create the graphics/compute pipeline + VkPipeline *pipe = has_spec ? &pass_vk->base : &pass_vk->pipe; + VK(vk_recreate_pipelines(vk, pass, has_spec, VK_NULL_HANDLE, pipe)); + pl_log_cpu_time(gpu->log, after_compilation, pl_clock_now(), "creating pipeline"); + + // Update pipeline cache + if (cache) { + size_t size = 0; + VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, NULL)); + pl_cache_obj_resize(tmp, &pipecache, size); + VK(vk->GetPipelineCacheData(vk->dev, pass_vk->cache, &size, pipecache.data)); + pl_cache_steal(cache, &pipecache); + } + + if (!has_spec) { + // We can free these if we no longer need them for specialization + pl_free_ptr(&pass_vk->attrs); + vk->DestroyShaderModule(vk->dev, pass_vk->vert, PL_VK_ALLOC); + vk->DestroyShaderModule(vk->dev, pass_vk->shader, PL_VK_ALLOC); + vk->DestroyPipelineCache(vk->dev, pass_vk->cache, PL_VK_ALLOC); + pass_vk->vert = VK_NULL_HANDLE; + pass_vk->shader = VK_NULL_HANDLE; + pass_vk->cache = VK_NULL_HANDLE; + } + + PL_DEBUG(vk, "Pass statistics: size %zu, SPIR-V: vert %zu frag %zu comp %zu", + pipecache.size, vert.size, frag.size, comp.size); + + success = true; + +error: + if (!success) { + pass_destroy_cb(gpu, pass); + pass = NULL; + } + +#undef NUM_DS + + pl_free(tmp); + return pass; +} + +static const VkPipelineStageFlags2 shaderStages[] = { + [PL_PASS_RASTER] = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, + [PL_PASS_COMPUTE] = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, +}; + +static void vk_update_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass, + struct pl_desc_binding db, + VkDescriptorSet ds, int idx) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + struct pl_desc *desc = &pass->params.descriptors[idx]; + + VkWriteDescriptorSet *wds = &pass_vk->dswrite[idx]; + *wds = (VkWriteDescriptorSet) { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = ds, + .dstBinding = desc->binding, + .descriptorCount = 1, + .descriptorType = dsType[desc->type], + }; + + static const VkAccessFlags2 storageAccess[PL_DESC_ACCESS_COUNT] = { + [PL_DESC_ACCESS_READONLY] = VK_ACCESS_2_SHADER_STORAGE_READ_BIT, + [PL_DESC_ACCESS_WRITEONLY] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + [PL_DESC_ACCESS_READWRITE] = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | + VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, + }; + + switch (desc->type) { 
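+    // Each descriptor type records the barrier needed to make the resource
+    // visible to the shader stage, and fills in the matching payload of the
+    // VkWriteDescriptorSet (image info, buffer info or texel buffer view)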
+ case PL_DESC_SAMPLED_TEX: { + pl_tex tex = db.object; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + vk_tex_barrier(gpu, cmd, tex, shaderStages[pass->params.type], + VK_ACCESS_2_SHADER_SAMPLED_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .sampler = p->samplers[db.sample_mode][db.address_mode], + .imageView = tex_vk->view, + .imageLayout = tex_vk->layout, + }; + + wds->pImageInfo = iinfo; + return; + } + case PL_DESC_STORAGE_IMG: { + pl_tex tex = db.object; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + vk_tex_barrier(gpu, cmd, tex, shaderStages[pass->params.type], + storageAccess[desc->access], VK_IMAGE_LAYOUT_GENERAL, + VK_QUEUE_FAMILY_IGNORED); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .imageView = tex_vk->view, + .imageLayout = tex_vk->layout, + }; + + wds->pImageInfo = iinfo; + return; + } + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: { + pl_buf buf = db.object; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + VkAccessFlags2 access = VK_ACCESS_2_UNIFORM_READ_BIT; + if (desc->type == PL_DESC_BUF_STORAGE) + access = storageAccess[desc->access]; + + vk_buf_barrier(gpu, cmd, buf, shaderStages[pass->params.type], + access, 0, buf->params.size, false); + + VkDescriptorBufferInfo *binfo = &pass_vk->dsbinfo[idx]; + *binfo = (VkDescriptorBufferInfo) { + .buffer = buf_vk->mem.buf, + .offset = buf_vk->mem.offset, + .range = buf->params.size, + }; + + wds->pBufferInfo = binfo; + return; + } + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: { + pl_buf buf = db.object; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + + VkAccessFlags2 access = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT; + if (desc->type == PL_DESC_BUF_TEXEL_STORAGE) + access = storageAccess[desc->access]; + + vk_buf_barrier(gpu, cmd, buf, shaderStages[pass->params.type], + access, 0, buf->params.size, false); + + wds->pTexelBufferView = &buf_vk->view; + return; + } + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void vk_release_descriptor(pl_gpu gpu, struct vk_cmd *cmd, pl_pass pass, + struct pl_desc_binding db, int idx) +{ + const struct pl_desc *desc = &pass->params.descriptors[idx]; + + switch (desc->type) { + case PL_DESC_BUF_UNIFORM: + case PL_DESC_BUF_STORAGE: + case PL_DESC_BUF_TEXEL_UNIFORM: + case PL_DESC_BUF_TEXEL_STORAGE: + if (desc->access != PL_DESC_ACCESS_READONLY) { + pl_buf buf = db.object; + vk_buf_flush(gpu, cmd, buf, 0, buf->params.size); + } + return; + case PL_DESC_SAMPLED_TEX: + case PL_DESC_STORAGE_IMG: + return; + case PL_DESC_INVALID: + case PL_DESC_TYPE_COUNT: + break; + } + + pl_unreachable(); +} + +static void set_ds(struct pl_pass_vk *pass_vk, void *dsbit) +{ + pass_vk->dmask |= (uintptr_t) dsbit; +} + +static bool need_respec(pl_pass pass, const struct pl_pass_run_params *params) +{ + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + if (!pass_vk->spec_size || !params->constant_data) + return false; + + VkSpecializationInfo *specInfo = &pass_vk->specInfo; + size_t size = pass_vk->spec_size; + if (!specInfo->pData) { + // Shader was never specialized before + specInfo->pData = pl_memdup((void *) pass, params->constant_data, size); + specInfo->dataSize = size; + return true; + } + + // Shader is being re-specialized with new values + if (memcmp(specInfo->pData, params->constant_data, size) != 0) { + memcpy((void *) specInfo->pData, params->constant_data, 
size); + return true; + } + + return false; +} + +void vk_pass_run(pl_gpu gpu, const struct pl_pass_run_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_pass pass = params->pass; + struct pl_pass_vk *pass_vk = PL_PRIV(pass); + + if (params->vertex_data || params->index_data) + return pl_pass_run_vbo(gpu, params); + + // Check if we need to re-specialize this pipeline + if (need_respec(pass, params)) { + pl_clock_t start = pl_clock_now(); + VK(vk_recreate_pipelines(vk, pass, false, pass_vk->base, &pass_vk->pipe)); + pl_log_cpu_time(gpu->log, start, pl_clock_now(), "re-specializing shader"); + } + + if (!pass_vk->use_pushd) { + // Wait for a free descriptor set + while (!pass_vk->dmask) { + PL_TRACE(gpu, "No free descriptor sets! ...blocking (slow path)"); + vk_poll_commands(vk, 10000000); // 10 ms + } + } + + static const enum queue_type types[] = { + [PL_PASS_RASTER] = GRAPHICS, + [PL_PASS_COMPUTE] = COMPUTE, + }; + + struct vk_cmd *cmd = CMD_BEGIN_TIMED(types[pass->params.type], params->timer); + if (!cmd) + goto error; + + // Find a descriptor set to use + VkDescriptorSet ds = VK_NULL_HANDLE; + if (!pass_vk->use_pushd) { + for (int i = 0; i < PL_ARRAY_SIZE(pass_vk->dss); i++) { + uint16_t dsbit = 1u << i; + if (pass_vk->dmask & dsbit) { + ds = pass_vk->dss[i]; + pass_vk->dmask &= ~dsbit; // unset + vk_cmd_callback(cmd, (vk_cb) set_ds, pass_vk, + (void *)(uintptr_t) dsbit); + break; + } + } + } + + // Update the dswrite structure with all of the new values + for (int i = 0; i < pass->params.num_descriptors; i++) + vk_update_descriptor(gpu, cmd, pass, params->desc_bindings[i], ds, i); + + if (!pass_vk->use_pushd) { + vk->UpdateDescriptorSets(vk->dev, pass->params.num_descriptors, + pass_vk->dswrite, 0, NULL); + } + + // Bind the pipeline, descriptor set, etc. + static const VkPipelineBindPoint bindPoint[] = { + [PL_PASS_RASTER] = VK_PIPELINE_BIND_POINT_GRAPHICS, + [PL_PASS_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE, + }; + + vk->CmdBindPipeline(cmd->buf, bindPoint[pass->params.type], + PL_DEF(pass_vk->pipe, pass_vk->base)); + + if (ds) { + vk->CmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type], + pass_vk->pipeLayout, 0, 1, &ds, 0, NULL); + } + + if (pass_vk->use_pushd) { + vk->CmdPushDescriptorSetKHR(cmd->buf, bindPoint[pass->params.type], + pass_vk->pipeLayout, 0, + pass->params.num_descriptors, + pass_vk->dswrite); + } + + if (pass->params.push_constants_size) { + vk->CmdPushConstants(cmd->buf, pass_vk->pipeLayout, + stageFlags[pass->params.type], 0, + pass->params.push_constants_size, + params->push_constants); + } + + switch (pass->params.type) { + case PL_PASS_RASTER: { + pl_tex tex = params->target; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + pl_buf vert = params->vertex_buf; + struct pl_buf_vk *vert_vk = PL_PRIV(vert); + pl_buf index = params->index_buf; + struct pl_buf_vk *index_vk = index ? 
PL_PRIV(index) : NULL; + pl_assert(vert); + + // In the edge case that vert = index buffer, we need to synchronize + // for both flags simultaneously + VkPipelineStageFlags2 vbo_stage = VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT; + VkAccessFlags2 vbo_flags = VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT; + if (index == vert) { + vbo_stage |= VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT; + vbo_flags |= VK_ACCESS_2_INDEX_READ_BIT; + } + + vk_buf_barrier(gpu, cmd, vert, vbo_stage, vbo_flags, 0, vert->params.size, false); + + VkDeviceSize offset = vert_vk->mem.offset + params->buf_offset; + vk->CmdBindVertexBuffers(cmd->buf, 0, 1, &vert_vk->mem.buf, &offset); + + if (index) { + if (index != vert) { + vk_buf_barrier(gpu, cmd, index, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT, + VK_ACCESS_2_INDEX_READ_BIT, 0, index->params.size, + false); + } + + static const VkIndexType index_fmts[PL_INDEX_FORMAT_COUNT] = { + [PL_INDEX_UINT16] = VK_INDEX_TYPE_UINT16, + [PL_INDEX_UINT32] = VK_INDEX_TYPE_UINT32, + }; + + vk->CmdBindIndexBuffer(cmd->buf, index_vk->mem.buf, + index_vk->mem.offset + params->index_offset, + index_fmts[params->index_fmt]); + } + + + VkAccessFlags2 fbo_access = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT; + if (pass->params.load_target) + fbo_access |= VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT; + + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, + fbo_access, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + VkViewport viewport = { + .x = params->viewport.x0, + .y = params->viewport.y0, + .width = pl_rect_w(params->viewport), + .height = pl_rect_h(params->viewport), + }; + + VkRect2D scissor = { + .offset = {params->scissors.x0, params->scissors.y0}, + .extent = {pl_rect_w(params->scissors), pl_rect_h(params->scissors)}, + }; + + vk->CmdSetViewport(cmd->buf, 0, 1, &viewport); + vk->CmdSetScissor(cmd->buf, 0, 1, &scissor); + + VkRenderPassBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = pass_vk->renderPass, + .framebuffer = tex_vk->framebuffer, + .renderArea.extent = {tex->params.w, tex->params.h}, + }; + + vk->CmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE); + + if (index) { + vk->CmdDrawIndexed(cmd->buf, params->vertex_count, 1, 0, 0, 0); + } else { + vk->CmdDraw(cmd->buf, params->vertex_count, 1, 0, 0); + } + + vk->CmdEndRenderPass(cmd->buf); + break; + } + case PL_PASS_COMPUTE: + vk->CmdDispatch(cmd->buf, params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + break; + case PL_PASS_INVALID: + case PL_PASS_TYPE_COUNT: + pl_unreachable(); + }; + + for (int i = 0; i < pass->params.num_descriptors; i++) + vk_release_descriptor(gpu, cmd, pass, params->desc_bindings[i], i); + + // submit this command buffer for better intra-frame granularity + CMD_SUBMIT(&cmd); + +error: + return; +} diff --git a/src/vulkan/gpu_tex.c b/src/vulkan/gpu_tex.c new file mode 100644 index 0000000..7ab83b7 --- /dev/null +++ b/src/vulkan/gpu_tex.c @@ -0,0 +1,1453 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "gpu.h" + +void vk_tex_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_tex tex, + VkPipelineStageFlags2 stage, VkAccessFlags2 access, + VkImageLayout layout, uint32_t qf) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + pl_rc_ref(&tex_vk->rc); + pl_assert(!tex_vk->held); + pl_assert(!tex_vk->num_planes); + + // CONCURRENT images require transitioning to/from IGNORED, EXCLUSIVE + // images require transitioning to/from the concrete QF index + if (vk->pools.num == 1) { + if (tex_vk->qf == VK_QUEUE_FAMILY_IGNORED) + tex_vk->qf = cmd->pool->qf; + if (qf == VK_QUEUE_FAMILY_IGNORED) + qf = cmd->pool->qf; + } + + struct vk_sync_scope last; + bool is_trans = layout != tex_vk->layout, is_xfer = qf != tex_vk->qf; + last = vk_sem_barrier(cmd, &tex_vk->sem, stage, access, is_trans || is_xfer); + + VkImageMemoryBarrier2 barr = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2, + .srcStageMask = last.stage, + .srcAccessMask = last.access, + .dstStageMask = stage, + .dstAccessMask = access, + .oldLayout = tex_vk->layout, + .newLayout = layout, + .srcQueueFamilyIndex = tex_vk->qf, + .dstQueueFamilyIndex = qf, + .image = tex_vk->img, + .subresourceRange = { + .aspectMask = tex_vk->aspect, + .levelCount = 1, + .layerCount = 1, + }, + }; + + if (tex_vk->may_invalidate) { + tex_vk->may_invalidate = false; + barr.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + } + + if (last.access || is_trans || is_xfer) { + vk_cmd_barrier(cmd, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .imageMemoryBarrierCount = 1, + .pImageMemoryBarriers = &barr, + }); + } + + tex_vk->qf = qf; + tex_vk->layout = layout; + vk_cmd_callback(cmd, (vk_cb) vk_tex_deref, gpu, tex); + + for (int i = 0; i < tex_vk->ext_deps.num; i++) + vk_cmd_dep(cmd, stage, tex_vk->ext_deps.elem[i]); + tex_vk->ext_deps.num = 0; + + if (tex_vk->ext_sync) { + vk_cmd_callback(cmd, (vk_cb) vk_sync_deref, gpu, tex_vk->ext_sync); + tex_vk->ext_sync = NULL; + } +} + +static void vk_tex_destroy(pl_gpu gpu, struct pl_tex_t *tex) +{ + if (!tex) + return; + + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + vk_sync_deref(gpu, tex_vk->ext_sync); + vk->DestroyFramebuffer(vk->dev, tex_vk->framebuffer, PL_VK_ALLOC); + vk->DestroyImageView(vk->dev, tex_vk->view, PL_VK_ALLOC); + for (int i = 0; i < tex_vk->num_planes; i++) + vk_tex_deref(gpu, tex->planes[i]); + if (!tex_vk->external_img) { + vk->DestroyImage(vk->dev, tex_vk->img, PL_VK_ALLOC); + vk_malloc_free(vk->ma, &tex_vk->mem); + } + + pl_free(tex); +} + +void vk_tex_deref(pl_gpu gpu, pl_tex tex) +{ + if (!tex) + return; + + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + if (pl_rc_deref(&tex_vk->rc)) + vk_tex_destroy(gpu, (struct pl_tex_t *) tex); +} + + +// Initializes non-VkImage values like the image view, framebuffers, etc. 
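+// For planar parent images this only names the image and initializes the
+// refcount; per-plane views and framebuffers are created when each plane is
+// wrapped individually.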
+static bool vk_init_image(pl_gpu gpu, pl_tex tex, pl_debug_tag debug_tag) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + const struct pl_tex_params *params = &tex->params; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + pl_assert(tex_vk->img); + PL_VK_NAME(IMAGE, tex_vk->img, debug_tag); + pl_rc_init(&tex_vk->rc); + if (tex_vk->num_planes) + return true; + tex_vk->layout = VK_IMAGE_LAYOUT_UNDEFINED; + tex_vk->transfer_queue = GRAPHICS; + tex_vk->qf = VK_QUEUE_FAMILY_IGNORED; // will be set on first use, if needed + + // Always use the transfer pool if available, for efficiency + if ((params->host_writable || params->host_readable) && vk->pool_transfer) + tex_vk->transfer_queue = TRANSFER; + + // For emulated formats: force usage of the compute queue, because we + // can't properly track cross-queue dependencies for buffers (yet?) + if (params->format->emulated) + tex_vk->transfer_queue = COMPUTE; + + bool ret = false; + VkRenderPass dummyPass = VK_NULL_HANDLE; + + if (params->sampleable || params->renderable || params->storable) { + static const VkImageViewType viewType[] = { + [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D, + [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D, + [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D, + }; + + const VkImageViewCreateInfo vinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = tex_vk->img, + .viewType = viewType[tex_vk->type], + .format = tex_vk->img_fmt, + .subresourceRange = { + .aspectMask = tex_vk->aspect, + .levelCount = 1, + .layerCount = 1, + }, + }; + + VK(vk->CreateImageView(vk->dev, &vinfo, PL_VK_ALLOC, &tex_vk->view)); + PL_VK_NAME(IMAGE_VIEW, tex_vk->view, debug_tag); + } + + if (params->renderable) { + // Framebuffers need to be created against a specific render pass + // layout, so we need to temporarily create a skeleton/dummy render + // pass for vulkan to figure out the compatibility + VkRenderPassCreateInfo rinfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = tex_vk->img_fmt, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + }, + }; + + VK(vk->CreateRenderPass(vk->dev, &rinfo, PL_VK_ALLOC, &dummyPass)); + + VkFramebufferCreateInfo finfo = { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .renderPass = dummyPass, + .attachmentCount = 1, + .pAttachments = &tex_vk->view, + .width = tex->params.w, + .height = tex->params.h, + .layers = 1, + }; + + if (finfo.width > vk->props.limits.maxFramebufferWidth || + finfo.height > vk->props.limits.maxFramebufferHeight) + { + PL_ERR(gpu, "Framebuffer of size %dx%d exceeds the maximum allowed " + "dimensions: %dx%d", finfo.width, finfo.height, + vk->props.limits.maxFramebufferWidth, + vk->props.limits.maxFramebufferHeight); + goto error; + } + + VK(vk->CreateFramebuffer(vk->dev, &finfo, PL_VK_ALLOC, + &tex_vk->framebuffer)); + PL_VK_NAME(FRAMEBUFFER, tex_vk->framebuffer, debug_tag); + } + + ret = true; + +error: + vk->DestroyRenderPass(vk->dev, dummyPass, PL_VK_ALLOC); + return ret; +} + +pl_tex vk_tex_create(pl_gpu gpu, 
const struct pl_tex_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + enum pl_handle_type handle_type = params->export_handle | + params->import_handle; + VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type = vk_mem_handle_type(handle_type); + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk); + pl_fmt fmt = params->format; + tex->params = *params; + tex->params.initial_data = NULL; + tex->sampler_type = PL_SAMPLER_NORMAL; + + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_fmt_vk *fmtp = PL_PRIV(fmt); + tex_vk->img_fmt = fmtp->vk_fmt->tfmt; + tex_vk->num_planes = fmt->num_planes; + for (int i = 0; i < tex_vk->num_planes; i++) + tex_vk->aspect |= VK_IMAGE_ASPECT_PLANE_0_BIT << i; + tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT); + + switch (pl_tex_params_dimension(*params)) { + case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break; + case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break; + case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break; + } + + if (fmt->emulated) { + tex_vk->texel_fmt = pl_find_fmt(gpu, fmt->type, 1, 0, + fmt->host_bits[0], + PL_FMT_CAP_TEXEL_UNIFORM); + if (!tex_vk->texel_fmt) { + PL_ERR(gpu, "Failed picking texel format for emulated texture!"); + goto error; + } + + // Our format emulation requires storage image support. In order to + // make a bunch of checks happy, just mark it off as storable (and also + // enable VK_IMAGE_USAGE_STORAGE_BIT, which we do below) + tex->params.storable = true; + } + + if (fmtp->blit_emulated) { + // Enable what's required for sampling + tex->params.sampleable = fmt->caps & PL_FMT_CAP_SAMPLEABLE; + tex->params.storable = true; + } + + // Blit emulation on planar textures requires storage + if ((params->blit_src || params->blit_dst) && tex_vk->num_planes) + tex->params.storable = true; + + VkImageUsageFlags usage = 0; + VkImageCreateFlags flags = 0; + if (tex->params.sampleable) + usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (tex->params.renderable) + usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + if (tex->params.storable) + usage |= VK_IMAGE_USAGE_STORAGE_BIT; + if (tex->params.host_readable || tex->params.blit_src) + usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + if (tex->params.host_writable || tex->params.blit_dst || params->initial_data) + usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; + + if (!usage) { + // Vulkan requires images have at least *some* image usage set, but our + // API is perfectly happy with a (useless) image. So just put + // VK_IMAGE_USAGE_TRANSFER_DST_BIT since this harmless. + usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT; + } + + if (tex_vk->num_planes) { + flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT | + VK_IMAGE_CREATE_EXTENDED_USAGE_BIT; + } + + // FIXME: Since we can't keep track of queue family ownership properly, + // and we don't know in advance what types of queue families this image + // will belong to, we're forced to share all of our images between all + // command pools. + uint32_t qfs[3] = {0}; + pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs)); + for (int i = 0; i < vk->pools.num; i++) + qfs[i] = vk->pools.elem[i]->qf; + + VkImageDrmFormatModifierExplicitCreateInfoEXT drm_explicit = { + .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT, + .drmFormatModifier = params->shared_mem.drm_format_mod, + .drmFormatModifierPlaneCount = 1, + .pPlaneLayouts = &(VkSubresourceLayout) { + .rowPitch = PL_DEF(params->shared_mem.stride_w, params->w), + .depthPitch = params->d ? 
PL_DEF(params->shared_mem.stride_h, params->h) : 0, + .offset = params->shared_mem.offset, + }, + }; + +#ifdef VK_EXT_metal_objects + VkImportMetalTextureInfoEXT import_metal_tex = { + .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_TEXTURE_INFO_EXT, + .plane = VK_IMAGE_ASPECT_PLANE_0_BIT << params->shared_mem.plane, + }; + + VkImportMetalIOSurfaceInfoEXT import_iosurface = { + .sType = VK_STRUCTURE_TYPE_IMPORT_METAL_IO_SURFACE_INFO_EXT, + }; +#endif + + VkImageDrmFormatModifierListCreateInfoEXT drm_list = { + .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT, + .drmFormatModifierCount = fmt->num_modifiers, + .pDrmFormatModifiers = fmt->modifiers, + }; + + VkExternalMemoryImageCreateInfoKHR ext_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR, + .handleTypes = vk_handle_type, + }; + + VkImageCreateInfo iinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .pNext = vk_handle_type ? &ext_info : NULL, + .imageType = tex_vk->type, + .format = tex_vk->img_fmt, + .extent = (VkExtent3D) { + .width = params->w, + .height = PL_MAX(1, params->h), + .depth = PL_MAX(1, params->d) + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = usage, + .flags = flags, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .sharingMode = vk->pools.num > 1 ? VK_SHARING_MODE_CONCURRENT + : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = vk->pools.num, + .pQueueFamilyIndices = qfs, + }; + + struct vk_malloc_params mparams = { + .optimal = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + .export_handle = params->export_handle, + .import_handle = params->import_handle, + .shared_mem = params->shared_mem, + .debug_tag = params->debug_tag, + }; + + if (params->import_handle == PL_HANDLE_DMA_BUF) { + vk_link_struct(&iinfo, &drm_explicit); + iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; + mparams.shared_mem.offset = 0x0; // handled via plane offsets + } + +#ifdef VK_EXT_metal_objects + if (params->import_handle == PL_HANDLE_MTL_TEX) { + vk_link_struct(&iinfo, &import_metal_tex); + import_metal_tex.mtlTexture = params->shared_mem.handle.handle; + } + + if (params->import_handle == PL_HANDLE_IOSURFACE) { + vk_link_struct(&iinfo, &import_iosurface); + import_iosurface.ioSurface = params->shared_mem.handle.handle; + } +#endif + + if (params->export_handle == PL_HANDLE_DMA_BUF) { + pl_assert(drm_list.drmFormatModifierCount > 0); + vk_link_struct(&iinfo, &drm_list); + iinfo.tiling = VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT; + } + + // Double-check physical image format limits and fail if invalid + VkPhysicalDeviceImageDrmFormatModifierInfoEXT drm_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT, + .sharingMode = iinfo.sharingMode, + .queueFamilyIndexCount = iinfo.queueFamilyIndexCount, + .pQueueFamilyIndices = iinfo.pQueueFamilyIndices, + }; + + VkPhysicalDeviceExternalImageFormatInfoKHR ext_pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR, + .handleType = ext_info.handleTypes, + }; + + if (handle_type == PL_HANDLE_DMA_BUF) { + if (params->import_handle) { + // On import, we know exactly which format modifier to test + drm_pinfo.drmFormatModifier = drm_explicit.drmFormatModifier; + } else { + // On export, the choice of format modifier is ambiguous, because + // we offer the implementation a whole list to choose from. 
In + // principle, we must check *all* supported drm format modifiers, + // but in practice it should hopefully suffice to just check one + drm_pinfo.drmFormatModifier = drm_list.pDrmFormatModifiers[0]; + } + vk_link_struct(&ext_pinfo, &drm_pinfo); + } + + VkPhysicalDeviceImageFormatInfo2KHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR, + .pNext = vk_handle_type ? &ext_pinfo : NULL, + .format = iinfo.format, + .type = iinfo.imageType, + .tiling = iinfo.tiling, + .usage = iinfo.usage, + .flags = iinfo.flags, + }; + + VkExternalImageFormatPropertiesKHR ext_props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR, + }; + + VkImageFormatProperties2KHR props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR, + .pNext = vk_handle_type ? &ext_props : NULL, + }; + + VkResult res; + res = vk->GetPhysicalDeviceImageFormatProperties2KHR(vk->physd, &pinfo, &props); + if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) { + PL_DEBUG(gpu, "Texture creation failed: not supported"); + goto error; + } else { + PL_VK_ASSERT(res, "Querying image format properties"); + } + + VkExtent3D max = props.imageFormatProperties.maxExtent; + if (params->w > max.width || params->h > max.height || params->d > max.depth) + { + PL_ERR(gpu, "Requested image size %dx%dx%d exceeds the maximum allowed " + "dimensions %dx%dx%d for vulkan image format %x", + params->w, params->h, params->d, max.width, max.height, max.depth, + (unsigned) iinfo.format); + goto error; + } + + // Ensure the handle type is supported + if (vk_handle_type) { + bool ok = vk_external_mem_check(vk, &ext_props.externalMemoryProperties, + handle_type, params->import_handle); + if (!ok) { + PL_ERR(gpu, "Requested handle type is not compatible with the " + "specified combination of image parameters. Possibly the " + "handle type is unsupported altogether?"); + goto error; + } + } + + VK(vk->CreateImage(vk->dev, &iinfo, PL_VK_ALLOC, &tex_vk->img)); + tex_vk->usage_flags = iinfo.usage; + + VkMemoryDedicatedRequirements ded_reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR, + }; + + VkMemoryRequirements2 reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR, + .pNext = &ded_reqs, + }; + + VkImageMemoryRequirementsInfo2 req_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR, + .image = tex_vk->img, + }; + + vk->GetImageMemoryRequirements2(vk->dev, &req_info, &reqs); + mparams.reqs = reqs.memoryRequirements; + if (ded_reqs.prefersDedicatedAllocation) { + mparams.ded_image = tex_vk->img; + if (vk_mem_handle_type(params->import_handle)) + mparams.shared_mem.size = reqs.memoryRequirements.size; + } + + const char *debug_tag = params->debug_tag ? params->debug_tag : + params->import_handle ? 
"imported" : "created"; + + if (!params->import_handle || vk_mem_handle_type(params->import_handle)) { + struct vk_memslice *mem = &tex_vk->mem; + if (!vk_malloc_slice(vk->ma, mem, &mparams)) + goto error; + + VK(vk->BindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset)); + } + + static const char * const plane_names[4] = { + "plane 0", "plane 1", "plane 2", "plane 3", + }; + + if (tex_vk->num_planes) { + for (int i = 0; i < tex_vk->num_planes; i++) { + struct pl_tex_t *plane; + + pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D); + plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params( + .image = tex_vk->img, + .aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << i, + .width = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x), + .height = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y), + .format = fmtp->vk_fmt->pfmt[i].fmt, + .usage = usage, + .user_data = params->user_data, + .debug_tag = PL_DEF(params->debug_tag, plane_names[i]), + )); + if (!plane) + goto error; + plane->parent = tex; + tex->planes[i] = plane; + tex_vk->planes[i] = PL_PRIV(plane); + tex_vk->planes[i]->held = false; + tex_vk->planes[i]->layout = tex_vk->layout; + } + + // Explicitly mask out all usage flags from planar parent images + pl_assert(!fmt->caps); + tex->params.sampleable = false; + tex->params.renderable = false; + tex->params.storable = false; + tex->params.blit_src = false; + tex->params.blit_dst = false; + tex->params.host_writable = false; + tex->params.host_readable = false; + } + + if (!vk_init_image(gpu, tex, debug_tag)) + goto error; + + if (params->export_handle) + tex->shared_mem = tex_vk->mem.shared_mem; + + if (params->export_handle == PL_HANDLE_DMA_BUF) { + if (vk->GetImageDrmFormatModifierPropertiesEXT) { + + // Query the DRM format modifier and plane layout from the driver + VkImageDrmFormatModifierPropertiesEXT mod_props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT, + }; + + VK(vk->GetImageDrmFormatModifierPropertiesEXT(vk->dev, tex_vk->img, &mod_props)); + tex->shared_mem.drm_format_mod = mod_props.drmFormatModifier; + + VkSubresourceLayout layout = {0}; + VkImageSubresource plane = { + .aspectMask = VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT, + }; + + vk->GetImageSubresourceLayout(vk->dev, tex_vk->img, &plane, &layout); + if (layout.offset != 0) { + PL_ERR(gpu, "Exported DRM plane 0 has nonzero offset %zu, " + "this should never happen! Erroring for safety...", + (size_t) layout.offset); + goto error; + } + tex->shared_mem.stride_w = layout.rowPitch; + tex->shared_mem.stride_h = layout.depthPitch; + + } else { + + // Fallback for no modifiers, just do something stupid. 
+ tex->shared_mem.drm_format_mod = DRM_FORMAT_MOD_INVALID; + tex->shared_mem.stride_w = params->w; + tex->shared_mem.stride_h = params->h; + + } + } + + if (params->initial_data) { + struct pl_tex_transfer_params ul_params = { + .tex = tex, + .ptr = (void *) params->initial_data, + .rc = { 0, 0, 0, params->w, params->h, params->d }, + }; + + // Since we re-use GPU helpers which require writable images, just fake it + bool writable = tex->params.host_writable; + tex->params.host_writable = true; + if (!pl_tex_upload(gpu, &ul_params)) + goto error; + tex->params.host_writable = writable; + } + + return tex; + +error: + vk_tex_destroy(gpu, tex); + return NULL; +} + +void vk_tex_invalidate(pl_gpu gpu, pl_tex tex) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + tex_vk->may_invalidate = true; + for (int i = 0; i < tex_vk->num_planes; i++) + tex_vk->planes[i]->may_invalidate = true; +} + +static bool tex_clear_fallback(pl_gpu gpu, pl_tex tex, + const union pl_clear_color color) +{ + pl_tex pixel = pl_tex_create(gpu, pl_tex_params( + .w = 1, + .h = 1, + .format = tex->params.format, + .storable = true, + .blit_src = true, + .blit_dst = true, + )); + if (!pixel) + return false; + + pl_tex_clear_ex(gpu, pixel, color); + + pl_assert(tex->params.storable); + pl_tex_blit(gpu, pl_tex_blit_params( + .src = pixel, + .dst = tex, + .sample_mode = PL_TEX_SAMPLE_NEAREST, + )); + + pl_tex_destroy(gpu, &pixel); + return true; +} + +void vk_tex_clear_ex(pl_gpu gpu, pl_tex tex, const union pl_clear_color color) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) { + if (!tex_clear_fallback(gpu, tex, color)) { + PL_ERR(gpu, "Failed clearing imported planar image: color aspect " + "clears disallowed by spec and no shader fallback " + "available"); + } + return; + } + + struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS); + if (!cmd) + return; + + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_CLEAR_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + pl_static_assert(sizeof(VkClearColorValue) == sizeof(union pl_clear_color)); + const VkClearColorValue *clearColor = (const VkClearColorValue *) &color; + + pl_assert(tex_vk->aspect == VK_IMAGE_ASPECT_COLOR_BIT); + static const VkImageSubresourceRange range = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, + }; + + vk->CmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->layout, + clearColor, 1, &range); + + CMD_FINISH(&cmd); +} + +void vk_tex_blit(pl_gpu gpu, const struct pl_tex_blit_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *src_vk = PL_PRIV(params->src); + struct pl_tex_vk *dst_vk = PL_PRIV(params->dst); + struct pl_fmt_vk *src_fmtp = PL_PRIV(params->src->params.format); + struct pl_fmt_vk *dst_fmtp = PL_PRIV(params->dst->params.format); + bool blit_emulated = src_fmtp->blit_emulated || dst_fmtp->blit_emulated; + bool planar_fallback = src_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT || + dst_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT; + + pl_rect3d src_rc = params->src_rc, dst_rc = params->dst_rc; + bool requires_scaling = !pl_rect3d_eq(src_rc, dst_rc); + if ((requires_scaling && blit_emulated) || planar_fallback) { + if (!pl_tex_blit_compute(gpu, params)) + PL_ERR(gpu, "Failed emulating texture blit, incompatible textures?"); + return; + } + + struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS); + if (!cmd) + return; + + // When the 
blit operation doesn't require scaling, we can use the more + // efficient vkCmdCopyImage instead of vkCmdBlitImage + if (!requires_scaling) { + vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + pl_rect3d_normalize(&src_rc); + + VkImageCopy region = { + .srcSubresource = { + .aspectMask = src_vk->aspect, + .layerCount = 1, + }, + .dstSubresource = { + .aspectMask = dst_vk->aspect, + .layerCount = 1, + }, + .srcOffset = {src_rc.x0, src_rc.y0, src_rc.z0}, + .dstOffset = {src_rc.x0, src_rc.y0, src_rc.z0}, + .extent = { + pl_rect_w(src_rc), + pl_rect_h(src_rc), + pl_rect_d(src_rc), + }, + }; + + vk->CmdCopyImage(cmd->buf, src_vk->img, src_vk->layout, + dst_vk->img, dst_vk->layout, 1, ®ion); + } else { + vk_tex_barrier(gpu, cmd, params->src, VK_PIPELINE_STAGE_2_BLIT_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + vk_tex_barrier(gpu, cmd, params->dst, VK_PIPELINE_STAGE_2_BLIT_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + + VkImageBlit region = { + .srcSubresource = { + .aspectMask = src_vk->aspect, + .layerCount = 1, + }, + .dstSubresource = { + .aspectMask = dst_vk->aspect, + .layerCount = 1, + }, + .srcOffsets = {{src_rc.x0, src_rc.y0, src_rc.z0}, + {src_rc.x1, src_rc.y1, src_rc.z1}}, + .dstOffsets = {{dst_rc.x0, dst_rc.y0, dst_rc.z0}, + {dst_rc.x1, dst_rc.y1, dst_rc.z1}}, + }; + + static const VkFilter filters[PL_TEX_SAMPLE_MODE_COUNT] = { + [PL_TEX_SAMPLE_NEAREST] = VK_FILTER_NEAREST, + [PL_TEX_SAMPLE_LINEAR] = VK_FILTER_LINEAR, + }; + + vk->CmdBlitImage(cmd->buf, src_vk->img, src_vk->layout, + dst_vk->img, dst_vk->layout, 1, ®ion, + filters[params->sample_mode]); + } + + CMD_FINISH(&cmd); +} + +// Determine the best queue type to perform a buffer<->image copy on +static enum queue_type vk_img_copy_queue(pl_gpu gpu, pl_tex tex, + const struct VkBufferImageCopy *region) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + + const struct pl_tex_vk *tex_vk = PL_PRIV(tex); + enum queue_type queue = tex_vk->transfer_queue; + if (queue != TRANSFER) + return queue; + + VkExtent3D alignment = vk->pool_transfer->props.minImageTransferGranularity; + + enum queue_type fallback = GRAPHICS; + if (gpu->limits.compute_queues > gpu->limits.fragment_queues) + fallback = COMPUTE; // prefer async compute queue + + int tex_w = PL_DEF(tex->params.w, 1), + tex_h = PL_DEF(tex->params.h, 1), + tex_d = PL_DEF(tex->params.d, 1); + + bool full_w = region->imageOffset.x + region->imageExtent.width == tex_w, + full_h = region->imageOffset.y + region->imageExtent.height == tex_h, + full_d = region->imageOffset.z + region->imageExtent.depth == tex_d; + + if (alignment.width) { + + bool unaligned = false; + unaligned |= region->imageOffset.x % alignment.width; + unaligned |= region->imageOffset.y % alignment.height; + unaligned |= region->imageOffset.z % alignment.depth; + unaligned |= (region->imageExtent.width % alignment.width) && !full_w; + unaligned |= (region->imageExtent.height % alignment.height) && !full_h; + unaligned |= (region->imageExtent.depth % alignment.depth) && !full_d; + + return unaligned ? 
fallback : queue; + + } else { + + // an alignment of {0} means the copy must span the entire image + bool unaligned = false; + unaligned |= region->imageOffset.x || !full_w; + unaligned |= region->imageOffset.y || !full_h; + unaligned |= region->imageOffset.z || !full_d; + + return unaligned ? fallback : queue; + + } +} + +static void tex_xfer_cb(void *ctx, void *arg) +{ + void (*fun)(void *priv) = ctx; + fun(arg); +} + +bool vk_tex_upload(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_tex_transfer_params *slices = NULL; + int num_slices = 0; + + if (!params->buf) + return pl_tex_upload_pbo(gpu, params); + + pl_buf buf = params->buf; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_rect3d rc = params->rc; + const size_t size = pl_tex_transfer_size(params); + const size_t buf_offset = buf_vk->mem.offset + params->buf_offset; + bool unaligned = buf_offset % fmt->texel_size; + if (unaligned) + PL_TRACE(gpu, "vk_tex_upload: unaligned transfer (slow path)"); + + if (fmt->emulated || unaligned) { + + // Create all slice buffers first, to early-fail if OOM, and to avoid + // blocking unnecessarily on waiting for these buffers to get read from + num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices); + for (int i = 0; i < num_slices; i++) { + slices[i].buf = pl_buf_create(gpu, pl_buf_params( + .memory_type = PL_BUF_MEM_DEVICE, + .format = tex_vk->texel_fmt, + .size = pl_tex_transfer_size(&slices[i]), + .storable = fmt->emulated, + )); + + if (!slices[i].buf) { + PL_ERR(gpu, "Failed creating buffer for tex upload fallback!"); + num_slices = i; // only clean up buffers up to here + goto error; + } + } + + // All temporary buffers successfully created, begin copying source data + struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue, + params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size, + false); + + for (int i = 0; i < num_slices; i++) { + pl_buf slice = slices[i].buf; + struct pl_buf_vk *slice_vk = PL_PRIV(slice); + vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, 0, slice->params.size, + false); + + vk->CmdCopyBuffer(cmd->buf, buf_vk->mem.buf, slice_vk->mem.buf, 1, &(VkBufferCopy) { + .srcOffset = buf_vk->mem.offset + slices[i].buf_offset, + .dstOffset = slice_vk->mem.offset, + .size = slice->params.size, + }); + } + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + bool ok = CMD_FINISH(&cmd); + + // Finally, dispatch the (texel) upload asynchronously. We can fire + // the callback already at the completion of previous command because + // these temporary buffers already hold persistent copies of the data + for (int i = 0; i < num_slices; i++) { + if (ok) { + slices[i].buf_offset = 0; + ok = fmt->emulated ? 
pl_tex_upload_texel(gpu, &slices[i]) + : pl_tex_upload(gpu, &slices[i]); + } + pl_buf_destroy(gpu, &slices[i].buf); + } + + pl_free(slices); + return ok; + + } else { + + pl_assert(fmt->texel_align == fmt->texel_size); + const VkBufferImageCopy region = { + .bufferOffset = buf_offset, + .bufferRowLength = params->row_pitch / fmt->texel_size, + .bufferImageHeight = params->depth_pitch / params->row_pitch, + .imageOffset = { rc.x0, rc.y0, rc.z0 }, + .imageExtent = { rc.x1, rc.y1, rc.z1 }, + .imageSubresource = { + .aspectMask = tex_vk->aspect, + .layerCount = 1, + }, + }; + + enum queue_type queue = vk_img_copy_queue(gpu, tex, ®ion); + struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, params->buf_offset, size, + false); + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + vk->CmdCopyBufferToImage(cmd->buf, buf_vk->mem.buf, tex_vk->img, + tex_vk->layout, 1, ®ion); + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + return CMD_FINISH(&cmd); + } + + pl_unreachable(); + +error: + for (int i = 0; i < num_slices; i++) + pl_buf_destroy(gpu, &slices[i].buf); + pl_free(slices); + return false; +} + +bool vk_tex_download(pl_gpu gpu, const struct pl_tex_transfer_params *params) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + pl_tex tex = params->tex; + pl_fmt fmt = tex->params.format; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_tex_transfer_params *slices = NULL; + int num_slices = 0; + + if (!params->buf) + return pl_tex_download_pbo(gpu, params); + + pl_buf buf = params->buf; + struct pl_buf_vk *buf_vk = PL_PRIV(buf); + pl_rect3d rc = params->rc; + const size_t size = pl_tex_transfer_size(params); + const size_t buf_offset = buf_vk->mem.offset + params->buf_offset; + bool unaligned = buf_offset % fmt->texel_size; + if (unaligned) + PL_TRACE(gpu, "vk_tex_download: unaligned transfer (slow path)"); + + if (fmt->emulated || unaligned) { + + num_slices = pl_tex_transfer_slices(gpu, tex_vk->texel_fmt, params, &slices); + for (int i = 0; i < num_slices; i++) { + slices[i].buf = pl_buf_create(gpu, pl_buf_params( + .memory_type = PL_BUF_MEM_DEVICE, + .format = tex_vk->texel_fmt, + .size = pl_tex_transfer_size(&slices[i]), + .storable = fmt->emulated, + )); + + if (!slices[i].buf) { + PL_ERR(gpu, "Failed creating buffer for tex download fallback!"); + num_slices = i; + goto error; + } + } + + for (int i = 0; i < num_slices; i++) { + // Restore buffer offset after downloading into temporary buffer, + // because we still need to copy the data from the temporary buffer + // into this offset in the original buffer + const size_t tmp_offset = slices[i].buf_offset; + slices[i].buf_offset = 0; + bool ok = fmt->emulated ? 
pl_tex_download_texel(gpu, &slices[i]) + : pl_tex_download(gpu, &slices[i]); + slices[i].buf_offset = tmp_offset; + if (!ok) + goto error; + } + + // Finally, download into the user buffer + struct vk_cmd *cmd = CMD_BEGIN_TIMED(tex_vk->transfer_queue, params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size, + false); + + for (int i = 0; i < num_slices; i++) { + pl_buf slice = slices[i].buf; + struct pl_buf_vk *slice_vk = PL_PRIV(slice); + vk_buf_barrier(gpu, cmd, slice, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, 0, slice->params.size, + false); + + vk->CmdCopyBuffer(cmd->buf, slice_vk->mem.buf, buf_vk->mem.buf, 1, &(VkBufferCopy) { + .srcOffset = slice_vk->mem.offset, + .dstOffset = buf_vk->mem.offset + slices[i].buf_offset, + .size = slice->params.size, + }); + + pl_buf_destroy(gpu, &slices[i].buf); + } + + vk_buf_flush(gpu, cmd, buf, params->buf_offset, size); + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + pl_free(slices); + return CMD_FINISH(&cmd); + + } else { + + pl_assert(params->row_pitch % fmt->texel_size == 0); + pl_assert(params->depth_pitch % params->row_pitch == 0); + const VkBufferImageCopy region = { + .bufferOffset = buf_offset, + .bufferRowLength = params->row_pitch / fmt->texel_size, + .bufferImageHeight = params->depth_pitch / params->row_pitch, + .imageOffset = { rc.x0, rc.y0, rc.z0 }, + .imageExtent = { rc.x1, rc.y1, rc.z1 }, + .imageSubresource = { + .aspectMask = tex_vk->aspect, + .layerCount = 1, + }, + }; + + enum queue_type queue = vk_img_copy_queue(gpu, tex, ®ion); + + struct vk_cmd *cmd = CMD_BEGIN_TIMED(queue, params->timer); + if (!cmd) + goto error; + + vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_WRITE_BIT, params->buf_offset, size, + false); + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_COPY_BIT, + VK_ACCESS_2_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_QUEUE_FAMILY_IGNORED); + vk->CmdCopyImageToBuffer(cmd->buf, tex_vk->img, tex_vk->layout, + buf_vk->mem.buf, 1, ®ion); + vk_buf_flush(gpu, cmd, buf, params->buf_offset, size); + + if (params->callback) + vk_cmd_callback(cmd, tex_xfer_cb, params->callback, params->priv); + + return CMD_FINISH(&cmd); + } + + pl_unreachable(); + +error: + for (int i = 0; i < num_slices; i++) + pl_buf_destroy(gpu, &slices[i].buf); + pl_free(slices); + return false; +} + +bool vk_tex_poll(pl_gpu gpu, pl_tex tex, uint64_t timeout) +{ + struct pl_vk *p = PL_PRIV(gpu); + struct vk_ctx *vk = p->vk; + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + // Opportunistically check if we can re-use this texture without flush + vk_poll_commands(vk, 0); + if (pl_rc_count(&tex_vk->rc) == 1) + goto skip_blocking; + + // Otherwise, we're force to submit any queued command so that the user is + // guaranteed to see progress eventually, even if they call this in a loop + CMD_SUBMIT(NULL); + vk_poll_commands(vk, timeout); + if (pl_rc_count(&tex_vk->rc) > 1) + return true; + + // fall through +skip_blocking: + for (int i = 0; i < tex_vk->num_planes; i++) { + if (vk_tex_poll(gpu, tex->planes[i], timeout)) + return true; + } + + return false; +} + +bool vk_tex_export(pl_gpu gpu, pl_tex tex, pl_sync sync) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + struct pl_sync_vk *sync_vk = PL_PRIV(sync); + + if (tex_vk->num_planes) { + PL_ERR(gpu, "`pl_tex_export` cannot be called on planar textures." 
+ "Please see `pl_vulkan_hold_ex` for a replacement."); + return false; + } + + struct vk_cmd *cmd = CMD_BEGIN(ANY); + if (!cmd) + goto error; + + vk_tex_barrier(gpu, cmd, tex, VK_PIPELINE_STAGE_2_NONE, + 0, VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_EXTERNAL); + + // Make the next barrier appear as though coming from a different queue + tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL; + + vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, (pl_vulkan_sem){ sync_vk->wait }); + if (!CMD_SUBMIT(&cmd)) + goto error; + + // Remember the other dependency and hold on to the sync object + PL_ARRAY_APPEND(tex, tex_vk->ext_deps, (pl_vulkan_sem){ sync_vk->signal }); + pl_rc_ref(&sync_vk->rc); + tex_vk->ext_sync = sync; + tex_vk->qf = VK_QUEUE_FAMILY_EXTERNAL; + return true; + +error: + PL_ERR(gpu, "Failed exporting shared texture!"); + return false; +} + +pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params) +{ + pl_fmt fmt = NULL; + for (int i = 0; i < gpu->num_formats; i++) { + const struct vk_format **vkfmt = PL_PRIV(gpu->formats[i]); + if ((*vkfmt)->tfmt == params->format) { + fmt = gpu->formats[i]; + break; + } + } + + if (!fmt) { + PL_ERR(gpu, "Could not find pl_fmt suitable for wrapped image " + "with format %s", vk_fmt_name(params->format)); + return NULL; + } + + VkImageUsageFlags usage = params->usage; + if (fmt->num_planes) + usage = 0; // mask capabilities from the base texture + + struct pl_tex_t *tex = pl_zalloc_obj(NULL, tex, struct pl_tex_vk); + tex->params = (struct pl_tex_params) { + .format = fmt, + .w = params->width, + .h = params->height, + .d = params->depth, + .sampleable = !!(usage & VK_IMAGE_USAGE_SAMPLED_BIT), + .renderable = !!(usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT), + .storable = !!(usage & VK_IMAGE_USAGE_STORAGE_BIT), + .blit_src = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + .blit_dst = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT), + .host_writable = !!(usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT), + .host_readable = !!(usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + .user_data = params->user_data, + .debug_tag = params->debug_tag, + }; + + // Mask out capabilities not permitted by the `pl_fmt` +#define MASK(field, cap) \ + do { \ + if (tex->params.field && !(fmt->caps & cap)) { \ + PL_WARN(gpu, "Masking `" #field "` from wrapped texture because " \ + "the corresponding format '%s' does not support " #cap, \ + fmt->name); \ + tex->params.field = false; \ + } \ + } while (0) + + MASK(sampleable, PL_FMT_CAP_SAMPLEABLE); + MASK(renderable, PL_FMT_CAP_RENDERABLE); + MASK(storable, PL_FMT_CAP_STORABLE); + MASK(blit_src, PL_FMT_CAP_BLITTABLE); + MASK(blit_dst, PL_FMT_CAP_BLITTABLE); + MASK(host_readable, PL_FMT_CAP_HOST_READABLE); +#undef MASK + + // For simplicity, explicitly mask out blit emulation for wrapped textures + struct pl_fmt_vk *fmtp = PL_PRIV(fmt); + if (fmtp->blit_emulated) { + tex->params.blit_src = false; + tex->params.blit_dst = false; + } + + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + switch (pl_tex_params_dimension(tex->params)) { + case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break; + case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break; + case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break; + } + tex_vk->external_img = true; + tex_vk->held = !fmt->num_planes; + tex_vk->img = params->image; + tex_vk->img_fmt = params->format; + tex_vk->num_planes = fmt->num_planes; + tex_vk->usage_flags = usage; + tex_vk->aspect = params->aspect; + + if (!tex_vk->aspect) { + for (int i = 0; i < tex_vk->num_planes; i++) + tex_vk->aspect |= 
VK_IMAGE_ASPECT_PLANE_0_BIT << i; + tex_vk->aspect = PL_DEF(tex_vk->aspect, VK_IMAGE_ASPECT_COLOR_BIT); + } + + // Blitting to planar images requires fallback via compute shaders + if (tex_vk->aspect != VK_IMAGE_ASPECT_COLOR_BIT) { + tex->params.blit_src &= tex->params.storable; + tex->params.blit_dst &= tex->params.storable; + } + + static const char * const wrapped_plane_names[4] = { + "wrapped plane 0", "wrapped plane 1", "wrapped plane 2", "wrapped plane 3", + }; + + for (int i = 0; i < tex_vk->num_planes; i++) { + struct pl_tex_t *plane; + VkImageAspectFlags aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << i; + if (!(aspect & tex_vk->aspect)) { + PL_INFO(gpu, "Not wrapping plane %d due to aspect bit 0x%x not " + "being contained in supplied params->aspect 0x%x!", + i, (unsigned) aspect, (unsigned) tex_vk->aspect); + continue; + } + + pl_assert(tex_vk->type == VK_IMAGE_TYPE_2D); + plane = (struct pl_tex_t *) pl_vulkan_wrap(gpu, pl_vulkan_wrap_params( + .image = tex_vk->img, + .aspect = aspect, + .width = PL_RSHIFT_UP(tex->params.w, fmt->planes[i].shift_x), + .height = PL_RSHIFT_UP(tex->params.h, fmt->planes[i].shift_y), + .format = fmtp->vk_fmt->pfmt[i].fmt, + .usage = params->usage, + .user_data = params->user_data, + .debug_tag = PL_DEF(params->debug_tag, wrapped_plane_names[i]), + )); + if (!plane) + goto error; + plane->parent = tex; + tex->planes[i] = plane; + tex_vk->planes[i] = PL_PRIV(plane); + } + + if (!vk_init_image(gpu, tex, PL_DEF(params->debug_tag, "wrapped"))) + goto error; + + return tex; + +error: + vk_tex_destroy(gpu, tex); + return NULL; +} + +VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex, VkFormat *out_format, + VkImageUsageFlags *out_flags) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(tex); + + if (out_format) + *out_format = tex_vk->img_fmt; + if (out_flags) + *out_flags = tex_vk->usage_flags; + + return tex_vk->img; +} + +bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(params->tex); + pl_assert(params->semaphore.sem); + + bool held = tex_vk->held; + for (int i = 0; i < tex_vk->num_planes; i++) + held |= tex_vk->planes[i]->held; + + if (held) { + PL_ERR(gpu, "Attempting to hold an already held image!"); + return false; + } + + struct vk_cmd *cmd = CMD_BEGIN(GRAPHICS); + if (!cmd) { + PL_ERR(gpu, "Failed holding external image!"); + return false; + } + + VkImageLayout layout = params->layout; + if (params->out_layout) { + // For planar images, arbitrarily pick the current image layout of the + // first plane. This should be fine in practice, since all planes will + // share the same usage capabilities. 
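+        // Requesting `out_layout` thus keeps the image in whatever layout it
+        // is currently in, rather than forcing a transition to a new one.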
+ if (tex_vk->num_planes) { + layout = tex_vk->planes[0]->layout; + } else { + layout = tex_vk->layout; + } + } + + bool may_invalidate = true; + if (!tex_vk->num_planes) { + may_invalidate &= tex_vk->may_invalidate; + vk_tex_barrier(gpu, cmd, params->tex, VK_PIPELINE_STAGE_2_NONE, + 0, layout, params->qf); + } + + for (int i = 0; i < tex_vk->num_planes; i++) { + may_invalidate &= tex_vk->planes[i]->may_invalidate; + vk_tex_barrier(gpu, cmd, params->tex->planes[i], + VK_PIPELINE_STAGE_2_NONE, 0, layout, params->qf); + } + + vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, params->semaphore); + bool ok = CMD_SUBMIT(&cmd); + + if (!tex_vk->num_planes) { + tex_vk->sem.write.queue = tex_vk->sem.read.queue = NULL; + tex_vk->held = ok; + } + + for (int i = 0; i < tex_vk->num_planes; i++) { + struct pl_tex_vk *plane_vk = tex_vk->planes[i]; + plane_vk->sem.write.queue = plane_vk->sem.read.queue = NULL; + plane_vk->held = ok; + } + + if (ok && params->out_layout) + *params->out_layout = may_invalidate ? VK_IMAGE_LAYOUT_UNDEFINED : layout; + + return ok; +} + +void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params) +{ + struct pl_tex_vk *tex_vk = PL_PRIV(params->tex); + if (tex_vk->num_planes) { + struct pl_vulkan_release_params plane_pars = *params; + for (int i = 0; i < tex_vk->num_planes; i++) { + plane_pars.tex = params->tex->planes[i]; + pl_vulkan_release_ex(gpu, &plane_pars); + } + return; + } + + if (!tex_vk->held) { + PL_ERR(gpu, "Attempting to release an unheld image?"); + return; + } + + if (params->semaphore.sem) + PL_ARRAY_APPEND(params->tex, tex_vk->ext_deps, params->semaphore); + + tex_vk->qf = params->qf; + tex_vk->layout = params->layout; + tex_vk->held = false; +} + +bool pl_vulkan_hold(pl_gpu gpu, pl_tex tex, VkImageLayout layout, + pl_vulkan_sem sem_out) +{ + return pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = tex, + .layout = layout, + .semaphore = sem_out, + .qf = VK_QUEUE_FAMILY_IGNORED, + )); +} + +bool pl_vulkan_hold_raw(pl_gpu gpu, pl_tex tex, + VkImageLayout *out_layout, + pl_vulkan_sem sem_out) +{ + return pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = tex, + .out_layout = out_layout, + .semaphore = sem_out, + .qf = VK_QUEUE_FAMILY_IGNORED, + )); +} + +void pl_vulkan_release(pl_gpu gpu, pl_tex tex, VkImageLayout layout, + pl_vulkan_sem sem_in) +{ + pl_vulkan_release_ex(gpu, pl_vulkan_release_params( + .tex = tex, + .layout = layout, + .semaphore = sem_in, + .qf = VK_QUEUE_FAMILY_IGNORED, + )); +} diff --git a/src/vulkan/malloc.c b/src/vulkan/malloc.c new file mode 100644 index 0000000..c35183b --- /dev/null +++ b/src/vulkan/malloc.c @@ -0,0 +1,1058 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "malloc.h" +#include "command.h" +#include "utils.h" +#include "pl_thread.h" + +#ifdef PL_HAVE_UNIX +#include <errno.h> +#include <unistd.h> +#endif + +// Controls the page size alignment, to help coalesce allocations into the same +// slab. Pages are rounded up to multiples of this value. (Default: 4 KB) +#define PAGE_SIZE_ALIGN (1LLU << 12) + +// Controls the minimum/maximum number of pages for new slabs. As slabs are +// exhausted of memory, the number of pages per new slab grows exponentially, +// starting with the minimum until the maximum is reached. +// +// Note: The maximum must never exceed the size of `vk_slab.spacemap`. +#define MINIMUM_PAGE_COUNT 4 +#define MAXIMUM_PAGE_COUNT (sizeof(uint64_t) * 8) + +// Controls the maximum page size. Any allocations above this threshold +// (absolute size or fraction of VRAM, whichever is higher) will be served by +// dedicated allocations. (Default: 64 MB or 1/16 of VRAM) +#define MAXIMUM_PAGE_SIZE_ABSOLUTE (1LLU << 26) +#define MAXIMUM_PAGE_SIZE_RELATIVE 16 + +// Controls the minimum slab size, to avoid excessive re-allocation of very +// small slabs. (Default: 256 KB) +#define MINIMUM_SLAB_SIZE (1LLU << 18) + +// How long to wait before garbage collecting empty slabs. Slabs older than +// this many invocations of `vk_malloc_garbage_collect` will be released. +#define MAXIMUM_SLAB_AGE 32 + +// A single slab represents a contiguous region of allocated memory. Actual +// allocations are served as pages of this. Slabs are organized into pools, +// each of which contains a list of slabs of differing page sizes. +struct vk_slab { + pl_mutex lock; + pl_debug_tag debug_tag; // debug tag of the triggering allocation + VkDeviceMemory mem; // underlying device allocation + VkDeviceSize size; // total allocated size of `mem` + VkMemoryType mtype; // underlying memory type + bool dedicated; // slab is allocated specifically for one object + bool imported; // slab represents an imported memory allocation + + // free space accounting (only for non-dedicated slabs) + uint64_t spacemap; // bitset of available pages + size_t pagesize; // size in bytes per page + size_t used; // number of bytes actually in use + uint64_t age; // timestamp of last use + + // optional, depends on the memory type: + VkBuffer buffer; // buffer spanning the entire slab + void *data; // mapped memory corresponding to `mem` + bool coherent; // mapped memory is coherent + union pl_handle handle; // handle associated with this device memory + enum pl_handle_type handle_type; +}; + +// Represents a single memory pool. We keep track of a vk_pool for each +// combination of malloc parameters. This shouldn't actually be that many in +// practice, because some combinations simply never occur, and others will +// generally be the same for the same objects. +// +// Note: `vk_pool` addresses are not immutable, so we mustn't expose any +// dangling references to a `vk_pool` from e.g. `vk_memslice.priv = vk_slab`. +struct vk_pool { + struct vk_malloc_params params; // allocation params (with some fields nulled) + PL_ARRAY(struct vk_slab *) slabs; // array of slabs, unsorted + int index; // running index in `vk_malloc.pools` +}; + +// The overall state of the allocator, which keeps track of a vk_pool for each +// memory type. 
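+//
+// Rough ownership hierarchy, as implemented below:
+//
+//   vk_malloc  (one per vk_ctx)
+//     -> vk_pool[]  (one per distinct set of allocation parameters, see find_pool)
+//       -> vk_slab[]  (one VkDeviceMemory allocation each)
+//         -> pages  (fixed-size slices handed out as vk_memslice)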
+struct vk_malloc { + struct vk_ctx *vk; + pl_mutex lock; + VkPhysicalDeviceMemoryProperties props; + size_t maximum_page_size; + PL_ARRAY(struct vk_pool) pools; + uint64_t age; +}; + +static inline float efficiency(size_t used, size_t total) +{ + if (!total) + return 100.0; + + return 100.0f * used / total; +} + +static const char *print_size(char buf[8], size_t size) +{ + const char *suffixes = "\0KMG"; + while (suffixes[1] && size > 9999) { + size >>= 10; + suffixes++; + } + + int ret = *suffixes ? snprintf(buf, 8, "%4zu%c", size, *suffixes) + : snprintf(buf, 8, "%5zu", size); + + return ret >= 0 ? buf : "(error)"; +} + +#define PRINT_SIZE(x) (print_size((char[8]){0}, (size_t) (x))) + +void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level lev) +{ + struct vk_ctx *vk = ma->vk; + size_t total_size = 0; + size_t total_used = 0; + size_t total_res = 0; + + PL_MSG(vk, lev, "Memory heaps supported by device:"); + for (int i = 0; i < ma->props.memoryHeapCount; i++) { + VkMemoryHeap heap = ma->props.memoryHeaps[i]; + PL_MSG(vk, lev, " %d: flags 0x%x size %s", + i, (unsigned) heap.flags, PRINT_SIZE(heap.size)); + } + + PL_DEBUG(vk, "Memory types supported by device:"); + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + VkMemoryType type = ma->props.memoryTypes[i]; + PL_DEBUG(vk, " %d: flags 0x%x heap %d", + i, (unsigned) type.propertyFlags, (int) type.heapIndex); + } + + pl_mutex_lock(&ma->lock); + for (int i = 0; i < ma->pools.num; i++) { + struct vk_pool *pool = &ma->pools.elem[i]; + const struct vk_malloc_params *par = &pool->params; + + PL_MSG(vk, lev, "Memory pool %d:", i); + PL_MSG(vk, lev, " Compatible types: 0x%"PRIx32, par->reqs.memoryTypeBits); + if (par->required) + PL_MSG(vk, lev, " Required flags: 0x%"PRIx32, par->required); + if (par->optimal) + PL_MSG(vk, lev, " Optimal flags: 0x%"PRIx32, par->optimal); + if (par->buf_usage) + PL_MSG(vk, lev, " Buffer flags: 0x%"PRIx32, par->buf_usage); + if (par->export_handle) + PL_MSG(vk, lev, " Export handle: 0x%x", par->export_handle); + + size_t pool_size = 0; + size_t pool_used = 0; + size_t pool_res = 0; + + for (int j = 0; j < pool->slabs.num; j++) { + struct vk_slab *slab = pool->slabs.elem[j]; + pl_mutex_lock(&slab->lock); + + size_t avail = __builtin_popcountll(slab->spacemap) * slab->pagesize; + size_t slab_res = slab->size - avail; + + PL_MSG(vk, lev, " Slab %2d: %8"PRIx64" x %s: " + "%s used %s res %s alloc from heap %d, efficiency %.2f%% [%s]", + j, slab->spacemap, PRINT_SIZE(slab->pagesize), + PRINT_SIZE(slab->used), PRINT_SIZE(slab_res), + PRINT_SIZE(slab->size), (int) slab->mtype.heapIndex, + efficiency(slab->used, slab_res), + PL_DEF(slab->debug_tag, "unknown")); + + pool_size += slab->size; + pool_used += slab->used; + pool_res += slab_res; + pl_mutex_unlock(&slab->lock); + } + + PL_MSG(vk, lev, " Pool summary: %s used %s res %s alloc, " + "efficiency %.2f%%, utilization %.2f%%", + PRINT_SIZE(pool_used), PRINT_SIZE(pool_res), + PRINT_SIZE(pool_size), efficiency(pool_used, pool_res), + efficiency(pool_res, pool_size)); + + total_size += pool_size; + total_used += pool_used; + total_res += pool_res; + } + pl_mutex_unlock(&ma->lock); + + PL_MSG(vk, lev, "Memory summary: %s used %s res %s alloc, " + "efficiency %.2f%%, utilization %.2f%%, max page: %s", + PRINT_SIZE(total_used), PRINT_SIZE(total_res), + PRINT_SIZE(total_size), efficiency(total_used, total_res), + efficiency(total_res, total_size), + PRINT_SIZE(ma->maximum_page_size)); +} + +static void slab_free(struct vk_ctx *vk, struct vk_slab *slab) +{ + if 
(!slab) + return; + +#ifndef NDEBUG + if (!slab->dedicated && slab->used > 0) { + PL_WARN(vk, "Leaked %zu bytes of vulkan memory!", slab->used); + PL_WARN(vk, "slab total size: %zu bytes, heap: %d, flags: 0x%"PRIX64, + (size_t) slab->size, (int) slab->mtype.heapIndex, + (uint64_t) slab->mtype.propertyFlags); + if (slab->debug_tag) + PL_WARN(vk, "last used for: %s", slab->debug_tag); + pl_log_stack_trace(vk->log, PL_LOG_WARN); + pl_debug_abort(); + } +#endif + + if (slab->imported) { + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: + PL_TRACE(vk, "Unimporting slab of size %s from fd: %d", + PRINT_SIZE(slab->size), slab->handle.fd); + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: +#ifdef PL_HAVE_WIN32 + PL_TRACE(vk, "Unimporting slab of size %s from handle: %p", + PRINT_SIZE(slab->size), (void *) slab->handle.handle); +#endif + break; + case PL_HANDLE_HOST_PTR: + PL_TRACE(vk, "Unimporting slab of size %s from ptr: %p", + PRINT_SIZE(slab->size), (void *) slab->handle.ptr); + break; + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + pl_unreachable(); + } + } else { + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: +#ifdef PL_HAVE_UNIX + if (slab->handle.fd > -1) + close(slab->handle.fd); +#endif + break; + case PL_HANDLE_WIN32: +#ifdef PL_HAVE_WIN32 + if (slab->handle.handle != NULL) + CloseHandle(slab->handle.handle); +#endif + break; + case PL_HANDLE_WIN32_KMT: + // PL_HANDLE_WIN32_KMT is just an identifier. It doesn't get closed. + break; + case PL_HANDLE_HOST_PTR: + // Implicitly unmapped + break; + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + pl_unreachable(); + } + + PL_DEBUG(vk, "Freeing slab of size %s", PRINT_SIZE(slab->size)); + } + + vk->DestroyBuffer(vk->dev, slab->buffer, PL_VK_ALLOC); + // also implicitly unmaps the memory if needed + vk->FreeMemory(vk->dev, slab->mem, PL_VK_ALLOC); + + pl_mutex_destroy(&slab->lock); + pl_free(slab); +} + +// type_mask: optional +// thread-safety: safe +static bool find_best_memtype(const struct vk_malloc *ma, uint32_t type_mask, + const struct vk_malloc_params *params, + uint32_t *out_index) +{ + struct vk_ctx *vk = ma->vk; + int best = -1; + + // The vulkan spec requires memory types to be sorted in the "optimal" + // order, so the first matching type we find will be the best/fastest one. + // That being said, we still want to prioritize memory types that have + // better optional flags. 
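+ //
+ // Illustration: with `optimal` set to DEVICE_LOCAL | HOST_VISIBLE, a type
+ // matching both bits scores 2 and beats an earlier type matching only one;
+ // on a tie the earlier (spec-preferred) type wins, because the result is
+ // only replaced on a strictly greater score.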
+ + type_mask &= params->reqs.memoryTypeBits; + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + const VkMemoryType *mtype = &ma->props.memoryTypes[i]; + + // The memory type flags must include our properties + if ((mtype->propertyFlags & params->required) != params->required) + continue; + + // The memory heap must be large enough for the allocation + VkDeviceSize heapSize = ma->props.memoryHeaps[mtype->heapIndex].size; + if (params->reqs.size > heapSize) + continue; + + // The memory type must be supported by the type mask (bitfield) + if (!(type_mask & (1LU << i))) + continue; + + // Calculate the score as the number of optimal property flags matched + int score = __builtin_popcountl(mtype->propertyFlags & params->optimal); + if (score > best) { + *out_index = i; + best = score; + } + } + + if (best < 0) { + PL_ERR(vk, "Found no memory type matching property flags 0x%x and type " + "bits 0x%x!", + (unsigned) params->required, (unsigned) type_mask); + return false; + } + + return true; +} + +static bool buf_external_check(struct vk_ctx *vk, VkBufferUsageFlags usage, + enum pl_handle_type handle_type, bool import) +{ + if (!handle_type) + return true; + + VkPhysicalDeviceExternalBufferInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR, + .usage = usage, + .handleType = vk_mem_handle_type(handle_type), + }; + + VkExternalBufferProperties props = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR, + }; + + if (!info.handleType) + return false; + + vk->GetPhysicalDeviceExternalBufferProperties(vk->physd, &info, &props); + return vk_external_mem_check(vk, &props.externalMemoryProperties, + handle_type, import); +} + +// thread-safety: safe +static struct vk_slab *slab_alloc(struct vk_malloc *ma, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + struct vk_slab *slab = pl_alloc_ptr(NULL, slab); + *slab = (struct vk_slab) { + .age = ma->age, + .size = params->reqs.size, + .handle_type = params->export_handle, + .debug_tag = params->debug_tag, + }; + pl_mutex_init(&slab->lock); + + switch (slab->handle_type) { + case PL_HANDLE_FD: + case PL_HANDLE_DMA_BUF: + slab->handle.fd = -1; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + slab->handle.handle = NULL; + break; + case PL_HANDLE_HOST_PTR: + slab->handle.ptr = NULL; + break; + } + + VkExportMemoryAllocateInfoKHR ext_info = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR, + .handleTypes = vk_mem_handle_type(slab->handle_type), + }; + + uint32_t type_mask = UINT32_MAX; + if (params->buf_usage) { + // Queue family sharing modes don't matter for buffers, so we just + // set them as concurrent and stop worrying about it. + uint32_t qfs[3] = {0}; + pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs)); + for (int i = 0; i < vk->pools.num; i++) + qfs[i] = vk->pools.elem[i]->qf; + + VkExternalMemoryBufferCreateInfoKHR ext_buf_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR, + .handleTypes = ext_info.handleTypes, + }; + + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = slab->handle_type ? &ext_buf_info : NULL, + .size = slab->size, + .usage = params->buf_usage, + .sharingMode = vk->pools.num > 1 ? 
VK_SHARING_MODE_CONCURRENT + : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = vk->pools.num, + .pQueueFamilyIndices = qfs, + }; + + if (!buf_external_check(vk, binfo.usage, slab->handle_type, false)) { + PL_ERR(vk, "Failed allocating shared memory buffer: possibly " + "the handle type is unsupported?"); + goto error; + } + + VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &slab->buffer)); + PL_VK_NAME(BUFFER, slab->buffer, "slab"); + + VkMemoryRequirements reqs = {0}; + vk->GetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs); + slab->size = reqs.size; // this can be larger than `slab->size` + type_mask = reqs.memoryTypeBits; + + // Note: we can ignore `reqs.align` because we always bind the buffer + // memory to offset 0 + } + + VkMemoryAllocateInfo minfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = slab->size, + }; + + if (params->export_handle) + vk_link_struct(&minfo, &ext_info); + + VkMemoryDedicatedAllocateInfoKHR dinfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR, + .image = params->ded_image, + }; + + if (params->ded_image) + vk_link_struct(&minfo, &dinfo); + + if (!find_best_memtype(ma, type_mask, params, &minfo.memoryTypeIndex)) + goto error; + + const VkMemoryType *mtype = &ma->props.memoryTypes[minfo.memoryTypeIndex]; + PL_DEBUG(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d: %s", + (size_t) slab->size, (unsigned) mtype->propertyFlags, + (int) minfo.memoryTypeIndex, (int) mtype->heapIndex, + PL_DEF(params->debug_tag, "unknown")); + + pl_clock_t start = pl_clock_now(); + + VkResult res = vk->AllocateMemory(vk->dev, &minfo, PL_VK_ALLOC, &slab->mem); + switch (res) { + case VK_ERROR_OUT_OF_DEVICE_MEMORY: + case VK_ERROR_OUT_OF_HOST_MEMORY: + PL_ERR(vk, "Allocation of size %s failed: %s!", + PRINT_SIZE(slab->size), vk_res_str(res)); + vk_malloc_print_stats(ma, PL_LOG_ERR); + pl_log_stack_trace(vk->log, PL_LOG_ERR); + pl_debug_abort(); + goto error; + + default: + PL_VK_ASSERT(res, "vkAllocateMemory"); + } + + slab->mtype = *mtype; + if (mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + slab->coherent = mtype->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + } + + if (slab->buffer) + VK(vk->BindBufferMemory(vk->dev, slab->buffer, slab->mem, 0)); + +#ifdef PL_HAVE_UNIX + if (slab->handle_type == PL_HANDLE_FD || + slab->handle_type == PL_HANDLE_DMA_BUF) + { + VkMemoryGetFdInfoKHR fd_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, + .memory = slab->mem, + .handleType = ext_info.handleTypes, + }; + + VK(vk->GetMemoryFdKHR(vk->dev, &fd_info, &slab->handle.fd)); + } +#endif + +#ifdef PL_HAVE_WIN32 + if (slab->handle_type == PL_HANDLE_WIN32 || + slab->handle_type == PL_HANDLE_WIN32_KMT) + { + VkMemoryGetWin32HandleInfoKHR handle_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, + .memory = slab->mem, + .handleType = ext_info.handleTypes, + }; + + VK(vk->GetMemoryWin32HandleKHR(vk->dev, &handle_info, + &slab->handle.handle)); + } +#endif + + pl_log_cpu_time(vk->log, start, pl_clock_now(), "allocating slab"); + + // free space accounting is done by the caller + return slab; + +error: + if (params->debug_tag) + PL_ERR(vk, " for malloc: %s", params->debug_tag); + slab_free(vk, slab); + return NULL; +} + +static void pool_uninit(struct vk_ctx *vk, struct vk_pool *pool) +{ + for (int i = 0; i < pool->slabs.num; i++) + slab_free(vk, pool->slabs.elem[i]); + + pl_free(pool->slabs.elem); + 
*pool = (struct vk_pool) {0}; +} + +struct vk_malloc *vk_malloc_create(struct vk_ctx *vk) +{ + struct vk_malloc *ma = pl_zalloc_ptr(NULL, ma); + pl_mutex_init(&ma->lock); + vk->GetPhysicalDeviceMemoryProperties(vk->physd, &ma->props); + ma->vk = vk; + + // Determine maximum page size + ma->maximum_page_size = MAXIMUM_PAGE_SIZE_ABSOLUTE; + for (int i = 0; i < ma->props.memoryHeapCount; i++) { + VkMemoryHeap heap = ma->props.memoryHeaps[i]; + if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) { + size_t size_max = heap.size / MAXIMUM_PAGE_SIZE_RELATIVE; + ma->maximum_page_size = PL_MAX(ma->maximum_page_size, size_max); + } + } + + vk_malloc_print_stats(ma, PL_LOG_INFO); + return ma; +} + +void vk_malloc_destroy(struct vk_malloc **ma_ptr) +{ + struct vk_malloc *ma = *ma_ptr; + if (!ma) + return; + + vk_malloc_print_stats(ma, PL_LOG_DEBUG); + for (int i = 0; i < ma->pools.num; i++) + pool_uninit(ma->vk, &ma->pools.elem[i]); + + pl_mutex_destroy(&ma->lock); + pl_free_ptr(ma_ptr); +} + +void vk_malloc_garbage_collect(struct vk_malloc *ma) +{ + struct vk_ctx *vk = ma->vk; + + pl_mutex_lock(&ma->lock); + ma->age++; + + for (int i = 0; i < ma->pools.num; i++) { + struct vk_pool *pool = &ma->pools.elem[i]; + for (int n = 0; n < pool->slabs.num; n++) { + struct vk_slab *slab = pool->slabs.elem[n]; + pl_mutex_lock(&slab->lock); + if (slab->used || (ma->age - slab->age) <= MAXIMUM_SLAB_AGE) { + pl_mutex_unlock(&slab->lock); + continue; + } + + PL_DEBUG(vk, "Garbage collected slab of size %s from pool %d", + PRINT_SIZE(slab->size), pool->index); + + pl_mutex_unlock(&slab->lock); + slab_free(ma->vk, slab); + PL_ARRAY_REMOVE_AT(pool->slabs, n--); + } + } + + pl_mutex_unlock(&ma->lock); +} + +pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import) +{ + struct vk_ctx *vk = ma->vk; + pl_handle_caps caps = 0; + + for (int i = 0; vk_mem_handle_list[i]; i++) { + // Try seeing if we could allocate a "basic" buffer using these + // capabilities, with no fancy buffer usage. More specific checks will + // happen down the line at VkBuffer creation time, but this should give + // us a rough idea of what the driver supports. 
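+ //
+ // The result is a plain bitmask of pl_handle_type values, so callers can
+ // probe individual handle types along the lines of:
+ //
+ //     if (vk_malloc_handle_caps(ma, false) & PL_HANDLE_DMA_BUF)
+ //         ... // exporting dma-bufs should work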
+ enum pl_handle_type type = vk_mem_handle_list[i]; + if (buf_external_check(vk, VK_BUFFER_USAGE_TRANSFER_DST_BIT, type, import)) + caps |= type; + } + + return caps; +} + +void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice) +{ + struct vk_ctx *vk = ma->vk; + struct vk_slab *slab = slice->priv; + if (!slab || slab->dedicated) { + slab_free(vk, slab); + goto done; + } + + pl_mutex_lock(&slab->lock); + + int page_idx = slice->offset / slab->pagesize; + slab->spacemap |= 0x1LLU << page_idx; + slab->used -= slice->size; + slab->age = ma->age; + pl_assert(slab->used >= 0); + + pl_mutex_unlock(&slab->lock); + +done: + *slice = (struct vk_memslice) {0}; +} + +static inline bool pool_params_eq(const struct vk_malloc_params *a, + const struct vk_malloc_params *b) +{ + return a->reqs.size == b->reqs.size && + a->reqs.alignment == b->reqs.alignment && + a->reqs.memoryTypeBits == b->reqs.memoryTypeBits && + a->required == b->required && + a->optimal == b->optimal && + a->buf_usage == b->buf_usage && + a->export_handle == b->export_handle; +} + +static struct vk_pool *find_pool(struct vk_malloc *ma, + const struct vk_malloc_params *params) +{ + pl_assert(!params->import_handle); + pl_assert(!params->ded_image); + + struct vk_malloc_params fixed = *params; + fixed.reqs.alignment = 0; + fixed.reqs.size = 0; + fixed.shared_mem = (struct pl_shared_mem) {0}; + + for (int i = 0; i < ma->pools.num; i++) { + if (pool_params_eq(&ma->pools.elem[i].params, &fixed)) + return &ma->pools.elem[i]; + } + + // Not found => add it + PL_ARRAY_GROW(ma, ma->pools); + size_t idx = ma->pools.num++; + ma->pools.elem[idx] = (struct vk_pool) { + .params = fixed, + .index = idx, + }; + return &ma->pools.elem[idx]; +} + +// Returns a suitable memory page from the pool. A new slab will be allocated +// under the hood, if necessary. +// +// Note: This locks the slab it returns +static struct vk_slab *pool_get_page(struct vk_malloc *ma, struct vk_pool *pool, + size_t size, size_t align, + VkDeviceSize *offset) +{ + struct vk_slab *slab = NULL; + int slab_pages = MINIMUM_PAGE_COUNT; + size = PL_ALIGN2(size, PAGE_SIZE_ALIGN); + const size_t pagesize = PL_ALIGN(size, align); + + for (int i = 0; i < pool->slabs.num; i++) { + slab = pool->slabs.elem[i]; + if (slab->pagesize < size) + continue; + if (slab->pagesize > pagesize * MINIMUM_PAGE_COUNT) // rough heuristic + continue; + if (slab->pagesize % align) + continue; + + pl_mutex_lock(&slab->lock); + int page_idx = __builtin_ffsll(slab->spacemap); + if (!page_idx--) { + pl_mutex_unlock(&slab->lock); + // Increase the number of slabs to allocate for new slabs the + // more existing full slabs exist for this size range + slab_pages = PL_MIN(slab_pages << 1, MAXIMUM_PAGE_COUNT); + continue; + } + + slab->spacemap ^= 0x1LLU << page_idx; + *offset = page_idx * slab->pagesize; + return slab; + } + + // Otherwise, allocate a new vk_slab and append it to the list. 
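+ //
+ // Sizing sketch: with 16 KB pages, MINIMUM_PAGE_COUNT (4) pages would fall
+ // below MINIMUM_SLAB_SIZE, so the slab is bumped to 256 KB (16 pages); with
+ // 100 KB pages, 4 pages = 400 KB already clears the minimum and is used
+ // as-is. `slab_pages` also doubles (up to MAXIMUM_PAGE_COUNT) for every full
+ // slab skipped in the loop above, so busy pools get progressively larger
+ // slabs.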
+ VkDeviceSize slab_size = slab_pages * pagesize; + pl_static_assert(MINIMUM_SLAB_SIZE <= PAGE_SIZE_ALIGN * MAXIMUM_PAGE_COUNT); + const VkDeviceSize max_slab_size = ma->maximum_page_size * MINIMUM_PAGE_COUNT; + pl_assert(pagesize <= ma->maximum_page_size); + slab_size = PL_CLAMP(slab_size, MINIMUM_SLAB_SIZE, max_slab_size); + slab_pages = slab_size / pagesize; + slab_size = slab_pages * pagesize; // max_slab_size may be npot2, trim excess + + struct vk_malloc_params params = pool->params; + params.reqs.size = slab_size; + + // Don't hold the lock while allocating the slab, because it can be a + // potentially very costly operation. + pl_mutex_unlock(&ma->lock); + slab = slab_alloc(ma, ¶ms); + pl_mutex_lock(&ma->lock); + if (!slab) + return NULL; + pl_mutex_lock(&slab->lock); + + slab->spacemap = (slab_pages == sizeof(uint64_t) * 8) ? ~0LLU : ~(~0LLU << slab_pages); + slab->pagesize = pagesize; + PL_ARRAY_APPEND(NULL, pool->slabs, slab); + + // Return the first page in this newly allocated slab + slab->spacemap ^= 0x1; + *offset = 0; + return slab; +} + +static bool vk_malloc_import(struct vk_malloc *ma, struct vk_memslice *out, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + VkExternalMemoryHandleTypeFlagBitsKHR vk_handle_type; + vk_handle_type = vk_mem_handle_type(params->import_handle); + + struct vk_slab *slab = NULL; + const struct pl_shared_mem *shmem = ¶ms->shared_mem; + + VkMemoryDedicatedAllocateInfoKHR dinfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR, + .image = params->ded_image, + }; + + VkImportMemoryFdInfoKHR fdinfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .handleType = vk_handle_type, + .fd = -1, + }; + + VkImportMemoryHostPointerInfoEXT ptrinfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT, + .handleType = vk_handle_type, + }; + + VkMemoryAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = shmem->size, + }; + + if (params->ded_image) + vk_link_struct(&ainfo, &dinfo); + + VkBuffer buffer = VK_NULL_HANDLE; + VkMemoryRequirements reqs = params->reqs; + + if (params->buf_usage) { + uint32_t qfs[3] = {0}; + pl_assert(vk->pools.num <= PL_ARRAY_SIZE(qfs)); + for (int i = 0; i < vk->pools.num; i++) + qfs[i] = vk->pools.elem[i]->qf; + + VkExternalMemoryBufferCreateInfoKHR ext_buf_info = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR, + .handleTypes = vk_handle_type, + }; + + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = &ext_buf_info, + .size = shmem->size, + .usage = params->buf_usage, + .sharingMode = vk->pools.num > 1 ? 
VK_SHARING_MODE_CONCURRENT + : VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = vk->pools.num, + .pQueueFamilyIndices = qfs, + }; + + VK(vk->CreateBuffer(vk->dev, &binfo, PL_VK_ALLOC, &buffer)); + PL_VK_NAME(BUFFER, buffer, "imported"); + + vk->GetBufferMemoryRequirements(vk->dev, buffer, &reqs); + } + + if (reqs.size > shmem->size) { + PL_ERR(vk, "Imported object requires %zu bytes, larger than the " + "provided size %zu!", + (size_t) reqs.size, shmem->size); + goto error; + } + + if (shmem->offset % reqs.alignment || shmem->offset % params->reqs.alignment) { + PL_ERR(vk, "Imported object offset %zu conflicts with alignment %zu!", + shmem->offset, pl_lcm(reqs.alignment, params->reqs.alignment)); + goto error; + } + + switch (params->import_handle) { +#ifdef PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: { + if (!vk->GetMemoryFdPropertiesKHR) { + PL_ERR(vk, "Importing PL_HANDLE_DMA_BUF requires %s.", + VK_EXT_EXTERNAL_MEMORY_DMA_BUF_EXTENSION_NAME); + goto error; + } + + VkMemoryFdPropertiesKHR fdprops = { + .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR, + }; + + VK(vk->GetMemoryFdPropertiesKHR(vk->dev, + vk_handle_type, + shmem->handle.fd, + &fdprops)); + + // We dup() the fd to make it safe to import the same original fd + // multiple times. + fdinfo.fd = dup(shmem->handle.fd); + if (fdinfo.fd == -1) { + PL_ERR(vk, "Failed to dup() fd (%d) when importing memory: %s", + fdinfo.fd, strerror(errno)); + goto error; + } + + reqs.memoryTypeBits &= fdprops.memoryTypeBits; + vk_link_struct(&ainfo, &fdinfo); + break; + } +#else // !PL_HAVE_UNIX + case PL_HANDLE_DMA_BUF: + PL_ERR(vk, "PL_HANDLE_DMA_BUF requires building with UNIX support!"); + goto error; +#endif + + case PL_HANDLE_HOST_PTR: { + VkMemoryHostPointerPropertiesEXT ptrprops = { + .sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT, + }; + + VK(vk->GetMemoryHostPointerPropertiesEXT(vk->dev, vk_handle_type, + shmem->handle.ptr, + &ptrprops)); + + ptrinfo.pHostPointer = (void *) shmem->handle.ptr; + reqs.memoryTypeBits &= ptrprops.memoryTypeBits; + vk_link_struct(&ainfo, &ptrinfo); + break; + } + + case PL_HANDLE_FD: + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + PL_ERR(vk, "vk_malloc_import: unsupported handle type %d", + params->import_handle); + goto error; + } + + if (!find_best_memtype(ma, reqs.memoryTypeBits, params, &ainfo.memoryTypeIndex)) { + PL_ERR(vk, "No compatible memory types offered for imported memory!"); + goto error; + } + + VkDeviceMemory vkmem = VK_NULL_HANDLE; + VK(vk->AllocateMemory(vk->dev, &ainfo, PL_VK_ALLOC, &vkmem)); + + slab = pl_alloc_ptr(NULL, slab); + *slab = (struct vk_slab) { + .mem = vkmem, + .dedicated = true, + .imported = true, + .buffer = buffer, + .size = shmem->size, + .handle_type = params->import_handle, + }; + pl_mutex_init(&slab->lock); + + *out = (struct vk_memslice) { + .vkmem = vkmem, + .buf = buffer, + .size = shmem->size - shmem->offset, + .offset = shmem->offset, + .shared_mem = *shmem, + .priv = slab, + }; + + switch (params->import_handle) { + case PL_HANDLE_DMA_BUF: + case PL_HANDLE_FD: + PL_TRACE(vk, "Imported %s bytes from fd: %d%s", + PRINT_SIZE(slab->size), shmem->handle.fd, + params->ded_image ? " (dedicated)" : ""); + // fd ownership is transferred at this point. + slab->handle.fd = fdinfo.fd; + fdinfo.fd = -1; + break; + case PL_HANDLE_HOST_PTR: + PL_TRACE(vk, "Imported %s bytes from ptr: %p%s", + PRINT_SIZE(slab->size), shmem->handle.ptr, + params->ded_image ? 
" (dedicated" : ""); + slab->handle.ptr = ptrinfo.pHostPointer; + break; + case PL_HANDLE_WIN32: + case PL_HANDLE_WIN32_KMT: + case PL_HANDLE_IOSURFACE: + case PL_HANDLE_MTL_TEX: + break; + } + + VkMemoryPropertyFlags flags = ma->props.memoryTypes[ainfo.memoryTypeIndex].propertyFlags; + if (flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + VK(vk->MapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + slab->coherent = flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + out->data = (uint8_t *) slab->data + out->offset; + out->coherent = slab->coherent; + if (!slab->coherent) { + // Use entire buffer range, since this is a dedicated memory + // allocation. This avoids issues with noncoherent atomicity + out->map_offset = 0; + out->map_size = VK_WHOLE_SIZE; + + // Mapping does not implicitly invalidate mapped memory + VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = slab->mem, + .offset = out->map_offset, + .size = out->map_size, + })); + } + } + + if (buffer) + VK(vk->BindBufferMemory(vk->dev, buffer, vkmem, 0)); + + return true; + +error: + if (params->debug_tag) + PL_ERR(vk, " for malloc: %s", params->debug_tag); + vk->DestroyBuffer(vk->dev, buffer, PL_VK_ALLOC); +#ifdef PL_HAVE_UNIX + if (fdinfo.fd > -1) + close(fdinfo.fd); +#endif + pl_free(slab); + *out = (struct vk_memslice) {0}; + return false; +} + +size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags) +{ + size_t avail = 0; + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + const VkMemoryType *mtype = &ma->props.memoryTypes[i]; + if ((mtype->propertyFlags & flags) != flags) + continue; + avail = PL_MAX(avail, ma->props.memoryHeaps[mtype->heapIndex].size); + } + + return avail; +} + +bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out, + const struct vk_malloc_params *params) +{ + struct vk_ctx *vk = ma->vk; + pl_assert(!params->import_handle || !params->export_handle); + if (params->import_handle) + return vk_malloc_import(ma, out, params); + + pl_assert(params->reqs.size); + size_t size = params->reqs.size; + size_t align = params->reqs.alignment; + align = pl_lcm(align, vk->props.limits.bufferImageGranularity); + align = pl_lcm(align, vk->props.limits.nonCoherentAtomSize); + + struct vk_slab *slab; + VkDeviceSize offset; + + if (params->ded_image || size > ma->maximum_page_size) { + slab = slab_alloc(ma, params); + if (!slab) + return false; + slab->dedicated = true; + offset = 0; + } else { + pl_mutex_lock(&ma->lock); + struct vk_pool *pool = find_pool(ma, params); + slab = pool_get_page(ma, pool, size, align, &offset); + pl_mutex_unlock(&ma->lock); + if (!slab) { + PL_ERR(ma->vk, "No slab to serve request for %s bytes (with " + "alignment 0x%zx) in pool %d!", + PRINT_SIZE(size), align, pool->index); + return false; + } + + // For accounting, just treat the alignment as part of the used size. + // Doing it this way makes sure that the sizes reported to vk_memslice + // consumers are always aligned properly. + size = PL_ALIGN(size, align); + slab->used += size; + slab->age = ma->age; + if (params->debug_tag) + slab->debug_tag = params->debug_tag; + pl_mutex_unlock(&slab->lock); + } + + pl_assert(offset % align == 0); + *out = (struct vk_memslice) { + .vkmem = slab->mem, + .offset = offset, + .size = size, + .buf = slab->buffer, + .data = slab->data ? (uint8_t *) slab->data + offset : 0x0, + .coherent = slab->coherent, + .map_offset = slab->data ? offset : 0, + .map_size = slab->data ? 
size : 0, + .priv = slab, + .shared_mem = { + .handle = slab->handle, + .offset = offset, + .size = slab->size, + }, + }; + return true; +} diff --git a/src/vulkan/malloc.h b/src/vulkan/malloc.h new file mode 100644 index 0000000..115352e --- /dev/null +++ b/src/vulkan/malloc.h @@ -0,0 +1,72 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +// All memory allocated from a vk_malloc MUST be explicitly released by +// the caller before vk_malloc_destroy is called. +struct vk_malloc *vk_malloc_create(struct vk_ctx *vk); +void vk_malloc_destroy(struct vk_malloc **ma); + +// Get the supported handle types for this malloc instance +pl_handle_caps vk_malloc_handle_caps(const struct vk_malloc *ma, bool import); + +// Represents a single "slice" of generic (non-buffer) memory, plus some +// metadata for accounting. This struct is essentially read-only. +struct vk_memslice { + VkDeviceMemory vkmem; + VkDeviceSize offset; + VkDeviceSize size; + void *priv; + // depending on the type/flags: + struct pl_shared_mem shared_mem; + VkBuffer buf; // associated buffer (when `buf_usage` is nonzero) + void *data; // pointer to slice (for persistently mapped slices) + bool coherent; // whether `data` is coherent + VkDeviceSize map_offset; // can be larger than offset/size + VkDeviceSize map_size; +}; + +struct vk_malloc_params { + VkMemoryRequirements reqs; + VkMemoryPropertyFlags required; + VkMemoryPropertyFlags optimal; + VkBufferUsageFlags buf_usage; + VkImage ded_image; // for dedicated image allocations + enum pl_handle_type export_handle; + enum pl_handle_type import_handle; + struct pl_shared_mem shared_mem; // for `import_handle` + pl_debug_tag debug_tag; +}; + +// Returns the amount of available memory matching a given set of property +// flags. Always returns the highest single allocation, not the combined total. +size_t vk_malloc_avail(struct vk_malloc *ma, VkMemoryPropertyFlags flags); + +bool vk_malloc_slice(struct vk_malloc *ma, struct vk_memslice *out, + const struct vk_malloc_params *params); + +void vk_malloc_free(struct vk_malloc *ma, struct vk_memslice *slice); + +// Clean up unused slabs. Call this roughly once per frame to reduce +// memory pressure / memory leaks. +void vk_malloc_garbage_collect(struct vk_malloc *ma); + +// For debugging purposes. Doesn't include dedicated slab allocations! 
+void vk_malloc_print_stats(struct vk_malloc *ma, enum pl_log_level); diff --git a/src/vulkan/meson.build b/src/vulkan/meson.build new file mode 100644 index 0000000..64c5572 --- /dev/null +++ b/src/vulkan/meson.build @@ -0,0 +1,59 @@ +vulkan_build = get_option('vulkan') +vulkan_link = get_option('vk-proc-addr') +vulkan_loader = dependency('vulkan', required: false) +vulkan_headers = vulkan_loader.partial_dependency(includes: true, compile_args: true) +registry_xml = get_option('vulkan-registry') + +# Prefer our Vulkan headers for portability +vulkan_headers_dir = thirdparty/'Vulkan-Headers' +vulkan_headers_inc = include_directories() +if fs.is_dir(vulkan_headers_dir/'include') + vulkan_headers = declare_dependency() + vulkan_headers_inc = include_directories('../../3rdparty/Vulkan-Headers/include') + # Force the use of this vk.xml because it has to be in sync with the headers + registry_xml = vulkan_headers_dir/'registry/vk.xml' +endif + +vulkan_build = vulkan_build.require( + cc.has_header_symbol('vulkan/vulkan_core.h', 'VK_VERSION_1_3', + include_directories: vulkan_headers_inc, + dependencies: vulkan_headers), + error_message: 'vulkan.h was not found on the system, nor inside ' + + '`3rdparty/Vulkan-Headers`. Please run `git submodule update --init` ' + + 'followed by `meson --wipe`.') +components.set('vulkan', vulkan_build.allowed()) + +vulkan_link = vulkan_link.require(vulkan_loader.found() and vulkan_build.allowed()) +components.set('vk-proc-addr', vulkan_link.allowed()) + +build_deps += vulkan_headers + +if vulkan_build.allowed() + sources += [ + 'vulkan/command.c', + 'vulkan/context.c', + 'vulkan/formats.c', + 'vulkan/gpu.c', + 'vulkan/gpu_buf.c', + 'vulkan/gpu_tex.c', + 'vulkan/gpu_pass.c', + 'vulkan/malloc.c', + 'vulkan/swapchain.c', + 'vulkan/utils.c', + ] + + datadir = get_option('prefix') / get_option('datadir') + sources += custom_target('utils_gen.c', + input: 'utils_gen.py', + output: 'utils_gen.c', + command: [python, '@INPUT@', datadir, registry_xml, '@OUTPUT@'], + env: python_env, + ) + + if vulkan_link.allowed() + build_deps += vulkan_loader + tests += 'vulkan.c' + endif +else + sources += 'vulkan/stubs.c' +endif diff --git a/src/vulkan/stubs.c b/src/vulkan/stubs.c new file mode 100644 index 0000000..0c0738e --- /dev/null +++ b/src/vulkan/stubs.c @@ -0,0 +1,108 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "../common.h" +#include "log.h" + +#include <libplacebo/vulkan.h> + +const struct pl_vk_inst_params pl_vk_inst_default_params = {0}; +const struct pl_vulkan_params pl_vulkan_default_params = { PL_VULKAN_DEFAULTS }; + +pl_vk_inst pl_vk_inst_create(pl_log log, const struct pl_vk_inst_params *params) +{ + pl_fatal(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +void pl_vk_inst_destroy(pl_vk_inst *pinst) +{ + pl_vk_inst inst = *pinst; + pl_assert(!inst); +} + +pl_vulkan pl_vulkan_create(pl_log log, const struct pl_vulkan_params *params) +{ + pl_fatal(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +void pl_vulkan_destroy(pl_vulkan *pvk) +{ + pl_vulkan vk = *pvk; + pl_assert(!vk); +} + +pl_vulkan pl_vulkan_get(pl_gpu gpu) +{ + return NULL; +} + +VkPhysicalDevice pl_vulkan_choose_device(pl_log log, + const struct pl_vulkan_device_params *params) +{ + pl_err(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +pl_swapchain pl_vulkan_create_swapchain(pl_vulkan vk, + const struct pl_vulkan_swapchain_params *params) +{ + pl_unreachable(); +} + +bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw) +{ + pl_unreachable(); +} + +pl_vulkan pl_vulkan_import(pl_log log, const struct pl_vulkan_import_params *params) +{ + pl_fatal(log, "libplacebo compiled without Vulkan support!"); + return NULL; +} + +pl_tex pl_vulkan_wrap(pl_gpu gpu, const struct pl_vulkan_wrap_params *params) +{ + pl_unreachable(); +} + +VkImage pl_vulkan_unwrap(pl_gpu gpu, pl_tex tex, + VkFormat *out_format, VkImageUsageFlags *out_flags) +{ + pl_unreachable(); +} + +bool pl_vulkan_hold_ex(pl_gpu gpu, const struct pl_vulkan_hold_params *params) +{ + pl_unreachable(); +} + +void pl_vulkan_release_ex(pl_gpu gpu, const struct pl_vulkan_release_params *params) +{ + pl_unreachable(); +} + +VkSemaphore pl_vulkan_sem_create(pl_gpu gpu, const struct pl_vulkan_sem_params *params) +{ + pl_unreachable(); +} + +void pl_vulkan_sem_destroy(pl_gpu gpu, VkSemaphore *semaphore) +{ + pl_unreachable(); +} diff --git a/src/vulkan/swapchain.c b/src/vulkan/swapchain.c new file mode 100644 index 0000000..0741fbf --- /dev/null +++ b/src/vulkan/swapchain.c @@ -0,0 +1,911 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common.h" +#include "command.h" +#include "formats.h" +#include "utils.h" +#include "gpu.h" +#include "swapchain.h" +#include "pl_thread.h" + +struct sem_pair { + VkSemaphore in; + VkSemaphore out; +}; + +struct priv { + struct pl_sw_fns impl; + + pl_mutex lock; + struct vk_ctx *vk; + VkSurfaceKHR surf; + PL_ARRAY(VkSurfaceFormatKHR) formats; + + // current swapchain and metadata: + struct pl_vulkan_swapchain_params params; + VkSwapchainCreateInfoKHR protoInfo; // partially filled-in prototype + VkSwapchainKHR swapchain; + int cur_width, cur_height; + int swapchain_depth; + pl_rc_t frames_in_flight; // number of frames currently queued + bool suboptimal; // true once VK_SUBOPTIMAL_KHR is returned + bool needs_recreate; // swapchain needs to be recreated + struct pl_color_repr color_repr; + struct pl_color_space color_space; + struct pl_hdr_metadata hdr_metadata; + + // state of the images: + PL_ARRAY(pl_tex) images; // pl_tex wrappers for the VkImages + PL_ARRAY(struct sem_pair) sems; // pool of semaphores used to synchronize images + int idx_sems; // index of next free semaphore pair + int last_imgidx; // the image index last acquired (for submit) +}; + +static const struct pl_sw_fns vulkan_swapchain; + +static bool map_color_space(VkColorSpaceKHR space, struct pl_color_space *out) +{ + switch (space) { + // Note: This is technically against the spec, but more often than not + // it's the correct result since `SRGB_NONLINEAR` is just a catch-all + // for any sort of typical SDR curve, which is better approximated by + // `pl_color_space_monitor`. + case VK_COLOR_SPACE_SRGB_NONLINEAR_KHR: + *out = pl_color_space_monitor; + return true; + + case VK_COLOR_SPACE_BT709_NONLINEAR_EXT: + *out = pl_color_space_monitor; + return true; + case VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DISPLAY_P3, + .transfer = PL_COLOR_TRC_BT_1886, + }; + return true; + case VK_COLOR_SPACE_DCI_P3_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DCI_P3, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DCI_P3, + .transfer = PL_COLOR_TRC_BT_1886, + }; + return true; + case VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT: + case VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT: + // TODO + return false; + case VK_COLOR_SPACE_BT709_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_DCI_P3, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_BT2020_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_HDR10_ST2084_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_PQ, + }; + return true; + case VK_COLOR_SPACE_DOLBYVISION_EXT: + // Unlikely to ever be implemented + return false; + case VK_COLOR_SPACE_HDR10_HLG_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_BT_2020, + .transfer = PL_COLOR_TRC_HLG, + }; + return true; + case VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_ADOBE, + .transfer = PL_COLOR_TRC_LINEAR, + }; + return true; + case VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT: + *out = (struct pl_color_space) { + .primaries = PL_COLOR_PRIM_ADOBE, + .transfer = PL_COLOR_TRC_GAMMA22, + }; + return true; + case 
VK_COLOR_SPACE_PASS_THROUGH_EXT: + *out = pl_color_space_unknown; + return true; + +#ifdef VK_AMD_display_native_hdr + case VK_COLOR_SPACE_DISPLAY_NATIVE_AMD: + // TODO + return false; +#endif + + default: return false; + } +} + +static bool pick_surf_format(pl_swapchain sw, const struct pl_color_space *hint) +{ + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + pl_gpu gpu = sw->gpu; + + int best_score = 0, best_id; + bool wide_gamut = pl_color_primaries_is_wide_gamut(hint->primaries); + bool prefer_hdr = pl_color_transfer_is_hdr(hint->transfer); + + for (int i = 0; i < p->formats.num; i++) { + // Color space / format whitelist + struct pl_color_space space; + if (!map_color_space(p->formats.elem[i].colorSpace, &space)) + continue; + + bool disable10 = !pl_color_transfer_is_hdr(space.transfer) && + p->params.disable_10bit_sdr; + + switch (p->formats.elem[i].format) { + // Only accept floating point formats for linear curves + case VK_FORMAT_R16G16B16_SFLOAT: + case VK_FORMAT_R16G16B16A16_SFLOAT: + case VK_FORMAT_R32G32B32_SFLOAT: + case VK_FORMAT_R32G32B32A32_SFLOAT: + case VK_FORMAT_R64G64B64_SFLOAT: + case VK_FORMAT_R64G64B64A64_SFLOAT: + if (space.transfer == PL_COLOR_TRC_LINEAR) + break; // accept + continue; + + // Only accept 8 bit for non-HDR curves + case VK_FORMAT_R8G8B8_UNORM: + case VK_FORMAT_B8G8R8_UNORM: + case VK_FORMAT_R8G8B8A8_UNORM: + case VK_FORMAT_B8G8R8A8_UNORM: + case VK_FORMAT_A8B8G8R8_UNORM_PACK32: + if (!pl_color_transfer_is_hdr(space.transfer)) + break; // accept + continue; + + // Only accept 10 bit formats for non-linear curves + case VK_FORMAT_A2R10G10B10_UNORM_PACK32: + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + if (space.transfer != PL_COLOR_TRC_LINEAR && !disable10) + break; // accept + continue; + + // Accept 16-bit formats for everything + case VK_FORMAT_R16G16B16_UNORM: + case VK_FORMAT_R16G16B16A16_UNORM: + if (!disable10) + break; // accept + continue; + + default: continue; + } + + // Make sure we can wrap this format to a meaningful, valid pl_fmt + for (int n = 0; n < gpu->num_formats; n++) { + pl_fmt plfmt = gpu->formats[n]; + const struct vk_format **pvkfmt = PL_PRIV(plfmt); + if ((*pvkfmt)->tfmt != p->formats.elem[i].format) + continue; + + enum pl_fmt_caps render_caps = 0; + render_caps |= PL_FMT_CAP_RENDERABLE; + render_caps |= PL_FMT_CAP_BLITTABLE; + if ((plfmt->caps & render_caps) != render_caps) + continue; + + // format valid, use it if it has a higher score + int score = 0; + for (int c = 0; c < 3; c++) + score += plfmt->component_depth[c]; + if (pl_color_primaries_is_wide_gamut(space.primaries) == wide_gamut) + score += 1000; + if (space.primaries == hint->primaries) + score += 2000; + if (pl_color_transfer_is_hdr(space.transfer) == prefer_hdr) + score += 10000; + if (space.transfer == hint->transfer) + score += 20000; + + switch (plfmt->type) { + case PL_FMT_UNKNOWN: break; + case PL_FMT_UINT: break; + case PL_FMT_SINT: break; + case PL_FMT_UNORM: score += 500; break; + case PL_FMT_SNORM: score += 400; break; + case PL_FMT_FLOAT: score += 300; break; + case PL_FMT_TYPE_COUNT: pl_unreachable(); + }; + + if (score > best_score) { + best_score = score; + best_id = i; + break; + } + } + } + + if (!best_score) { + PL_ERR(vk, "Failed picking any valid, renderable surface format!"); + return false; + } + + VkSurfaceFormatKHR new_sfmt = p->formats.elem[best_id]; + if (p->protoInfo.imageFormat != new_sfmt.format || + p->protoInfo.imageColorSpace != new_sfmt.colorSpace) + { + PL_INFO(vk, "Picked surface configuration %d: %s + %s", 
best_id, + vk_fmt_name(new_sfmt.format), + vk_csp_name(new_sfmt.colorSpace)); + + p->protoInfo.imageFormat = new_sfmt.format; + p->protoInfo.imageColorSpace = new_sfmt.colorSpace; + p->needs_recreate = true; + } + + return true; +} + +static void set_hdr_metadata(struct priv *p, const struct pl_hdr_metadata *metadata) +{ + struct vk_ctx *vk = p->vk; + if (!vk->SetHdrMetadataEXT) + return; + + // Whitelist only values that we support signalling metadata for + struct pl_hdr_metadata fix = { + .prim = metadata->prim, + .min_luma = metadata->min_luma, + .max_luma = metadata->max_luma, + .max_cll = metadata->max_cll, + .max_fall = metadata->max_fall, + }; + + // Ignore no-op changes + if (pl_hdr_metadata_equal(&fix, &p->hdr_metadata)) + return; + + // Remember the metadata so we can re-apply it after swapchain recreation + p->hdr_metadata = fix; + + // Ignore HDR metadata requests for SDR swapchains + if (!pl_color_transfer_is_hdr(p->color_space.transfer)) + return; + + if (!p->swapchain) + return; + + vk->SetHdrMetadataEXT(vk->dev, 1, &p->swapchain, &(VkHdrMetadataEXT) { + .sType = VK_STRUCTURE_TYPE_HDR_METADATA_EXT, + .displayPrimaryRed = { fix.prim.red.x, fix.prim.red.y }, + .displayPrimaryGreen = { fix.prim.green.x, fix.prim.green.y }, + .displayPrimaryBlue = { fix.prim.blue.x, fix.prim.blue.y }, + .whitePoint = { fix.prim.white.x, fix.prim.white.y }, + .maxLuminance = fix.max_luma, + .minLuminance = fix.min_luma, + .maxContentLightLevel = fix.max_cll, + .maxFrameAverageLightLevel = fix.max_fall, + }); + + // Keep track of applied HDR colorimetry metadata + p->color_space.hdr = p->hdr_metadata; +} + +pl_swapchain pl_vulkan_create_swapchain(pl_vulkan plvk, + const struct pl_vulkan_swapchain_params *params) +{ + struct vk_ctx *vk = PL_PRIV(plvk); + pl_gpu gpu = plvk->gpu; + + if (!vk->CreateSwapchainKHR) { + PL_ERR(gpu, VK_KHR_SWAPCHAIN_EXTENSION_NAME " not enabled!"); + return NULL; + } + + struct pl_swapchain_t *sw = pl_zalloc_obj(NULL, sw, struct priv); + sw->log = vk->log; + sw->gpu = gpu; + + struct priv *p = PL_PRIV(sw); + pl_mutex_init(&p->lock); + p->impl = vulkan_swapchain; + p->params = *params; + p->vk = vk; + p->surf = params->surface; + p->swapchain_depth = PL_DEF(params->swapchain_depth, 3); + pl_assert(p->swapchain_depth > 0); + atomic_init(&p->frames_in_flight, 0); + p->last_imgidx = -1; + p->protoInfo = (VkSwapchainCreateInfoKHR) { + .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR, + .surface = p->surf, + .imageArrayLayers = 1, // non-stereoscopic + .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, + .minImageCount = p->swapchain_depth + 1, // +1 for the FB + .presentMode = params->present_mode, + .clipped = true, + }; + + // These fields will be updated by `vk_sw_recreate` + p->color_space = pl_color_space_unknown; + p->color_repr = (struct pl_color_repr) { + .sys = PL_COLOR_SYSTEM_RGB, + .levels = PL_COLOR_LEVELS_FULL, + .alpha = PL_ALPHA_UNKNOWN, + }; + + // Make sure the swapchain present mode is supported + VkPresentModeKHR *modes = NULL; + uint32_t num_modes = 0; + VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, NULL)); + modes = pl_calloc_ptr(NULL, num_modes, modes); + VK(vk->GetPhysicalDeviceSurfacePresentModesKHR(vk->physd, p->surf, &num_modes, modes)); + + bool supported = false; + for (int i = 0; i < num_modes; i++) + supported |= (modes[i] == p->protoInfo.presentMode); + pl_free_ptr(&modes); + + if (!supported) { + PL_WARN(vk, "Requested swap mode unsupported by this device, falling " + "back to VK_PRESENT_MODE_FIFO_KHR"); + 
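+ // FIFO is the only present mode the Vulkan spec guarantees to be available
+ // on every surface, so it is always a safe fallback.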
p->protoInfo.presentMode = VK_PRESENT_MODE_FIFO_KHR; + } + + // Enumerate the supported surface color spaces + uint32_t num_formats = 0; + VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, NULL)); + PL_ARRAY_RESIZE(sw, p->formats, num_formats); + VK(vk->GetPhysicalDeviceSurfaceFormatsKHR(vk->physd, p->surf, &num_formats, p->formats.elem)); + p->formats.num = num_formats; + + PL_INFO(gpu, "Available surface configurations:"); + for (int i = 0; i < p->formats.num; i++) { + PL_INFO(gpu, " %d: %-40s %s", i, + vk_fmt_name(p->formats.elem[i].format), + vk_csp_name(p->formats.elem[i].colorSpace)); + } + + // Ensure there exists at least some valid renderable surface format + struct pl_color_space hint = {0}; + if (!pick_surf_format(sw, &hint)) + goto error; + + return sw; + +error: + pl_free(modes); + pl_free(sw); + return NULL; +} + +static void vk_sw_destroy(pl_swapchain sw) +{ + pl_gpu gpu = sw->gpu; + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + + pl_gpu_flush(gpu); + vk_wait_idle(vk); + + // Vulkan offers no way to know when a queue presentation command is done, + // leading to spec-mandated undefined behavior when destroying resources + // tied to the swapchain. Use an extra `vkQueueWaitIdle` on all of the + // queues we may have oustanding presentation calls on, to hopefully inform + // the driver that we want to wait until the device is truly idle. + for (int i = 0; i < vk->pool_graphics->num_queues; i++) + vk->QueueWaitIdle(vk->pool_graphics->queues[i]); + + for (int i = 0; i < p->images.num; i++) + pl_tex_destroy(gpu, &p->images.elem[i]); + for (int i = 0; i < p->sems.num; i++) { + vk->DestroySemaphore(vk->dev, p->sems.elem[i].in, PL_VK_ALLOC); + vk->DestroySemaphore(vk->dev, p->sems.elem[i].out, PL_VK_ALLOC); + } + + vk->DestroySwapchainKHR(vk->dev, p->swapchain, PL_VK_ALLOC); + pl_mutex_destroy(&p->lock); + pl_free((void *) sw); +} + +static int vk_sw_latency(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + return p->swapchain_depth; +} + +static bool update_swapchain_info(struct priv *p, VkSwapchainCreateInfoKHR *info, + int w, int h) +{ + struct vk_ctx *vk = p->vk; + + // Query the supported capabilities and update this struct as needed + VkSurfaceCapabilitiesKHR caps = {0}; + VK(vk->GetPhysicalDeviceSurfaceCapabilitiesKHR(vk->physd, p->surf, &caps)); + + // Check for hidden/invisible window + if (!caps.currentExtent.width || !caps.currentExtent.height) { + PL_DEBUG(vk, "maxImageExtent reported as 0x0, hidden window? 
skipping"); + return false; + } + + // Sorted by preference + static const struct { VkCompositeAlphaFlagsKHR vk_mode; + enum pl_alpha_mode pl_mode; + } alphaModes[] = { + {VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR, PL_ALPHA_INDEPENDENT}, + {VK_COMPOSITE_ALPHA_PRE_MULTIPLIED_BIT_KHR, PL_ALPHA_PREMULTIPLIED}, + {VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, PL_ALPHA_UNKNOWN}, + {VK_COMPOSITE_ALPHA_INHERIT_BIT_KHR, PL_ALPHA_UNKNOWN}, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(alphaModes); i++) { + if (caps.supportedCompositeAlpha & alphaModes[i].vk_mode) { + info->compositeAlpha = alphaModes[i].vk_mode; + p->color_repr.alpha = alphaModes[i].pl_mode; + PL_DEBUG(vk, "Requested alpha compositing mode: %s", + vk_alpha_mode(info->compositeAlpha)); + break; + } + } + + if (!info->compositeAlpha) { + PL_ERR(vk, "Failed picking alpha compositing mode (caps: 0x%x)", + caps.supportedCompositeAlpha); + goto error; + } + + // Note: We could probably also allow picking a surface transform that + // flips the framebuffer and set `pl_swapchain_frame.flipped`, but this + // doesn't appear to be necessary for any vulkan implementations. + static const VkSurfaceTransformFlagsKHR rotModes[] = { + VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR, + VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR, + }; + + for (int i = 0; i < PL_ARRAY_SIZE(rotModes); i++) { + if (caps.supportedTransforms & rotModes[i]) { + info->preTransform = rotModes[i]; + PL_DEBUG(vk, "Requested surface transform: %s", + vk_surface_transform(info->preTransform)); + break; + } + } + + if (!info->preTransform) { + PL_ERR(vk, "Failed picking surface transform mode (caps: 0x%x)", + caps.supportedTransforms); + goto error; + } + + // Image count as required + PL_DEBUG(vk, "Requested image count: %d (min %d max %d)", + (int) info->minImageCount, (int) caps.minImageCount, + (int) caps.maxImageCount); + + info->minImageCount = PL_MAX(info->minImageCount, caps.minImageCount); + if (caps.maxImageCount) + info->minImageCount = PL_MIN(info->minImageCount, caps.maxImageCount); + + PL_DEBUG(vk, "Requested image size: %dx%d (min %dx%d < cur %dx%d < max %dx%d)", + w, h, caps.minImageExtent.width, caps.minImageExtent.height, + caps.currentExtent.width, caps.currentExtent.height, + caps.maxImageExtent.width, caps.maxImageExtent.height); + + // Default the requested size based on the reported extent + if (caps.currentExtent.width != 0xFFFFFFFF) + w = PL_DEF(w, caps.currentExtent.width); + if (caps.currentExtent.height != 0xFFFFFFFF) + h = PL_DEF(h, caps.currentExtent.height); + + // Otherwise, re-use the existing size if available + w = PL_DEF(w, info->imageExtent.width); + h = PL_DEF(h, info->imageExtent.height); + + if (!w || !h) { + PL_ERR(vk, "Failed resizing swapchain: unknown size?"); + goto error; + } + + // Clamp the extent based on the supported limits + w = PL_CLAMP(w, caps.minImageExtent.width, caps.maxImageExtent.width); + h = PL_CLAMP(h, caps.minImageExtent.height, caps.maxImageExtent.height); + info->imageExtent = (VkExtent2D) { w, h }; + + // We just request whatever makes sense, and let the pl_vk decide what + // pl_tex_params that translates to. 
That said, we still need to intersect + // the swapchain usage flags with the format usage flags + VkImageUsageFlags req_flags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT; + VkImageUsageFlags opt_flags = VK_IMAGE_USAGE_STORAGE_BIT; + + info->imageUsage = caps.supportedUsageFlags & (req_flags | opt_flags); + VkFormatProperties fmtprop = {0}; + vk->GetPhysicalDeviceFormatProperties(vk->physd, info->imageFormat, &fmtprop); + +#define CHECK(usage, feature) \ + if (!((fmtprop.optimalTilingFeatures & VK_FORMAT_FEATURE_##feature##_BIT))) \ + info->imageUsage &= ~VK_IMAGE_USAGE_##usage##_BIT + + CHECK(COLOR_ATTACHMENT, COLOR_ATTACHMENT); + CHECK(TRANSFER_DST, TRANSFER_DST); + CHECK(STORAGE, STORAGE_IMAGE); + + if ((info->imageUsage & req_flags) != req_flags) { + PL_ERR(vk, "The swapchain doesn't support rendering and blitting!"); + goto error; + } + + return true; + +error: + return false; +} + +static void destroy_swapchain(struct vk_ctx *vk, void *swapchain) +{ + vk->DestroySwapchainKHR(vk->dev, vk_unwrap_handle(swapchain), PL_VK_ALLOC); +} + +static bool vk_sw_recreate(pl_swapchain sw, int w, int h) +{ + pl_gpu gpu = sw->gpu; + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + + VkImage *vkimages = NULL; + uint32_t num_images = 0; + + if (!update_swapchain_info(p, &p->protoInfo, w, h)) + return false; + + VkSwapchainCreateInfoKHR sinfo = p->protoInfo; +#ifdef VK_EXT_full_screen_exclusive + // Explicitly disallow full screen exclusive mode if possible + static const VkSurfaceFullScreenExclusiveInfoEXT fsinfo = { + .sType = VK_STRUCTURE_TYPE_SURFACE_FULL_SCREEN_EXCLUSIVE_INFO_EXT, + .fullScreenExclusive = VK_FULL_SCREEN_EXCLUSIVE_DISALLOWED_EXT, + }; + if (vk->AcquireFullScreenExclusiveModeEXT) + vk_link_struct(&sinfo, &fsinfo); +#endif + + p->suboptimal = false; + p->needs_recreate = false; + p->cur_width = sinfo.imageExtent.width; + p->cur_height = sinfo.imageExtent.height; + + PL_DEBUG(sw, "(Re)creating swapchain of size %dx%d", + sinfo.imageExtent.width, + sinfo.imageExtent.height); + +#ifdef PL_HAVE_UNIX + if (vk->props.vendorID == VK_VENDOR_ID_NVIDIA) { + vk->DeviceWaitIdle(vk->dev); + vk_wait_idle(vk); + } +#endif + + // Calling `vkCreateSwapchainKHR` puts sinfo.oldSwapchain into a retired + // state whether the call succeeds or not, so we always need to garbage + // collect it afterwards - asynchronously as it may still be in use + sinfo.oldSwapchain = p->swapchain; + p->swapchain = VK_NULL_HANDLE; + VkResult res = vk->CreateSwapchainKHR(vk->dev, &sinfo, PL_VK_ALLOC, &p->swapchain); + vk_dev_callback(vk, (vk_cb) destroy_swapchain, vk, vk_wrap_handle(sinfo.oldSwapchain)); + PL_VK_ASSERT(res, "vk->CreateSwapchainKHR(...)"); + + // Get the new swapchain images + VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, NULL)); + vkimages = pl_calloc_ptr(NULL, num_images, vkimages); + VK(vk->GetSwapchainImagesKHR(vk->dev, p->swapchain, &num_images, vkimages)); + + for (int i = 0; i < num_images; i++) + PL_VK_NAME(IMAGE, vkimages[i], "swapchain"); + + // If needed, allocate some more semaphores + while (num_images > p->sems.num) { + VkSemaphore sem_in = VK_NULL_HANDLE, sem_out = VK_NULL_HANDLE; + static const VkSemaphoreCreateInfo seminfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_in)); + VK(vk->CreateSemaphore(vk->dev, &seminfo, PL_VK_ALLOC, &sem_out)); + PL_VK_NAME(SEMAPHORE, sem_in, "swapchain in"); + PL_VK_NAME(SEMAPHORE, sem_out, "swapchain out"); + + 
PL_ARRAY_APPEND(sw, p->sems, (struct sem_pair) { + .in = sem_in, + .out = sem_out, + }); + } + + // Recreate the pl_tex wrappers + for (int i = 0; i < p->images.num; i++) + pl_tex_destroy(gpu, &p->images.elem[i]); + p->images.num = 0; + + for (int i = 0; i < num_images; i++) { + const VkExtent2D *ext = &sinfo.imageExtent; + pl_tex tex = pl_vulkan_wrap(gpu, pl_vulkan_wrap_params( + .image = vkimages[i], + .width = ext->width, + .height = ext->height, + .format = sinfo.imageFormat, + .usage = sinfo.imageUsage, + )); + if (!tex) + goto error; + PL_ARRAY_APPEND(sw, p->images, tex); + } + + pl_assert(num_images > 0); + int bits = 0; + + // The channel with the most bits is probably the most authoritative about + // the actual color information (consider e.g. a2bgr10). Slight downside + // in that it results in rounding r/b for e.g. rgb565, but we don't pick + // surfaces with fewer than 8 bits anyway, so let's not care for now. + pl_fmt fmt = p->images.elem[0]->params.format; + for (int i = 0; i < fmt->num_components; i++) + bits = PL_MAX(bits, fmt->component_depth[i]); + + p->color_repr.bits.sample_depth = bits; + p->color_repr.bits.color_depth = bits; + + // Note: `p->color_space.hdr` is (re-)applied by `set_hdr_metadata` + map_color_space(sinfo.imageColorSpace, &p->color_space); + + // Forcibly re-apply HDR metadata, bypassing the no-op check + struct pl_hdr_metadata metadata = p->hdr_metadata; + p->hdr_metadata = pl_hdr_metadata_empty; + set_hdr_metadata(p, &metadata); + + pl_free(vkimages); + return true; + +error: + PL_ERR(vk, "Failed (re)creating swapchain!"); + pl_free(vkimages); + vk->DestroySwapchainKHR(vk->dev, p->swapchain, PL_VK_ALLOC); + p->swapchain = VK_NULL_HANDLE; + p->cur_width = p->cur_height = 0; + return false; +} + +static bool vk_sw_start_frame(pl_swapchain sw, + struct pl_swapchain_frame *out_frame) +{ + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + pl_mutex_lock(&p->lock); + + bool recreate = !p->swapchain || p->needs_recreate; + if (p->suboptimal && !p->params.allow_suboptimal) + recreate = true; + + if (recreate && !vk_sw_recreate(sw, 0, 0)) { + pl_mutex_unlock(&p->lock); + return false; + } + + VkSemaphore sem_in = p->sems.elem[p->idx_sems].in; + PL_TRACE(vk, "vkAcquireNextImageKHR signals 0x%"PRIx64, (uint64_t) sem_in); + + for (int attempts = 0; attempts < 2; attempts++) { + uint32_t imgidx = 0; + VkResult res = vk->AcquireNextImageKHR(vk->dev, p->swapchain, UINT64_MAX, + sem_in, VK_NULL_HANDLE, &imgidx); + + switch (res) { + case VK_SUBOPTIMAL_KHR: + p->suboptimal = true; + // fall through + case VK_SUCCESS: + p->last_imgidx = imgidx; + pl_vulkan_release_ex(sw->gpu, pl_vulkan_release_params( + .tex = p->images.elem[imgidx], + .layout = VK_IMAGE_LAYOUT_UNDEFINED, + .qf = VK_QUEUE_FAMILY_IGNORED, + .semaphore = { sem_in }, + )); + *out_frame = (struct pl_swapchain_frame) { + .fbo = p->images.elem[imgidx], + .flipped = false, + .color_repr = p->color_repr, + .color_space = p->color_space, + }; + // keep lock held + return true; + + case VK_ERROR_OUT_OF_DATE_KHR: { + // In these cases try recreating the swapchain + if (!vk_sw_recreate(sw, 0, 0)) { + pl_mutex_unlock(&p->lock); + return false; + } + continue; + } + + default: + PL_ERR(vk, "Failed acquiring swapchain image: %s", vk_res_str(res)); + pl_mutex_unlock(&p->lock); + return false; + } + } + + // If we've exhausted the number of attempts to recreate the swapchain, + // just give up silently and let the user retry some time later. 
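+    // Note: in contrast to the success path above (which keeps `p->lock` held
+    // until vk_sw_submit_frame), no image was acquired here, so the lock is
+    // dropped again before reporting failure.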
+ pl_mutex_unlock(&p->lock); + return false; +} + +static void present_cb(struct priv *p, void *arg) +{ + (void) pl_rc_deref(&p->frames_in_flight); +} + +static bool vk_sw_submit_frame(pl_swapchain sw) +{ + pl_gpu gpu = sw->gpu; + struct priv *p = PL_PRIV(sw); + struct vk_ctx *vk = p->vk; + pl_assert(p->last_imgidx >= 0); + pl_assert(p->swapchain); + uint32_t idx = p->last_imgidx; + VkSemaphore sem_out = p->sems.elem[p->idx_sems++].out; + p->idx_sems %= p->sems.num; + p->last_imgidx = -1; + + bool held = pl_vulkan_hold_ex(gpu, pl_vulkan_hold_params( + .tex = p->images.elem[idx], + .layout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + .qf = VK_QUEUE_FAMILY_IGNORED, + .semaphore = { sem_out }, + )); + + if (!held) { + PL_ERR(gpu, "Failed holding swapchain image for presentation"); + pl_mutex_unlock(&p->lock); + return false; + } + + struct vk_cmd *cmd = pl_vk_steal_cmd(gpu); + if (!cmd) { + pl_mutex_unlock(&p->lock); + return false; + } + + pl_rc_ref(&p->frames_in_flight); + vk_cmd_callback(cmd, (vk_cb) present_cb, p, NULL); + if (!vk_cmd_submit(&cmd)) { + pl_mutex_unlock(&p->lock); + return false; + } + + struct vk_cmdpool *pool = vk->pool_graphics; + int qidx = pool->idx_queues; + VkQueue queue = pool->queues[qidx]; + + vk_rotate_queues(p->vk); + vk_malloc_garbage_collect(vk->ma); + + VkPresentInfoKHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &sem_out, + .swapchainCount = 1, + .pSwapchains = &p->swapchain, + .pImageIndices = &idx, + }; + + PL_TRACE(vk, "vkQueuePresentKHR waits on 0x%"PRIx64, (uint64_t) sem_out); + vk->lock_queue(vk->queue_ctx, pool->qf, qidx); + VkResult res = vk->QueuePresentKHR(queue, &pinfo); + vk->unlock_queue(vk->queue_ctx, pool->qf, qidx); + pl_mutex_unlock(&p->lock); + + switch (res) { + case VK_SUBOPTIMAL_KHR: + p->suboptimal = true; + // fall through + case VK_SUCCESS: + return true; + + case VK_ERROR_OUT_OF_DATE_KHR: + // We can silently ignore this error, since the next start_frame will + // recreate the swapchain automatically. 
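+        // (Typically the next vkAcquireNextImageKHR will also report
+        // VK_ERROR_OUT_OF_DATE_KHR, which start_frame handles by recreating.)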
+ return true; + + default: + PL_ERR(vk, "Failed presenting to queue %p: %s", (void *) queue, + vk_res_str(res)); + return false; + } +} + +static void vk_sw_swap_buffers(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + + pl_mutex_lock(&p->lock); + while (pl_rc_count(&p->frames_in_flight) >= p->swapchain_depth) { + pl_mutex_unlock(&p->lock); // don't hold mutex while blocking + vk_poll_commands(p->vk, UINT64_MAX); + pl_mutex_lock(&p->lock); + } + pl_mutex_unlock(&p->lock); +} + +static bool vk_sw_resize(pl_swapchain sw, int *width, int *height) +{ + struct priv *p = PL_PRIV(sw); + bool ok = true; + + pl_mutex_lock(&p->lock); + + bool width_changed = *width && *width != p->cur_width, + height_changed = *height && *height != p->cur_height; + + if (p->suboptimal || p->needs_recreate || width_changed || height_changed) + ok = vk_sw_recreate(sw, *width, *height); + + *width = p->cur_width; + *height = p->cur_height; + + pl_mutex_unlock(&p->lock); + return ok; +} + +static void vk_sw_colorspace_hint(pl_swapchain sw, const struct pl_color_space *csp) +{ + struct priv *p = PL_PRIV(sw); + pl_mutex_lock(&p->lock); + + // This should never fail if the swapchain already exists + bool ok = pick_surf_format(sw, csp); + set_hdr_metadata(p, &csp->hdr); + pl_assert(ok); + + pl_mutex_unlock(&p->lock); +} + +bool pl_vulkan_swapchain_suboptimal(pl_swapchain sw) +{ + struct priv *p = PL_PRIV(sw); + return p->suboptimal; +} + +static const struct pl_sw_fns vulkan_swapchain = { + .destroy = vk_sw_destroy, + .latency = vk_sw_latency, + .resize = vk_sw_resize, + .colorspace_hint = vk_sw_colorspace_hint, + .start_frame = vk_sw_start_frame, + .submit_frame = vk_sw_submit_frame, + .swap_buffers = vk_sw_swap_buffers, +}; diff --git a/src/vulkan/utils.c b/src/vulkan/utils.c new file mode 100644 index 0000000..914f9e4 --- /dev/null +++ b/src/vulkan/utils.c @@ -0,0 +1,181 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "utils.h" + +VkExternalMemoryHandleTypeFlagBitsKHR +vk_mem_handle_type(enum pl_handle_type handle_type) +{ + if (!handle_type) + return 0; + + switch (handle_type) { + case PL_HANDLE_FD: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + case PL_HANDLE_WIN32: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR; + case PL_HANDLE_WIN32_KMT: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR; + case PL_HANDLE_DMA_BUF: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT; + case PL_HANDLE_HOST_PTR: + return VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT; + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + return 0; + } + + pl_unreachable(); +} + +VkExternalSemaphoreHandleTypeFlagBitsKHR +vk_sync_handle_type(enum pl_handle_type handle_type) +{ + if (!handle_type) + return 0; + + switch (handle_type) { + case PL_HANDLE_FD: + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + case PL_HANDLE_WIN32: + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR; + case PL_HANDLE_WIN32_KMT: + return VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR; + case PL_HANDLE_DMA_BUF: + case PL_HANDLE_HOST_PTR: + case PL_HANDLE_MTL_TEX: + case PL_HANDLE_IOSURFACE: + return 0; + } + + pl_unreachable(); +} + +bool vk_external_mem_check(struct vk_ctx *vk, + const VkExternalMemoryPropertiesKHR *props, + enum pl_handle_type handle_type, + bool import) +{ + VkExternalMemoryFeatureFlagsKHR flags = props->externalMemoryFeatures; + VkExternalMemoryHandleTypeFlagBitsKHR vk_handle = vk_mem_handle_type(handle_type); + + if (import) { + if (!(flags & VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_KHR)) { + PL_DEBUG(vk, "Handle type %s (0x%x) is not importable", + vk_handle_name(vk_handle), (unsigned int) handle_type); + return false; + } + } else { + if (!(flags & VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_KHR)) { + PL_DEBUG(vk, "Handle type %s (0x%x) is not exportable", + vk_handle_name(vk_handle), (unsigned int) handle_type); + return false; + } + } + + return true; +} + +const enum pl_handle_type vk_mem_handle_list[] = { + PL_HANDLE_HOST_PTR, +#ifdef PL_HAVE_UNIX + PL_HANDLE_FD, + PL_HANDLE_DMA_BUF, +#endif +#ifdef PL_HAVE_WIN32 + PL_HANDLE_WIN32, + PL_HANDLE_WIN32_KMT, +#endif + 0 +}; + +const enum pl_handle_type vk_sync_handle_list[] = { +#ifdef PL_HAVE_UNIX + PL_HANDLE_FD, +#endif +#ifdef PL_HAVE_WIN32 + PL_HANDLE_WIN32, + PL_HANDLE_WIN32_KMT, +#endif + 0 +}; + +const void *vk_find_struct(const void *chain, VkStructureType stype) +{ + const VkBaseInStructure *in = chain; + while (in) { + if (in->sType == stype) + return in; + + in = in->pNext; + } + + return NULL; +} + +void vk_link_struct(void *chain, const void *in) +{ + if (!in) + return; + + VkBaseOutStructure *out = chain; + while (out->pNext) + out = out->pNext; + + out->pNext = (void *) in; +} + +void *vk_struct_memdup(void *alloc, const void *pin) +{ + if (!pin) + return NULL; + + const VkBaseInStructure *in = pin; + size_t size = vk_struct_size(in->sType); + pl_assert(size); + + VkBaseOutStructure *out = pl_memdup(alloc, in, size); + out->pNext = NULL; + return out; +} + +void *vk_chain_memdup(void *alloc, const void *pin) +{ + if (!pin) + return NULL; + + const VkBaseInStructure *in = pin; + VkBaseOutStructure *out = vk_struct_memdup(alloc, in); + pl_assert(out); + + out->pNext = vk_chain_memdup(alloc, in->pNext); + return out; +} + +void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype) +{ + for (VkBaseOutStructure *out = chain;; out = out->pNext) { + if 
(out->sType == stype) + return out; + if (!out->pNext) { + VkBaseOutStructure *s = pl_zalloc(alloc, vk_struct_size(stype)); + s->sType = stype; + out->pNext = s; + return s; + } + } +} diff --git a/src/vulkan/utils.h b/src/vulkan/utils.h new file mode 100644 index 0000000..cb1c5f5 --- /dev/null +++ b/src/vulkan/utils.h @@ -0,0 +1,136 @@ +/* + * This file is part of libplacebo. + * + * libplacebo is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * libplacebo is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "common.h" + +// Return a human-readable name for various vulkan enums +const char *vk_res_str(VkResult res); +const char *vk_fmt_name(VkFormat fmt); +const char *vk_csp_name(VkColorSpaceKHR csp); +const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle); +const char *vk_obj_type(VkObjectType obj); +const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha); +const char *vk_surface_transform(VkSurfaceTransformFlagsKHR transform); + +// Return the size of an arbitrary vulkan struct. Returns 0 for unknown structs +size_t vk_struct_size(VkStructureType stype); + +// Returns the vulkan API version which a given extension was promoted to, or 0 +// if the extension is not promoted. +uint32_t vk_ext_promoted_ver(const char *extension); + +// Enum translation boilerplate +VkExternalMemoryHandleTypeFlagBitsKHR vk_mem_handle_type(enum pl_handle_type); +VkExternalSemaphoreHandleTypeFlagBitsKHR vk_sync_handle_type(enum pl_handle_type); + +// Bitmask of all access flags that imply a read/write operation, respectively +extern const VkAccessFlags2 vk_access_read; +extern const VkAccessFlags2 vk_access_write; + +// Check for compatibility of a VkExternalMemoryProperties +bool vk_external_mem_check(struct vk_ctx *vk, + const VkExternalMemoryPropertiesKHR *props, + enum pl_handle_type handle_type, + bool check_import); + +// Static lists of external handle types we should try probing for +extern const enum pl_handle_type vk_mem_handle_list[]; +extern const enum pl_handle_type vk_sync_handle_list[]; + +// Find a structure in a pNext chain, or NULL +const void *vk_find_struct(const void *chain, VkStructureType stype); + +// Link a structure into a pNext chain +void vk_link_struct(void *chain, const void *in); + +// Make a copy of a structure, not including the pNext chain +void *vk_struct_memdup(void *alloc, const void *in); + +// Make a deep copy of an entire pNext chain +void *vk_chain_memdup(void *alloc, const void *in); + +// Find a structure in a pNext chain, or allocate + link it if absent. +void *vk_chain_alloc(void *alloc, void *chain, VkStructureType stype); + +// Renormalize input features into a state consistent for a given API version. +// If `api_ver` is specified as 0, *both* meta-structs and extension structs +// will be emitted. Note: `out` should be initialized by the user. In +// particular, if it already contains a valid features chain, then this +// function will effectively act as a union. 
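+//
+// Illustrative sketch only (`tmp` and `supported` are placeholder names, not
+// part of this API):
+//
+//     VkPhysicalDeviceFeatures2 out = {
+//         .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
+//     };
+//     vk_features_normalize(tmp, &supported, VK_API_VERSION_1_3, &out);
+//     // `out` and its pNext chain (allocated from `tmp`) now express the same
+//     // features using only structs consistent with Vulkan 1.3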
+void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *in, + uint32_t api_ver, VkPhysicalDeviceFeatures2 *out); + +// Convenience macros to simplify a lot of common boilerplate +#define PL_VK_ASSERT(res, str) \ + do { \ + if (res != VK_SUCCESS) { \ + PL_ERR(vk, str ": %s (%s:%d)", \ + vk_res_str(res), __FILE__, __LINE__); \ + goto error; \ + } \ + } while (0) + +#define VK(cmd) \ + do { \ + PL_TRACE(vk, #cmd); \ + VkResult _res = (cmd); \ + PL_VK_ASSERT(_res, #cmd); \ + } while (0) + +#define PL_VK_NAME(type, obj, name) \ + do { \ + if (vk->SetDebugUtilsObjectNameEXT) { \ + vk->SetDebugUtilsObjectNameEXT(vk->dev, &(VkDebugUtilsObjectNameInfoEXT) { \ + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT, \ + .objectType = VK_OBJECT_TYPE_##type, \ + .objectHandle = (uint64_t) (obj), \ + .pObjectName = (name), \ + }); \ + } \ + } while (0) + +// Variant of PL_VK_NAME for dispatchable handles +#define PL_VK_NAME_HANDLE(type, obj, name) \ + PL_VK_NAME(type, (uintptr_t) (obj), name) + +// Helper functions to wrap and unwrap non-dispatchable handles into pointers. +// Note that wrap/unwrap must always be used linearly. +#if VK_USE_64_BIT_PTR_DEFINES == 1 +#define vk_wrap_handle(h) (h) +#define vk_unwrap_handle(h) (h) +#elif UINTPTR_MAX >= UINT64_MAX +#define vk_wrap_handle(h) ((void *) (uintptr_t) (h)) +#define vk_unwrap_handle(h) ((uint64_t) (uintptr_t) (h)) +#else +static inline void *vk_wrap_handle(uint64_t h) +{ + uint64_t *wrapper = malloc(sizeof(h)); + assert(wrapper); + *wrapper = h; + return wrapper; +} + +static inline uint64_t vk_unwrap_handle(void *h) +{ + uint64_t *wrapper = h; + uint64_t ret = *wrapper; + free(wrapper); + return ret; +} +#endif diff --git a/src/vulkan/utils_gen.c.j2 b/src/vulkan/utils_gen.c.j2 new file mode 100644 index 0000000..6db0454 --- /dev/null +++ b/src/vulkan/utils_gen.c.j2 @@ -0,0 +1,137 @@ +#define VK_ENABLE_BETA_EXTENSIONS +#include "vulkan/utils.h" + +const char *vk_res_str(VkResult res) +{ + switch (res) { +{% for res in vkresults %} + case {{ res }}: return "{{ res }}"; +{% endfor %} + + default: return "unknown error"; + } +} + +const char *vk_fmt_name(VkFormat fmt) +{ + switch (fmt) { +{% for fmt in vkformats %} + case {{ fmt }}: return "{{ fmt }}"; +{% endfor %} + + default: return "unknown format"; + } +} + +const char *vk_csp_name(VkColorSpaceKHR csp) +{ + switch (csp) { +{% for csp in vkspaces %} + case {{ csp }}: return "{{ csp }}"; +{% endfor %} + + default: return "unknown color space"; + } +} + +const char *vk_handle_name(VkExternalMemoryHandleTypeFlagBitsKHR handle) +{ + switch (handle) { +{% for handle in vkhandles %} + case {{ handle }}: return "{{ handle }}"; +{% endfor %} + + default: return "unknown handle type"; + } +} + +const char *vk_alpha_mode(VkCompositeAlphaFlagsKHR alpha) +{ + switch (alpha) { +{% for mode in vkalphas %} + case {{ mode }}: return "{{ mode }}"; +{% endfor %} + + default: return "unknown alpha mode"; + } +} + +const char *vk_surface_transform(VkSurfaceTransformFlagsKHR tf) +{ + switch (tf) { +{% for tf in vktransforms %} + case {{ tf }}: return "{{ tf }}"; +{% endfor %} + + default: return "unknown surface transform"; + } +} + + +const char *vk_obj_type(VkObjectType obj) +{ + switch (obj) { +{% for obj in vkobjects %} + case {{ obj.enum }}: return "{{ obj.name }}"; +{% endfor %} + + default: return "unknown object"; + } +} + +size_t vk_struct_size(VkStructureType stype) +{ + switch (stype) { +{% for struct in vkstructs %} + case {{ struct.stype }}: return sizeof({{ struct.name }}); +{% 
endfor %} + + default: return 0; + } +} + +uint32_t vk_ext_promoted_ver(const char *extension) +{ +{% for ext in vkexts %} +{% if ext.promoted_ver %} + if (!strcmp(extension, "{{ ext.name }}")) + return {{ ext.promoted_ver }}; +{% endif %} +{% endfor %} + return 0; +} + +void vk_features_normalize(void *alloc, const VkPhysicalDeviceFeatures2 *fin, + uint32_t api_ver, VkPhysicalDeviceFeatures2 *out) +{ + for (const VkBaseInStructure *in = (void *) fin; in; in = in->pNext) { + switch (in->sType) { + default: break; +{% for fs in vkfeatures %} + case {{ fs.stype }}: { + const {{ fs.name }} *i = (const void *) in; +{% for f in fs.features %} + if (i->{{ f.name }}) { +{% for r in f.replacements %} +{% if r.core_ver %} + if (!api_ver || api_ver >= {{ r.core_ver }}) +{% elif r.max_ver %} + if (!api_ver || api_ver < {{ r.max_ver }}) +{% endif %} +{% if fs.is_base %} + out->{{ f.name }} = true; +{% else %} + (({{ r.name }} *) vk_chain_alloc(alloc, out, {{ r.stype }}))->{{ f.name }} = true; +{% endif %} +{% endfor %} + } +{% endfor %} + break; + } +{% endfor %} + } + } +} + +const VkAccessFlags2 vk_access_read = {{ '0x%x' % vkaccess.read }}LLU; +const VkAccessFlags2 vk_access_write = {{ '0x%x' % vkaccess.write }}LLU; diff --git a/src/vulkan/utils_gen.py b/src/vulkan/utils_gen.py new file mode 100644 index 0000000..a8652fd --- /dev/null +++ b/src/vulkan/utils_gen.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +# +# This file is part of libplacebo. +# +# libplacebo is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# libplacebo is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with libplacebo. If not, see <http://www.gnu.org/licenses/>. + +import os.path +import re +import sys +import xml.etree.ElementTree as ET + +try: + import jinja2 +except ModuleNotFoundError: + print('Module \'jinja2\' not found, please install \'python3-Jinja2\' or ' + 'an equivalent package on your system! 
Alternatively, run '
+          '`git submodule update --init` followed by `meson --wipe`.',
+          file=sys.stderr)
+    sys.exit(1)
+
+TEMPLATE = jinja2.Environment(
+    loader = jinja2.FileSystemLoader(searchpath=os.path.dirname(__file__)),
+    trim_blocks=True,
+).get_template('utils_gen.c.j2')
+
+class Obj(object):
+    def __init__(self, **kwargs):
+        self.__dict__.update(kwargs)
+
+class VkXML(ET.ElementTree):
+    def blacklist_block(self, req):
+        for t in req.iterfind('type'):
+            self.blacklist_types.add(t.attrib['name'])
+        for e in req.iterfind('enum'):
+            self.blacklist_enums.add(e.attrib['name'])
+
+    def __init__(self, *args, **kwargs):
+
+        super().__init__(*args, **kwargs)
+        self.blacklist_types = set()
+        self.blacklist_enums = set()
+
+        for f in self.iterfind('feature'):
+            # Feature block for non-Vulkan API
+            if not 'vulkan' in f.attrib['api'].split(','):
+                for r in f.iterfind('require'):
+                    self.blacklist_block(r)
+
+        for e in self.iterfind('extensions/extension'):
+            # Entire extension is unsupported on vulkan or platform-specific
+            if not 'vulkan' in e.attrib['supported'].split(',') or 'platform' in e.attrib:
+                for r in e.iterfind('require'):
+                    self.blacklist_block(r)
+                continue
+
+            # Only individual <require> blocks are API-specific
+            for r in e.iterfind('require[@api]'):
+                if not 'vulkan' in r.attrib['api'].split(','):
+                    self.blacklist_block(r)
+
+    def findall_enum(self, name):
+        for e in self.iterfind('enums[@name="{0}"]/enum'.format(name)):
+            if not 'alias' in e.attrib:
+                if not e.attrib['name'] in self.blacklist_enums:
+                    yield e
+        for e in self.iterfind('.//enum[@extends="{0}"]'.format(name)):
+            if not 'alias' in e.attrib:
+                if not e.attrib['name'] in self.blacklist_enums:
+                    yield e
+
+    def findall_type(self, category):
+        for t in self.iterfind('types/type[@category="{0}"]'.format(category)):
+            name = t.attrib.get('name') or t.find('name').text
+            if name in self.blacklist_types:
+                continue
+            yield t
+
+
+def get_vkenum(registry, enum):
+    for e in registry.findall_enum(enum):
+        yield e.attrib['name']
+
+def get_vkobjects(registry):
+    for t in registry.findall_type('handle'):
+        if 'objtypeenum' in t.attrib:
+            yield Obj(enum = t.attrib['objtypeenum'],
+                      name = t.find('name').text)
+
+def get_vkstructs(registry):
+    for t in registry.findall_type('struct'):
+        stype = None
+        for m in t.iterfind('member'):
+            if m.find('name').text == 'sType':
+                stype = m
+                break
+
+        if stype is not None and 'values' in stype.attrib:
+            yield Obj(stype = stype.attrib['values'],
+                      name = t.attrib['name'])
+
+def get_vkaccess(registry):
+    access = Obj(read = 0, write = 0)
+    for e in registry.findall_enum('VkAccessFlagBits2'):
+        if '_READ_' in e.attrib['name']:
+            access.read |= 1 << int(e.attrib['bitpos'])
+        if '_WRITE_' in e.attrib['name']:
+            access.write |= 1 << int(e.attrib['bitpos'])
+    return access
+
+def get_vkexts(registry):
+    for e in registry.iterfind('extensions/extension'):
+        promoted_ver = None
+        if res := re.match(r'VK_VERSION_(\d)_(\d)', e.attrib.get('promotedto', '')):
+            promoted_ver = 'VK_API_VERSION_{0}_{1}'.format(res[1], res[2])
+        yield Obj(name = e.attrib['name'],
+                  promoted_ver = promoted_ver)
+
+def get_vkfeatures(registry):
+    structs = [];
+    featuremap = {}; # features -> [struct]
+    for t in registry.findall_type('struct'):
+        sname = t.attrib['name']
+        is_base = sname == 'VkPhysicalDeviceFeatures'
+        extends = t.attrib.get('structextends', [])
+        if is_base:
+            sname = 'VkPhysicalDeviceFeatures2'
+            stype = 'VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2'
+        elif not 'VkPhysicalDeviceFeatures2' in extends:
+            
continue + + features = [] + for f in t.iterfind('member'): + if f.find('type').text == 'VkStructureType': + stype = f.attrib['values'] + elif f.find('type').text == 'VkBool32': + fname = f.find('name').text + if is_base: + fname = 'features.' + fname + features.append(Obj(name = fname)) + + core_ver = None + if res := re.match(r'VkPhysicalDeviceVulkan(\d)(\d)Features', sname): + core_ver = 'VK_API_VERSION_{0}_{1}'.format(res[1], res[2]) + + struct = Obj(name = sname, + stype = stype, + core_ver = core_ver, + is_base = is_base, + features = features) + + structs.append(struct) + for f in features: + featuremap.setdefault(f.name, []).append(struct) + + for s in structs: + for f in s.features: + f.replacements = featuremap[f.name] + core_ver = next(( r.core_ver for r in f.replacements if r.core_ver ), None) + for r in f.replacements: + if not r.core_ver: + r.max_ver = core_ver + + yield from structs + +def find_registry_xml(datadir): + registry_paths = [ + '{0}/vulkan/registry/vk.xml'.format(datadir), + '$MINGW_PREFIX/share/vulkan/registry/vk.xml', + '%VULKAN_SDK%/share/vulkan/registry/vk.xml', + '$VULKAN_SDK/share/vulkan/registry/vk.xml', + '/usr/share/vulkan/registry/vk.xml', + ] + + for p in registry_paths: + path = os.path.expandvars(p) + if os.path.isfile(path): + print('Found vk.xml: {0}'.format(path)) + return path + + print('Could not find the vulkan registry (vk.xml), please specify its ' + 'location manually using the -Dvulkan-registry=/path/to/vk.xml ' + 'option!', file=sys.stderr) + sys.exit(1) + +if __name__ == '__main__': + assert len(sys.argv) == 4 + datadir = sys.argv[1] + xmlfile = sys.argv[2] + outfile = sys.argv[3] + + if not xmlfile or xmlfile == '': + xmlfile = find_registry_xml(datadir) + + registry = VkXML(ET.parse(xmlfile)) + with open(outfile, 'w') as f: + f.write(TEMPLATE.render( + vkresults = get_vkenum(registry, 'VkResult'), + vkformats = get_vkenum(registry, 'VkFormat'), + vkspaces = get_vkenum(registry, 'VkColorSpaceKHR'), + vkhandles = get_vkenum(registry, 'VkExternalMemoryHandleTypeFlagBits'), + vkalphas = get_vkenum(registry, 'VkCompositeAlphaFlagBitsKHR'), + vktransforms = get_vkenum(registry, 'VkSurfaceTransformFlagBitsKHR'), + vkobjects = get_vkobjects(registry), + vkstructs = get_vkstructs(registry), + vkaccess = get_vkaccess(registry), + vkexts = get_vkexts(registry), + vkfeatures = get_vkfeatures(registry), + )) |