Diffstat (limited to 'src/vulkan/gpu_buf.c')
-rw-r--r--  src/vulkan/gpu_buf.c  470
1 file changed, 470 insertions, 0 deletions
diff --git a/src/vulkan/gpu_buf.c b/src/vulkan/gpu_buf.c
new file mode 100644
index 0000000..2f317bc
--- /dev/null
+++ b/src/vulkan/gpu_buf.c
@@ -0,0 +1,470 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+
+void vk_buf_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
+ VkPipelineStageFlags2 stage, VkAccessFlags2 access,
+ size_t offset, size_t size, bool export)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_assert(!export || !buf_vk->exported); // can't re-export exported buffers
+ pl_rc_ref(&buf_vk->rc);
+
+ bool needs_flush = buf_vk->needs_flush || buf->params.host_mapped ||
+ buf->params.import_handle == PL_HANDLE_HOST_PTR;
+ bool noncoherent = buf_vk->mem.data && !buf_vk->mem.coherent;
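+ // Note: host writes to non-coherent mapped memory are not guaranteed to be
+ // visible to the device until flushed with vkFlushMappedMemoryRanges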
+ if (needs_flush && noncoherent) {
+ VK(vk->FlushMappedMemoryRanges(vk->dev, 1, &(struct VkMappedMemoryRange) {
+ .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+ .memory = buf_vk->mem.vkmem,
+ .offset = buf_vk->mem.map_offset,
+ .size = buf_vk->mem.map_size,
+ }));
+
+ // Just ignore errors, not much we can do about them other than
+ // logging them and moving on...
+ error: ;
+ }
+
+ struct vk_sync_scope last;
+ last = vk_sem_barrier(cmd, &buf_vk->sem, stage, access, export);
+
+ // CONCURRENT buffers require transitioning to/from IGNORED, EXCLUSIVE
+ // buffers require transitioning to/from the concrete QF index
+ uint32_t qf = vk->pools.num > 1 ? VK_QUEUE_FAMILY_IGNORED : cmd->pool->qf;
+ uint32_t src_qf = buf_vk->exported ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
+ uint32_t dst_qf = export ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
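+ // (e.g. for an EXCLUSIVE buffer that was previously exported, this acquires
+ // it back from VK_QUEUE_FAMILY_EXTERNAL_KHR to the queue family of `cmd`)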
+
+ if (last.access || src_qf != dst_qf) {
+ vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .bufferMemoryBarrierCount = 1,
+ .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = last.stage,
+ .srcAccessMask = last.access,
+ .dstStageMask = stage,
+ .dstAccessMask = access,
+ .srcQueueFamilyIndex = src_qf,
+ .dstQueueFamilyIndex = dst_qf,
+ .buffer = buf_vk->mem.buf,
+ .offset = buf_vk->mem.offset + offset,
+ .size = size,
+ },
+ });
+ }
+
+ buf_vk->needs_flush = false;
+ buf_vk->exported = export;
+ vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, gpu, buf);
+}
+
+void vk_buf_deref(pl_gpu gpu, pl_buf buf)
+{
+ if (!buf)
+ return;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ if (pl_rc_deref(&buf_vk->rc)) {
+ vk->DestroyBufferView(vk->dev, buf_vk->view, PL_VK_ALLOC);
+ vk_malloc_free(vk->ma, &buf_vk->mem);
+ pl_free((void *) buf);
+ }
+}
+
+pl_buf vk_buf_create(pl_gpu gpu, const struct pl_buf_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_vk);
+ buf->params = *params;
+ buf->params.initial_data = NULL;
+
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_rc_init(&buf_vk->rc);
+
+ struct vk_malloc_params mparams = {
+ .reqs = {
+ .size = PL_ALIGN2(params->size, 4), // for vk_buf_write
+ .memoryTypeBits = UINT32_MAX,
+ .alignment = 1,
+ },
+ // these are always set, because `vk_buf_copy` can always be used
+ .buf_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+ VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+ .export_handle = params->export_handle,
+ .import_handle = params->import_handle,
+ .shared_mem = params->shared_mem,
+ .debug_tag = params->debug_tag,
+ };
+
+ // Mandatory/optimal buffer offset alignment
+ VkDeviceSize *align = &mparams.reqs.alignment;
+ VkDeviceSize extra_align = vk->props.limits.optimalBufferCopyOffsetAlignment;
+
+ // Try and align all buffers to the minimum texel alignment, to make sure
+ // tex_upload/tex_download always gets aligned buffer copies if possible
+ extra_align = pl_lcm(extra_align, p->min_texel_alignment);
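+ // (illustrative example: with optimalBufferCopyOffsetAlignment = 4 and a
+ // minimum texel alignment of 16, extra_align becomes lcm(4, 16) = 16)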
+
+ enum pl_buf_mem_type mem_type = params->memory_type;
+ bool is_texel = false;
+
+ if (params->uniform) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+ *align = pl_lcm(*align, vk->props.limits.minUniformBufferOffsetAlignment);
+ mem_type = PL_BUF_MEM_DEVICE;
+ if (params->format) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
+ is_texel = true;
+ }
+ }
+
+ if (params->storable) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+ *align = pl_lcm(*align, vk->props.limits.minStorageBufferOffsetAlignment);
+ buf_vk->update_queue = COMPUTE;
+ mem_type = PL_BUF_MEM_DEVICE;
+ if (params->format) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
+ is_texel = true;
+ }
+ }
+
+ if (is_texel) {
+ *align = pl_lcm(*align, vk->props.limits.minTexelBufferOffsetAlignment);
+ *align = pl_lcm(*align, params->format->texel_size);
+ }
+
+ if (params->drawable) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
+ VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
+ mem_type = PL_BUF_MEM_DEVICE;
+ }
+
+ if (params->host_writable || params->initial_data) {
+ // Buffers should be written using mapped memory if possible
+ mparams.optimal = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+ // Use the transfer queue for updates on very large buffers (over 1 MB)
+ if (params->size > 1024*1024)
+ buf_vk->update_queue = TRANSFER;
+ }
+
+ if (params->host_mapped || params->host_readable) {
+ mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+
+ if (params->size > 1024) {
+ // Prefer cached memory for larger buffers (over 1 kB) which may be read
+ // from, because uncached reads are extremely slow
+ mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+ }
+ }
+
+ switch (mem_type) {
+ case PL_BUF_MEM_AUTO:
+ // We generally prefer VRAM since it's faster than RAM, but any number
+ // of other requirements could potentially exclude it, so just mark it
+ // as optimal by default.
+ if (!(mparams.optimal & VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
+ mparams.optimal |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+ break;
+ case PL_BUF_MEM_DEVICE:
+ // Force device local memory.
+ mparams.required |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+ break;
+ case PL_BUF_MEM_HOST:
+ // This isn't a true guarantee, but actually trying to restrict the
+ // device-local bit locks out all memory heaps on iGPUs. Requiring
+ // the memory be host-mapped is the easiest compromise.
+ mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+ mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+ break;
+ case PL_BUF_MEM_TYPE_COUNT:
+ pl_unreachable();
+ }
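+ // (illustrative example: a host_readable download buffer larger than 1 kB
+ // with PL_BUF_MEM_AUTO ends up requiring HOST_VISIBLE and preferring
+ // HOST_CACHED instead of DEVICE_LOCAL)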
+
+ if (params->import_handle) {
+ size_t offset = params->shared_mem.offset;
+ if (PL_ALIGN(offset, *align) != offset) {
+ PL_ERR(gpu, "Imported memory offset %zu violates minimum alignment "
+ "requirement of enabled usage flags (%zu)!",
+ offset, (size_t) *align);
+ goto error;
+ }
+ } else {
+ *align = pl_lcm(*align, extra_align);
+ }
+
+ if (!vk_malloc_slice(vk->ma, &buf_vk->mem, &mparams))
+ goto error;
+
+ if (params->host_mapped)
+ buf->data = buf_vk->mem.data;
+
+ if (params->export_handle) {
+ buf->shared_mem = buf_vk->mem.shared_mem;
+ buf->shared_mem.drm_format_mod = DRM_FORMAT_MOD_LINEAR;
+ buf_vk->exported = true;
+ }
+
+ if (is_texel) {
+ struct pl_fmt_vk *fmtp = PL_PRIV(params->format);
+ VkBufferViewCreateInfo vinfo = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
+ .buffer = buf_vk->mem.buf,
+ .format = PL_DEF(fmtp->vk_fmt->bfmt, fmtp->vk_fmt->tfmt),
+ .offset = buf_vk->mem.offset,
+ .range = buf_vk->mem.size,
+ };
+
+ VK(vk->CreateBufferView(vk->dev, &vinfo, PL_VK_ALLOC, &buf_vk->view));
+ PL_VK_NAME(BUFFER_VIEW, buf_vk->view, PL_DEF(params->debug_tag, "texel"));
+ }
+
+ if (params->initial_data)
+ vk_buf_write(gpu, buf, 0, params->initial_data, params->size);
+
+ return buf;
+
+error:
+ vk_buf_deref(gpu, buf);
+ return NULL;
+}
+
+static void invalidate_buf(pl_gpu gpu, pl_buf buf)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ if (buf_vk->mem.data && !buf_vk->mem.coherent) {
+ VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
+ .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+ .memory = buf_vk->mem.vkmem,
+ .offset = buf_vk->mem.map_offset,
+ .size = buf_vk->mem.map_size,
+ }));
+ }
+
+ // Ignore errors (after logging), nothing useful we can do anyway
+error: ;
+ vk_buf_deref(gpu, buf);
+}
+
+void vk_buf_flush(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
+ size_t offset, size_t size)
+{
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // We need to perform a flush if the host is capable of reading back from
+ // the buffer, or if we intend to overwrite it using mapped memory
+ bool can_read = buf->params.host_readable;
+ bool can_write = buf_vk->mem.data && buf->params.host_writable;
+ if (buf->params.host_mapped || buf->params.import_handle == PL_HANDLE_HOST_PTR)
+ can_read = can_write = true;
+
+ if (!can_read && !can_write)
+ return;
+
+ vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .bufferMemoryBarrierCount = 1,
+ .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = buf_vk->sem.write.stage,
+ .srcAccessMask = buf_vk->sem.write.access,
+ .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
+ .dstAccessMask = (can_read ? VK_ACCESS_2_HOST_READ_BIT : 0)
+ | (can_write ? VK_ACCESS_2_HOST_WRITE_BIT : 0),
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = buf_vk->mem.buf,
+ .offset = buf_vk->mem.offset + offset,
+ .size = size,
+ },
+ });
+
+ // We need to hold on to the buffer until this barrier completes
+ vk_cmd_callback(cmd, (vk_cb) invalidate_buf, gpu, buf);
+ pl_rc_ref(&buf_vk->rc);
+}
+
+bool vk_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // Opportunistically check if we can re-use this buffer without flush
+ vk_poll_commands(vk, 0);
+ if (pl_rc_count(&buf_vk->rc) == 1)
+ return false;
+
+ // Otherwise, we're forced to submit any queued commands so that the
+ // user is guaranteed to see progress eventually, even if they call
+ // this in a tight loop
+ CMD_SUBMIT(NULL);
+ vk_poll_commands(vk, timeout);
+
+ return pl_rc_count(&buf_vk->rc) > 1;
+}
+
+void vk_buf_write(pl_gpu gpu, pl_buf buf, size_t offset,
+ const void *data, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // For host-mapped buffers, we can just directly memcpy the buffer contents.
+ // Otherwise, we can update the buffer from the GPU using a command buffer.
+ if (buf_vk->mem.data) {
+ // wait until no queued command still references this buffer, since
+ // overwriting memory the GPU may still be accessing would be a race
+ while (vk_buf_poll(gpu, buf, UINT64_MAX))
+ ; // do nothing
+
+ uintptr_t addr = (uintptr_t) buf_vk->mem.data + offset;
+ memcpy((void *) addr, data, size);
+ buf_vk->needs_flush = true;
+ } else {
+ struct vk_cmd *cmd = CMD_BEGIN(buf_vk->update_queue);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed updating buffer!");
+ return;
+ }
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, offset, size, false);
+
+ // Vulkan requires `size` to be a multiple of 4, so we need to make
+ // sure to handle the end separately if the original data is not
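+ // (vkCmdUpdateBuffer also caps dataSize at 64 KiB per call, hence max_transfer)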
+ const size_t max_transfer = 64 * 1024;
+ size_t size_rem = size % 4;
+ size_t size_base = size - size_rem;
+ VkDeviceSize buf_offset = buf_vk->mem.offset + offset;
+
+ if (size_base > max_transfer) {
+ PL_TRACE(gpu, "Using multiple vkCmdUpdateBuffer calls to upload "
+ "large buffer. Consider using buffer-buffer transfers "
+ "instead!");
+ }
+
+ for (size_t xfer = 0; xfer < size_base; xfer += max_transfer) {
+ vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf,
+ buf_offset + xfer,
+ PL_MIN(size_base - xfer, max_transfer),
+ (void *) ((uint8_t *) data + xfer));
+ }
+
+ if (size_rem) {
+ uint8_t tail[4] = {0};
+ memcpy(tail, (uint8_t *) data + size_base, size_rem);
+ vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, buf_offset + size_base,
+ sizeof(tail), tail);
+ }
+
+ pl_assert(!buf->params.host_readable); // no flush needed due to this
+ CMD_FINISH(&cmd);
+ }
+}
+
+bool vk_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_assert(buf_vk->mem.data);
+
+ if (vk_buf_poll(gpu, buf, 0) && buf_vk->sem.write.sync.sem) {
+ // ensure no more queued writes
+ VK(vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+ .semaphoreCount = 1,
+ .pSemaphores = &buf_vk->sem.write.sync.sem,
+ .pValues = &buf_vk->sem.write.sync.value,
+ }, UINT64_MAX));
+
+ // process callbacks
+ vk_poll_commands(vk, 0);
+ }
+
+ uintptr_t addr = (uintptr_t) buf_vk->mem.data + (size_t) offset;
+ memcpy(dest, (void *) addr, size);
+ return true;
+
+error:
+ return false;
+}
+
+void vk_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
+ pl_buf src, size_t src_offset, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *dst_vk = PL_PRIV(dst);
+ struct pl_buf_vk *src_vk = PL_PRIV(src);
+
+ struct vk_cmd *cmd = CMD_BEGIN(dst_vk->update_queue);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed copying buffer!");
+ return;
+ }
+
+ vk_buf_barrier(gpu, cmd, dst, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, dst_offset, size, false);
+ vk_buf_barrier(gpu, cmd, src, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT, src_offset, size, false);
+
+ VkBufferCopy region = {
+ .srcOffset = src_vk->mem.offset + src_offset,
+ .dstOffset = dst_vk->mem.offset + dst_offset,
+ .size = size,
+ };
+
+ vk->CmdCopyBuffer(cmd->buf, src_vk->mem.buf, dst_vk->mem.buf,
+ 1, &region);
+
+ vk_buf_flush(gpu, cmd, dst, dst_offset, size);
+ CMD_FINISH(&cmd);
+}
+
+bool vk_buf_export(pl_gpu gpu, pl_buf buf)
+{
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ if (buf_vk->exported)
+ return true;
+
+ struct vk_cmd *cmd = CMD_BEGIN(ANY);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed exporting buffer!");
+ return false;
+ }
+
+ // For the queue family ownership transfer, we can ignore all pipeline
+ // stages since synchronization via fences/semaphores is required anyway
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_NONE, 0, 0,
+ buf->params.size, true);
+
+ return CMD_SUBMIT(&cmd);
+}