From ff6e3c025658a5fa1affd094f220b623e7e1b24b Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Mon, 15 Apr 2024 22:38:23 +0200
Subject: Adding upstream version 6.338.2.

Signed-off-by: Daniel Baumann
---
 src/vulkan/gpu_buf.c | 470 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 470 insertions(+)
 create mode 100644 src/vulkan/gpu_buf.c

diff --git a/src/vulkan/gpu_buf.c b/src/vulkan/gpu_buf.c
new file mode 100644
index 0000000..2f317bc
--- /dev/null
+++ b/src/vulkan/gpu_buf.c
@@ -0,0 +1,470 @@
/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
 */

#include "gpu.h"

void vk_buf_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
                    VkPipelineStageFlags2 stage, VkAccessFlags2 access,
                    size_t offset, size_t size, bool export)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
    pl_assert(!export || !buf_vk->exported); // can't re-export exported buffers
    pl_rc_ref(&buf_vk->rc);

    bool needs_flush = buf_vk->needs_flush || buf->params.host_mapped ||
                       buf->params.import_handle == PL_HANDLE_HOST_PTR;
    bool noncoherent = buf_vk->mem.data && !buf_vk->mem.coherent;
    if (needs_flush && noncoherent) {
        VK(vk->FlushMappedMemoryRanges(vk->dev, 1, &(struct VkMappedMemoryRange) {
            .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
            .memory = buf_vk->mem.vkmem,
            .offset = buf_vk->mem.map_offset,
            .size = buf_vk->mem.map_size,
        }));

        // Just ignore errors, not much we can do about them other than
        // logging them and moving on...
        error: ;
    }

    struct vk_sync_scope last;
    last = vk_sem_barrier(cmd, &buf_vk->sem, stage, access, export);

    // CONCURRENT buffers require transitioning to/from IGNORED, EXCLUSIVE
    // buffers require transitioning to/from the concrete QF index
    uint32_t qf = vk->pools.num > 1 ? VK_QUEUE_FAMILY_IGNORED : cmd->pool->qf;
    uint32_t src_qf = buf_vk->exported ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
    uint32_t dst_qf = export ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;

    if (last.access || src_qf != dst_qf) {
        vk_cmd_barrier(cmd, &(VkDependencyInfo) {
            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
            .bufferMemoryBarrierCount = 1,
            .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
                .srcStageMask = last.stage,
                .srcAccessMask = last.access,
                .dstStageMask = stage,
                .dstAccessMask = access,
                .srcQueueFamilyIndex = src_qf,
                .dstQueueFamilyIndex = dst_qf,
                .buffer = buf_vk->mem.buf,
                .offset = buf_vk->mem.offset + offset,
                .size = size,
            },
        });
    }

    buf_vk->needs_flush = false;
    buf_vk->exported = export;
    vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, gpu, buf);
}

void vk_buf_deref(pl_gpu gpu, pl_buf buf)
{
    if (!buf)
        return;

    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);

    if (pl_rc_deref(&buf_vk->rc)) {
        vk->DestroyBufferView(vk->dev, buf_vk->view, PL_VK_ALLOC);
        vk_malloc_free(vk->ma, &buf_vk->mem);
        pl_free((void *) buf);
    }
}
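/*
 * [Editorial sketch, not part of the upstream diff] A hypothetical
 * "fill with zeroes" operation, showing how the helpers above are meant to
 * be combined: begin a command, declare the intended stage/access through
 * vk_buf_barrier() (which also takes a reference that vk_buf_deref() drops
 * once the command retires), record the work against the underlying memory
 * slice, then flush and finish. CMD_BEGIN/CMD_FINISH, TRANSFER and
 * vk_buf_flush() are taken from this codebase; the function name is made up,
 * and CmdFillBuffer is assumed to be present in the loaded dispatch table.
 */
#if 0
static void vk_buf_fill_example(pl_gpu gpu, pl_buf buf)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);

    struct vk_cmd *cmd = CMD_BEGIN(TRANSFER);
    if (!cmd)
        return;

    // vkCmdFillBuffer requires a size that is a multiple of 4; the allocation
    // is padded to 4 bytes in vk_buf_create(), so this stays in bounds
    size_t size = PL_ALIGN2(buf->params.size, 4);

    // Make prior accesses visible and transition ownership if needed
    vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_CLEAR_BIT,
                   VK_ACCESS_2_TRANSFER_WRITE_BIT, 0, size, false);

    // Record the actual work against the underlying buffer slice
    vk->CmdFillBuffer(cmd->buf, buf_vk->mem.buf, buf_vk->mem.offset, size, 0x0);

    // Make the result visible to the host again, if applicable
    vk_buf_flush(gpu, cmd, buf, 0, size);
    CMD_FINISH(&cmd);
}
#endif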
pl_buf vk_buf_create(pl_gpu gpu, const struct pl_buf_params *params)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;

    struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_vk);
    buf->params = *params;
    buf->params.initial_data = NULL;

    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
    pl_rc_init(&buf_vk->rc);

    struct vk_malloc_params mparams = {
        .reqs = {
            .size = PL_ALIGN2(params->size, 4), // for vk_buf_write
            .memoryTypeBits = UINT32_MAX,
            .alignment = 1,
        },
        // these are always set, because `vk_buf_copy` can always be used
        .buf_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
                     VK_BUFFER_USAGE_TRANSFER_DST_BIT,
        .export_handle = params->export_handle,
        .import_handle = params->import_handle,
        .shared_mem = params->shared_mem,
        .debug_tag = params->debug_tag,
    };

    // Mandatory/optimal buffer offset alignment
    VkDeviceSize *align = &mparams.reqs.alignment;
    VkDeviceSize extra_align = vk->props.limits.optimalBufferCopyOffsetAlignment;

    // Try and align all buffers to the minimum texel alignment, to make sure
    // tex_upload/tex_download always get aligned buffer copies if possible
    extra_align = pl_lcm(extra_align, p->min_texel_alignment);

    enum pl_buf_mem_type mem_type = params->memory_type;
    bool is_texel = false;

    if (params->uniform) {
        mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
        *align = pl_lcm(*align, vk->props.limits.minUniformBufferOffsetAlignment);
        mem_type = PL_BUF_MEM_DEVICE;
        if (params->format) {
            mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
            is_texel = true;
        }
    }

    if (params->storable) {
        mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
        *align = pl_lcm(*align, vk->props.limits.minStorageBufferOffsetAlignment);
        buf_vk->update_queue = COMPUTE;
        mem_type = PL_BUF_MEM_DEVICE;
        if (params->format) {
            mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
            is_texel = true;
        }
    }

    if (is_texel) {
        *align = pl_lcm(*align, vk->props.limits.minTexelBufferOffsetAlignment);
        *align = pl_lcm(*align, params->format->texel_size);
    }

    if (params->drawable) {
        mparams.buf_usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
                             VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
        mem_type = PL_BUF_MEM_DEVICE;
    }

    if (params->host_writable || params->initial_data) {
        // Buffers should be written using mapped memory if possible
        mparams.optimal = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
        // Use the transfer queue for updates on very large buffers (> 1 MB)
        if (params->size > 1024*1024)
            buf_vk->update_queue = TRANSFER;
    }

    if (params->host_mapped || params->host_readable) {
        mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;

        if (params->size > 1024) {
            // Prefer cached memory for large buffers (> 1 kB) which may be
            // read from, because uncached reads are extremely slow
            mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
        }
    }

    switch (mem_type) {
    case PL_BUF_MEM_AUTO:
        // We generally prefer VRAM since it's faster than RAM, but any number
        // of other requirements could potentially exclude it, so just mark it
        // as optimal by default.
        if (!(mparams.optimal & VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
            mparams.optimal |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
        break;
    case PL_BUF_MEM_DEVICE:
        // Force device local memory.
        mparams.required |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
        break;
    case PL_BUF_MEM_HOST:
        // This isn't a true guarantee, but actually trying to restrict the
        // device-local bit locks out all memory heaps on iGPUs. Requiring
        // the memory be host-mapped is the easiest compromise.
        mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
        mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
        break;
    case PL_BUF_MEM_TYPE_COUNT:
        pl_unreachable();
    }

    if (params->import_handle) {
        size_t offset = params->shared_mem.offset;
        if (PL_ALIGN(offset, *align) != offset) {
            PL_ERR(gpu, "Imported memory offset %zu violates minimum alignment "
                   "requirement of enabled usage flags (%zu)!",
                   offset, (size_t) *align);
            goto error;
        }
    } else {
        *align = pl_lcm(*align, extra_align);
    }

    if (!vk_malloc_slice(vk->ma, &buf_vk->mem, &mparams))
        goto error;

    if (params->host_mapped)
        buf->data = buf_vk->mem.data;

    if (params->export_handle) {
        buf->shared_mem = buf_vk->mem.shared_mem;
        buf->shared_mem.drm_format_mod = DRM_FORMAT_MOD_LINEAR;
        buf_vk->exported = true;
    }

    if (is_texel) {
        struct pl_fmt_vk *fmtp = PL_PRIV(params->format);
        VkBufferViewCreateInfo vinfo = {
            .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
            .buffer = buf_vk->mem.buf,
            .format = PL_DEF(fmtp->vk_fmt->bfmt, fmtp->vk_fmt->tfmt),
            .offset = buf_vk->mem.offset,
            .range = buf_vk->mem.size,
        };

        VK(vk->CreateBufferView(vk->dev, &vinfo, PL_VK_ALLOC, &buf_vk->view));
        PL_VK_NAME(BUFFER_VIEW, buf_vk->view, PL_DEF(params->debug_tag, "texel"));
    }

    if (params->initial_data)
        vk_buf_write(gpu, buf, 0, params->initial_data, params->size);

    return buf;

error:
    vk_buf_deref(gpu, buf);
    return NULL;
}
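/*
 * [Editorial sketch, not part of the upstream diff] What the flag handling
 * above looks like from the public API side: a storable, host-writable
 * buffer with initial contents. pl_buf_create() and the pl_buf_params()
 * macro come from <libplacebo/gpu.h>; `storable` maps to
 * VK_BUFFER_USAGE_STORAGE_BUFFER_BIT and device-local memory, while
 * `initial_data` triggers the initial vk_buf_write() seen at the end of
 * vk_buf_create(). The function and variable names are invented.
 */
#if 0
static pl_buf example_create_ssbo(pl_gpu gpu, const float *coeffs, size_t num)
{
    return pl_buf_create(gpu, pl_buf_params(
        .size          = num * sizeof(float),
        .storable      = true,
        .host_writable = true,
        .initial_data  = coeffs,
        .debug_tag     = "example ssbo",
    ));
}
#endif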
static void invalidate_buf(pl_gpu gpu, pl_buf buf)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);

    if (buf_vk->mem.data && !buf_vk->mem.coherent) {
        VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
            .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
            .memory = buf_vk->mem.vkmem,
            .offset = buf_vk->mem.map_offset,
            .size = buf_vk->mem.map_size,
        }));
    }

    // Ignore errors (after logging), nothing useful we can do anyway
error: ;
    vk_buf_deref(gpu, buf);
}

void vk_buf_flush(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
                  size_t offset, size_t size)
{
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);

    // We need to perform a flush if the host is capable of reading back from
    // the buffer, or if we intend to overwrite it using mapped memory
    bool can_read = buf->params.host_readable;
    bool can_write = buf_vk->mem.data && buf->params.host_writable;
    if (buf->params.host_mapped || buf->params.import_handle == PL_HANDLE_HOST_PTR)
        can_read = can_write = true;

    if (!can_read && !can_write)
        return;

    vk_cmd_barrier(cmd, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .bufferMemoryBarrierCount = 1,
        .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
            .srcStageMask = buf_vk->sem.write.stage,
            .srcAccessMask = buf_vk->sem.write.access,
            .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
            .dstAccessMask = (can_read ? VK_ACCESS_2_HOST_READ_BIT : 0)
                           | (can_write ? VK_ACCESS_2_HOST_WRITE_BIT : 0),
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .buffer = buf_vk->mem.buf,
            .offset = buf_vk->mem.offset + offset,
            .size = size,
        },
    });

    // We need to hold on to the buffer until this barrier completes
    vk_cmd_callback(cmd, (vk_cb) invalidate_buf, gpu, buf);
    pl_rc_ref(&buf_vk->rc);
}
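/*
 * [Editorial sketch, not part of the upstream diff] The consumer-side view
 * of the flush/invalidate machinery above: after the GPU has written into a
 * `host_readable` buffer, the application polls until the pending commands
 * retire and then reads the result. pl_buf_poll() and pl_buf_read() are the
 * public entry points that end up in vk_buf_poll()/vk_buf_read() below; the
 * function name here is invented for illustration.
 */
#if 0
static bool example_readback(pl_gpu gpu, pl_buf result, void *out, size_t size)
{
    // Wait (up to 100 ms per iteration) until all pending GPU work on this
    // buffer has completed; the HOST barrier recorded by vk_buf_flush() then
    // guarantees the writes are visible to the CPU
    while (pl_buf_poll(gpu, result, 100000000)) // timeout in ns
        ; // keep waiting

    return pl_buf_read(gpu, result, 0, out, size);
}
#endif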
bool vk_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);

    // Opportunistically check if we can re-use this buffer without flush
    vk_poll_commands(vk, 0);
    if (pl_rc_count(&buf_vk->rc) == 1)
        return false;

    // Otherwise, we're forced to submit any queued commands so that the
    // user is guaranteed to see progress eventually, even if they call
    // this in a tight loop
    CMD_SUBMIT(NULL);
    vk_poll_commands(vk, timeout);

    return pl_rc_count(&buf_vk->rc) > 1;
}

void vk_buf_write(pl_gpu gpu, pl_buf buf, size_t offset,
                  const void *data, size_t size)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);

    // For host-mapped buffers, we can just directly memcpy the buffer contents.
    // Otherwise, we can update the buffer from the GPU using a command buffer.
    if (buf_vk->mem.data) {
        // ensure no queued operations
        while (vk_buf_poll(gpu, buf, UINT64_MAX))
            ; // do nothing

        uintptr_t addr = (uintptr_t) buf_vk->mem.data + offset;
        memcpy((void *) addr, data, size);
        buf_vk->needs_flush = true;
    } else {
        struct vk_cmd *cmd = CMD_BEGIN(buf_vk->update_queue);
        if (!cmd) {
            PL_ERR(gpu, "Failed updating buffer!");
            return;
        }

        vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
                       VK_ACCESS_2_TRANSFER_WRITE_BIT, offset, size, false);

        // Vulkan requires `size` to be a multiple of 4, so we need to make
        // sure to handle the end separately if the original data is not
        const size_t max_transfer = 64 * 1024;
        size_t size_rem = size % 4;
        size_t size_base = size - size_rem;
        VkDeviceSize buf_offset = buf_vk->mem.offset + offset;

        if (size_base > max_transfer) {
            PL_TRACE(gpu, "Using multiple vkCmdUpdateBuffer calls to upload "
                     "large buffer. Consider using buffer-buffer transfers "
                     "instead!");
        }

        for (size_t xfer = 0; xfer < size_base; xfer += max_transfer) {
            vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf,
                                buf_offset + xfer,
                                PL_MIN(size_base - xfer, max_transfer),
                                (void *) ((uint8_t *) data + xfer));
        }

        if (size_rem) {
            uint8_t tail[4] = {0};
            memcpy(tail, (const uint8_t *) data + size_base, size_rem);
            vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, buf_offset + size_base,
                                sizeof(tail), tail);
        }

        pl_assert(!buf->params.host_readable); // no flush needed due to this
        CMD_FINISH(&cmd);
    }
}
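/*
 * [Editorial sketch, not part of the upstream diff] The split performed by
 * vk_buf_write() above, in isolation: a `size`-byte update becomes one
 * vkCmdUpdateBuffer call per 64 KiB of the 4-byte-aligned bulk, plus, if
 * `size` is not a multiple of 4, a final 4-byte write of a zero-padded tail
 * (which is why vk_buf_create() over-allocates to PL_ALIGN2(size, 4)).
 * Plain C with no Vulkan dependency; assumes <stdio.h>, and the function
 * name is invented for illustration.
 */
#if 0
static void example_print_update_chunks(size_t size)
{
    const size_t max_transfer = 64 * 1024;
    size_t size_rem  = size % 4;        // bytes covered by the padded tail
    size_t size_base = size - size_rem; // bulk portion, multiple of 4

    for (size_t xfer = 0; xfer < size_base; xfer += max_transfer) {
        size_t len = size_base - xfer;
        if (len > max_transfer)
            len = max_transfer;
        printf("update at offset %zu, length %zu\n", xfer, len);
    }

    if (size_rem)
        printf("padded tail at offset %zu, length 4 (%zu real bytes)\n",
               size_base, size_rem);
}
#endif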
bool vk_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, size_t size)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
    pl_assert(buf_vk->mem.data);

    if (vk_buf_poll(gpu, buf, 0) && buf_vk->sem.write.sync.sem) {
        // ensure no more queued writes
        VK(vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
            .semaphoreCount = 1,
            .pSemaphores = &buf_vk->sem.write.sync.sem,
            .pValues = &buf_vk->sem.write.sync.value,
        }, UINT64_MAX));

        // process callbacks
        vk_poll_commands(vk, 0);
    }

    uintptr_t addr = (uintptr_t) buf_vk->mem.data + (size_t) offset;
    memcpy(dest, (void *) addr, size);
    return true;

error:
    return false;
}

void vk_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
                 pl_buf src, size_t src_offset, size_t size)
{
    struct pl_vk *p = PL_PRIV(gpu);
    struct vk_ctx *vk = p->vk;
    struct pl_buf_vk *dst_vk = PL_PRIV(dst);
    struct pl_buf_vk *src_vk = PL_PRIV(src);

    struct vk_cmd *cmd = CMD_BEGIN(dst_vk->update_queue);
    if (!cmd) {
        PL_ERR(gpu, "Failed copying buffer!");
        return;
    }

    vk_buf_barrier(gpu, cmd, dst, VK_PIPELINE_STAGE_2_COPY_BIT,
                   VK_ACCESS_2_TRANSFER_WRITE_BIT, dst_offset, size, false);
    vk_buf_barrier(gpu, cmd, src, VK_PIPELINE_STAGE_2_COPY_BIT,
                   VK_ACCESS_2_TRANSFER_READ_BIT, src_offset, size, false);

    VkBufferCopy region = {
        .srcOffset = src_vk->mem.offset + src_offset,
        .dstOffset = dst_vk->mem.offset + dst_offset,
        .size = size,
    };

    vk->CmdCopyBuffer(cmd->buf, src_vk->mem.buf, dst_vk->mem.buf,
                      1, &region);

    vk_buf_flush(gpu, cmd, dst, dst_offset, size);
    CMD_FINISH(&cmd);
}

bool vk_buf_export(pl_gpu gpu, pl_buf buf)
{
    struct pl_buf_vk *buf_vk = PL_PRIV(buf);
    if (buf_vk->exported)
        return true;

    struct vk_cmd *cmd = CMD_BEGIN(ANY);
    if (!cmd) {
        PL_ERR(gpu, "Failed exporting buffer!");
        return false;
    }

    // For the queue family ownership transfer, we can ignore all pipeline
    // stages, since synchronization via fences/semaphores is required anyway
    vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_NONE, 0, 0,
                   buf->params.size, true);

    return CMD_SUBMIT(&cmd);
}
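/*
 * [Editorial sketch, not part of the upstream diff] What vk_buf_export()
 * enables on the API side: a buffer created with an export handle type can
 * be handed to another API or process via its file descriptor. Whether
 * PL_HANDLE_FD is actually available should be checked against the GPU's
 * export capabilities at runtime, and the pl_buf must stay alive (and
 * eventually be destroyed with pl_buf_destroy()) for as long as the handle
 * is in use; the function name is invented for illustration.
 */
#if 0
static int example_export_fd(pl_gpu gpu, size_t size)
{
    pl_buf buf = pl_buf_create(gpu, pl_buf_params(
        .size          = size,
        .export_handle = PL_HANDLE_FD,
    ));
    if (!buf || !pl_buf_export(gpu, buf))
        return -1;

    // The exported allocation is described by buf->shared_mem; note that the
    // buffer may be a slice at shared_mem.offset inside a larger allocation
    return buf->shared_mem.handle.fd;
}
#endif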