Diffstat (limited to 'src/vulkan/gpu_buf.c')
-rw-r--r--  src/vulkan/gpu_buf.c  470
1 file changed, 470 insertions, 0 deletions
diff --git a/src/vulkan/gpu_buf.c b/src/vulkan/gpu_buf.c
new file mode 100644
index 0000000..2f317bc
--- /dev/null
+++ b/src/vulkan/gpu_buf.c
@@ -0,0 +1,470 @@
+/*
+ * This file is part of libplacebo.
+ *
+ * libplacebo is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libplacebo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gpu.h"
+
+void vk_buf_barrier(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
+ VkPipelineStageFlags2 stage, VkAccessFlags2 access,
+ size_t offset, size_t size, bool export)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_assert(!export || !buf_vk->exported); // can't re-export exported buffers
+ pl_rc_ref(&buf_vk->rc);
+
+ bool needs_flush = buf_vk->needs_flush || buf->params.host_mapped ||
+ buf->params.import_handle == PL_HANDLE_HOST_PTR;
+ bool noncoherent = buf_vk->mem.data && !buf_vk->mem.coherent;
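+ // Note: host writes to non-coherent mapped memory are not guaranteed to be
+ // visible to the device until flushed with vkFlushMappedMemoryRanges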
+ if (needs_flush && noncoherent) {
+ VK(vk->FlushMappedMemoryRanges(vk->dev, 1, &(struct VkMappedMemoryRange) {
+ .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+ .memory = buf_vk->mem.vkmem,
+ .offset = buf_vk->mem.map_offset,
+ .size = buf_vk->mem.map_size,
+ }));
+
+ // Just ignore errors, not much we can do about them other than
+ // logging them and moving on...
+ error: ;
+ }
+
+ struct vk_sync_scope last;
+ last = vk_sem_barrier(cmd, &buf_vk->sem, stage, access, export);
+
+ // CONCURRENT buffers require transitioning to/from IGNORED, EXCLUSIVE
+ // buffers require transitioning to/from the concrete QF index
+ uint32_t qf = vk->pools.num > 1 ? VK_QUEUE_FAMILY_IGNORED : cmd->pool->qf;
+ uint32_t src_qf = buf_vk->exported ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
+ uint32_t dst_qf = export ? VK_QUEUE_FAMILY_EXTERNAL_KHR : qf;
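+ // (e.g. for an EXCLUSIVE buffer that was previously exported, this acquires
+ // it back from VK_QUEUE_FAMILY_EXTERNAL_KHR to the queue family of `cmd`)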
+
+ if (last.access || src_qf != dst_qf) {
+ vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .bufferMemoryBarrierCount = 1,
+ .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = last.stage,
+ .srcAccessMask = last.access,
+ .dstStageMask = stage,
+ .dstAccessMask = access,
+ .srcQueueFamilyIndex = src_qf,
+ .dstQueueFamilyIndex = dst_qf,
+ .buffer = buf_vk->mem.buf,
+ .offset = buf_vk->mem.offset + offset,
+ .size = size,
+ },
+ });
+ }
+
+ buf_vk->needs_flush = false;
+ buf_vk->exported = export;
+ vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, gpu, buf);
+}
+
+void vk_buf_deref(pl_gpu gpu, pl_buf buf)
+{
+ if (!buf)
+ return;
+
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ if (pl_rc_deref(&buf_vk->rc)) {
+ vk->DestroyBufferView(vk->dev, buf_vk->view, PL_VK_ALLOC);
+ vk_malloc_free(vk->ma, &buf_vk->mem);
+ pl_free((void *) buf);
+ }
+}
+
+pl_buf vk_buf_create(pl_gpu gpu, const struct pl_buf_params *params)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+
+ struct pl_buf_t *buf = pl_zalloc_obj(NULL, buf, struct pl_buf_vk);
+ buf->params = *params;
+ buf->params.initial_data = NULL;
+
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_rc_init(&buf_vk->rc);
+
+ struct vk_malloc_params mparams = {
+ .reqs = {
+ .size = PL_ALIGN2(params->size, 4), // for vk_buf_write
+ .memoryTypeBits = UINT32_MAX,
+ .alignment = 1,
+ },
+ // these are always set, because `vk_buf_copy` can always be used
+ .buf_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+ VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+ .export_handle = params->export_handle,
+ .import_handle = params->import_handle,
+ .shared_mem = params->shared_mem,
+ .debug_tag = params->debug_tag,
+ };
+
+ // Mandatory/optimal buffer offset alignment
+ VkDeviceSize *align = &mparams.reqs.alignment;
+ VkDeviceSize extra_align = vk->props.limits.optimalBufferCopyOffsetAlignment;
+
+ // Try and align all buffers to the minimum texel alignment, to make sure
+ // tex_upload/tex_download always gets aligned buffer copies if possible
+ extra_align = pl_lcm(extra_align, p->min_texel_alignment);
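+ // (illustrative example: with optimalBufferCopyOffsetAlignment = 4 and a
+ // minimum texel alignment of 16, extra_align becomes lcm(4, 16) = 16)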
+
+ enum pl_buf_mem_type mem_type = params->memory_type;
+ bool is_texel = false;
+
+ if (params->uniform) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+ *align = pl_lcm(*align, vk->props.limits.minUniformBufferOffsetAlignment);
+ mem_type = PL_BUF_MEM_DEVICE;
+ if (params->format) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
+ is_texel = true;
+ }
+ }
+
+ if (params->storable) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+ *align = pl_lcm(*align, vk->props.limits.minStorageBufferOffsetAlignment);
+ buf_vk->update_queue = COMPUTE;
+ mem_type = PL_BUF_MEM_DEVICE;
+ if (params->format) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT;
+ is_texel = true;
+ }
+ }
+
+ if (is_texel) {
+ *align = pl_lcm(*align, vk->props.limits.minTexelBufferOffsetAlignment);
+ *align = pl_lcm(*align, params->format->texel_size);
+ }
+
+ if (params->drawable) {
+ mparams.buf_usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
+ VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
+ mem_type = PL_BUF_MEM_DEVICE;
+ }
+
+ if (params->host_writable || params->initial_data) {
+ // Buffers should be written using mapped memory if possible
+ mparams.optimal = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+ // Use the transfer queue for updates on very large buffers (over 1 MB)
+ if (params->size > 1024*1024)
+ buf_vk->update_queue = TRANSFER;
+ }
+
+ if (params->host_mapped || params->host_readable) {
+ mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+
+ if (params->size > 1024) {
+ // Prefer cached memory for larger buffers (over 1 kB) which may be read
+ // from, because uncached reads are extremely slow
+ mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+ }
+ }
+
+ switch (mem_type) {
+ case PL_BUF_MEM_AUTO:
+ // We generally prefer VRAM since it's faster than RAM, but any number
+ // of other requirements could potentially exclude it, so just mark it
+ // as optimal by default.
+ if (!(mparams.optimal & VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
+ mparams.optimal |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+ break;
+ case PL_BUF_MEM_DEVICE:
+ // Force device local memory.
+ mparams.required |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+ break;
+ case PL_BUF_MEM_HOST:
+ // This isn't a true guarantee, but actually trying to restrict the
+ // device-local bit locks out all memory heaps on iGPUs. Requiring
+ // the memory be host-mapped is the easiest compromise.
+ mparams.required |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+ mparams.optimal |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+ break;
+ case PL_BUF_MEM_TYPE_COUNT:
+ pl_unreachable();
+ }
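+ // (illustrative example: a host_readable download buffer larger than 1 kB
+ // with PL_BUF_MEM_AUTO ends up requiring HOST_VISIBLE and preferring
+ // HOST_CACHED instead of DEVICE_LOCAL)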
+
+ if (params->import_handle) {
+ size_t offset = params->shared_mem.offset;
+ if (PL_ALIGN(offset, *align) != offset) {
+ PL_ERR(gpu, "Imported memory offset %zu violates minimum alignment "
+ "requirement of enabled usage flags (%zu)!",
+ offset, (size_t) *align);
+ goto error;
+ }
+ } else {
+ *align = pl_lcm(*align, extra_align);
+ }
+
+ if (!vk_malloc_slice(vk->ma, &buf_vk->mem, &mparams))
+ goto error;
+
+ if (params->host_mapped)
+ buf->data = buf_vk->mem.data;
+
+ if (params->export_handle) {
+ buf->shared_mem = buf_vk->mem.shared_mem;
+ buf->shared_mem.drm_format_mod = DRM_FORMAT_MOD_LINEAR;
+ buf_vk->exported = true;
+ }
+
+ if (is_texel) {
+ struct pl_fmt_vk *fmtp = PL_PRIV(params->format);
+ VkBufferViewCreateInfo vinfo = {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
+ .buffer = buf_vk->mem.buf,
+ .format = PL_DEF(fmtp->vk_fmt->bfmt, fmtp->vk_fmt->tfmt),
+ .offset = buf_vk->mem.offset,
+ .range = buf_vk->mem.size,
+ };
+
+ VK(vk->CreateBufferView(vk->dev, &vinfo, PL_VK_ALLOC, &buf_vk->view));
+ PL_VK_NAME(BUFFER_VIEW, buf_vk->view, PL_DEF(params->debug_tag, "texel"));
+ }
+
+ if (params->initial_data)
+ vk_buf_write(gpu, buf, 0, params->initial_data, params->size);
+
+ return buf;
+
+error:
+ vk_buf_deref(gpu, buf);
+ return NULL;
+}
+
+static void invalidate_buf(pl_gpu gpu, pl_buf buf)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ if (buf_vk->mem.data && !buf_vk->mem.coherent) {
+ VK(vk->InvalidateMappedMemoryRanges(vk->dev, 1, &(VkMappedMemoryRange) {
+ .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
+ .memory = buf_vk->mem.vkmem,
+ .offset = buf_vk->mem.map_offset,
+ .size = buf_vk->mem.map_size,
+ }));
+ }
+
+ // Ignore errors (after logging), nothing useful we can do anyway
+error: ;
+ vk_buf_deref(gpu, buf);
+}
+
+void vk_buf_flush(pl_gpu gpu, struct vk_cmd *cmd, pl_buf buf,
+ size_t offset, size_t size)
+{
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // We need to perform a flush if the host is capable of reading back from
+ // the buffer, or if we intend to overwrite it using mapped memory
+ bool can_read = buf->params.host_readable;
+ bool can_write = buf_vk->mem.data && buf->params.host_writable;
+ if (buf->params.host_mapped || buf->params.import_handle == PL_HANDLE_HOST_PTR)
+ can_read = can_write = true;
+
+ if (!can_read && !can_write)
+ return;
+
+ vk_cmd_barrier(cmd, &(VkDependencyInfo) {
+ .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+ .bufferMemoryBarrierCount = 1,
+ .pBufferMemoryBarriers = &(VkBufferMemoryBarrier2) {
+ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+ .srcStageMask = buf_vk->sem.write.stage,
+ .srcAccessMask = buf_vk->sem.write.access,
+ .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
+ .dstAccessMask = (can_read ? VK_ACCESS_2_HOST_READ_BIT : 0)
+ | (can_write ? VK_ACCESS_2_HOST_WRITE_BIT : 0),
+ .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+ .buffer = buf_vk->mem.buf,
+ .offset = buf_vk->mem.offset + offset,
+ .size = size,
+ },
+ });
+
+ // We need to hold on to the buffer until this barrier completes
+ vk_cmd_callback(cmd, (vk_cb) invalidate_buf, gpu, buf);
+ pl_rc_ref(&buf_vk->rc);
+}
+
+bool vk_buf_poll(pl_gpu gpu, pl_buf buf, uint64_t timeout)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // Opportunistically check if we can re-use this buffer without flush
+ vk_poll_commands(vk, 0);
+ if (pl_rc_count(&buf_vk->rc) == 1)
+ return false;
+
+ // Otherwise, we're forced to submit any queued commands so that the
+ // user is guaranteed to see progress eventually, even if they call
+ // this in a tight loop
+ CMD_SUBMIT(NULL);
+ vk_poll_commands(vk, timeout);
+
+ return pl_rc_count(&buf_vk->rc) > 1;
+}
+
+void vk_buf_write(pl_gpu gpu, pl_buf buf, size_t offset,
+ const void *data, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+
+ // For host-mapped buffers, we can just directly memcpy the buffer contents.
+ // Otherwise, we can update the buffer from the GPU using a command buffer.
+ if (buf_vk->mem.data) {
+ // wait until no queued command still references this buffer, since
+ // overwriting memory the GPU may still be accessing would be a race
+ while (vk_buf_poll(gpu, buf, UINT64_MAX))
+ ; // do nothing
+
+ uintptr_t addr = (uintptr_t) buf_vk->mem.data + offset;
+ memcpy((void *) addr, data, size);
+ buf_vk->needs_flush = true;
+ } else {
+ struct vk_cmd *cmd = CMD_BEGIN(buf_vk->update_queue);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed updating buffer!");
+ return;
+ }
+
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, offset, size, false);
+
+ // Vulkan requires `size` to be a multiple of 4, so we need to make
+ // sure to handle the end separately if the original data is not
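+ // (vkCmdUpdateBuffer also caps dataSize at 64 KiB per call, hence max_transfer)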
+ const size_t max_transfer = 64 * 1024;
+ size_t size_rem = size % 4;
+ size_t size_base = size - size_rem;
+ VkDeviceSize buf_offset = buf_vk->mem.offset + offset;
+
+ if (size_base > max_transfer) {
+ PL_TRACE(gpu, "Using multiple vkCmdUpdateBuffer calls to upload "
+ "large buffer. Consider using buffer-buffer transfers "
+ "instead!");
+ }
+
+ for (size_t xfer = 0; xfer < size_base; xfer += max_transfer) {
+ vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf,
+ buf_offset + xfer,
+ PL_MIN(size_base - xfer, max_transfer),
+ (void *) ((uint8_t *) data + xfer));
+ }
+
+ if (size_rem) {
+ uint8_t tail[4] = {0};
+ memcpy(tail, (uint8_t *) data + size_base, size_rem);
+ vk->CmdUpdateBuffer(cmd->buf, buf_vk->mem.buf, buf_offset + size_base,
+ sizeof(tail), tail);
+ }
+
+ pl_assert(!buf->params.host_readable); // no flush needed due to this
+ CMD_FINISH(&cmd);
+ }
+}
+
+bool vk_buf_read(pl_gpu gpu, pl_buf buf, size_t offset, void *dest, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ pl_assert(buf_vk->mem.data);
+
+ if (vk_buf_poll(gpu, buf, 0) && buf_vk->sem.write.sync.sem) {
+ // ensure no more queued writes
+ VK(vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) {
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+ .semaphoreCount = 1,
+ .pSemaphores = &buf_vk->sem.write.sync.sem,
+ .pValues = &buf_vk->sem.write.sync.value,
+ }, UINT64_MAX));
+
+ // process callbacks
+ vk_poll_commands(vk, 0);
+ }
+
+ uintptr_t addr = (uintptr_t) buf_vk->mem.data + (size_t) offset;
+ memcpy(dest, (void *) addr, size);
+ return true;
+
+error:
+ return false;
+}
+
+void vk_buf_copy(pl_gpu gpu, pl_buf dst, size_t dst_offset,
+ pl_buf src, size_t src_offset, size_t size)
+{
+ struct pl_vk *p = PL_PRIV(gpu);
+ struct vk_ctx *vk = p->vk;
+ struct pl_buf_vk *dst_vk = PL_PRIV(dst);
+ struct pl_buf_vk *src_vk = PL_PRIV(src);
+
+ struct vk_cmd *cmd = CMD_BEGIN(dst_vk->update_queue);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed copying buffer!");
+ return;
+ }
+
+ vk_buf_barrier(gpu, cmd, dst, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_WRITE_BIT, dst_offset, size, false);
+ vk_buf_barrier(gpu, cmd, src, VK_PIPELINE_STAGE_2_COPY_BIT,
+ VK_ACCESS_2_TRANSFER_READ_BIT, src_offset, size, false);
+
+ VkBufferCopy region = {
+ .srcOffset = src_vk->mem.offset + src_offset,
+ .dstOffset = dst_vk->mem.offset + dst_offset,
+ .size = size,
+ };
+
+ vk->CmdCopyBuffer(cmd->buf, src_vk->mem.buf, dst_vk->mem.buf,
+ 1, &region);
+
+ vk_buf_flush(gpu, cmd, dst, dst_offset, size);
+ CMD_FINISH(&cmd);
+}
+
+bool vk_buf_export(pl_gpu gpu, pl_buf buf)
+{
+ struct pl_buf_vk *buf_vk = PL_PRIV(buf);
+ if (buf_vk->exported)
+ return true;
+
+ struct vk_cmd *cmd = CMD_BEGIN(ANY);
+ if (!cmd) {
+ PL_ERR(gpu, "Failed exporting buffer!");
+ return false;
+ }
+
+ // For the queue family ownership transfer, we can ignore all pipeline
+ // stages since synchronization via fences/semaphores is required anyway
+ vk_buf_barrier(gpu, cmd, buf, VK_PIPELINE_STAGE_2_NONE, 0, 0,
+ buf->params.size, true);
+
+ return CMD_SUBMIT(&cmd);
+}