/*
 * This file is part of libplacebo.
 *
 * libplacebo is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * libplacebo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with libplacebo. If not, see <http://www.gnu.org/licenses/>.
 */

#include "command.h"
#include "utils.h"

// returns VK_SUCCESS (completed), VK_TIMEOUT (not yet completed) or an error
static VkResult vk_cmd_poll(struct vk_cmd *cmd, uint64_t timeout)
{
    struct vk_ctx *vk = cmd->pool->vk;
    return vk->WaitSemaphores(vk->dev, &(VkSemaphoreWaitInfo) {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
        .semaphoreCount = 1,
        .pSemaphores = &cmd->sync.sem,
        .pValues = &cmd->sync.value,
    }, timeout);
}

static void flush_callbacks(struct vk_ctx *vk)
{
    while (vk->num_pending_callbacks) {
        const struct vk_callback *cb = vk->pending_callbacks++;
        vk->num_pending_callbacks--;
        cb->run(cb->priv, cb->arg);
    }
}

static void vk_cmd_reset(struct vk_cmd *cmd)
{
    struct vk_ctx *vk = cmd->pool->vk;

    // Flush possible callbacks left over from a previous command still in
    // the process of being reset, whose callback triggered this command
    // being reset.
    flush_callbacks(vk);
    vk->pending_callbacks = cmd->callbacks.elem;
    vk->num_pending_callbacks = cmd->callbacks.num;
    flush_callbacks(vk);

    cmd->callbacks.num = 0;
    cmd->deps.num = 0;
    cmd->sigs.num = 0;
}

static void vk_cmd_destroy(struct vk_cmd *cmd)
{
    if (!cmd)
        return;

    struct vk_ctx *vk = cmd->pool->vk;
    vk_cmd_poll(cmd, UINT64_MAX);
    vk_cmd_reset(cmd);
    vk->DestroySemaphore(vk->dev, cmd->sync.sem, PL_VK_ALLOC);
    vk->FreeCommandBuffers(vk->dev, cmd->pool->pool, 1, &cmd->buf);

    pl_free(cmd);
}

static struct vk_cmd *vk_cmd_create(struct vk_cmdpool *pool)
{
    struct vk_ctx *vk = pool->vk;
    struct vk_cmd *cmd = pl_zalloc_ptr(NULL, cmd);
    cmd->pool = pool;

    VkCommandBufferAllocateInfo ainfo = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
        .commandPool = pool->pool,
        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
        .commandBufferCount = 1,
    };

    VK(vk->AllocateCommandBuffers(vk->dev, &ainfo, &cmd->buf));

    static const VkSemaphoreTypeCreateInfo stinfo = {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
        .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
        .initialValue = 0,
    };

    static const VkSemaphoreCreateInfo sinfo = {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
        .pNext = &stinfo,
    };

    VK(vk->CreateSemaphore(vk->dev, &sinfo, PL_VK_ALLOC, &cmd->sync.sem));
    PL_VK_NAME(SEMAPHORE, cmd->sync.sem, "cmd");

    return cmd;

error:
    vk_cmd_destroy(cmd);
    vk->failed = true;
    return NULL;
}

void vk_dev_callback(struct vk_ctx *vk, vk_cb callback,
                     const void *priv, const void *arg)
{
    pl_mutex_lock(&vk->lock);
    if (vk->cmds_pending.num > 0) {
        struct vk_cmd *last_cmd = vk->cmds_pending.elem[vk->cmds_pending.num - 1];
        vk_cmd_callback(last_cmd, callback, priv, arg);
    } else {
        // The device was already idle, so we can just immediately call it
        callback((void *) priv, (void *) arg);
    }
    pl_mutex_unlock(&vk->lock);
}

void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback,
                     const void *priv, const void *arg)
{
    PL_ARRAY_APPEND(cmd, cmd->callbacks, (struct vk_callback) {
        .run = callback,
        .priv = (void *) priv,
        .arg = (void *) arg,
    });
}
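// Illustrative sketch (not part of the original file): the callback
// mechanism above is typically used to defer cleanup work until the GPU
// has finished executing a command. Assuming a temporary allocation `tmp`
// that must outlive the command's execution, a caller might write:
//
//     static void free_tmp(void *priv, void *arg)
//     {
//         pl_free(priv); // `arg` unused in this example
//     }
//
//     vk_cmd_callback(cmd, free_tmp, tmp, NULL);
//
// The callback fires from vk_cmd_reset(), i.e. only after vk_cmd_poll()
// has observed the command's timeline semaphore reaching its target value.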
void vk_cmd_dep(struct vk_cmd *cmd, VkPipelineStageFlags2 stage,
                pl_vulkan_sem dep)
{
    PL_ARRAY_APPEND(cmd, cmd->deps, (VkSemaphoreSubmitInfo) {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
        .semaphore = dep.sem,
        .value = dep.value,
        .stageMask = stage,
    });
}

void vk_cmd_sig(struct vk_cmd *cmd, VkPipelineStageFlags2 stage,
                pl_vulkan_sem sig)
{
    VkSemaphoreSubmitInfo sinfo = {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
        .semaphore = sig.sem,
        .value = sig.value,
        .stageMask = stage,
    };

    // Try updating existing semaphore signal operations in-place
    for (int i = 0; i < cmd->sigs.num; i++) {
        if (cmd->sigs.elem[i].semaphore == sig.sem) {
            pl_assert(sig.value > cmd->sigs.elem[i].value);
            cmd->sigs.elem[i] = sinfo;
            return;
        }
    }

    PL_ARRAY_APPEND(cmd, cmd->sigs, sinfo);
}

#define SET(FLAG, CHECK)  \
    if (flags2 & (CHECK)) \
        flags |= FLAG

static VkAccessFlags lower_access2(VkAccessFlags2 flags2)
{
    VkAccessFlags flags = flags2 & VK_ACCESS_FLAG_BITS_MAX_ENUM;
    SET(VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
                                   VK_ACCESS_2_SHADER_STORAGE_READ_BIT);
    SET(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT);
    return flags;
}

static VkPipelineStageFlags lower_stage2(VkPipelineStageFlags2 flags2)
{
    VkPipelineStageFlags flags = flags2 & VK_PIPELINE_STAGE_FLAG_BITS_MAX_ENUM;
    SET(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_2_COPY_BIT |
                                        VK_PIPELINE_STAGE_2_RESOLVE_BIT |
                                        VK_PIPELINE_STAGE_2_BLIT_BIT |
                                        VK_PIPELINE_STAGE_2_CLEAR_BIT);
    SET(VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
                                            VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT);
    return flags;
}

#undef SET

void vk_cmd_barrier(struct vk_cmd *cmd, const VkDependencyInfo *info)
{
    struct vk_ctx *vk = cmd->pool->vk;
    if (vk->CmdPipelineBarrier2KHR) {
        vk->CmdPipelineBarrier2KHR(cmd->buf, info);
        return;
    }

    pl_assert(!info->pNext);
    pl_assert(info->memoryBarrierCount == 0);
    pl_assert(info->bufferMemoryBarrierCount + info->imageMemoryBarrierCount == 1);

    if (info->bufferMemoryBarrierCount) {
        const VkBufferMemoryBarrier2 *barr2 = info->pBufferMemoryBarriers;
        const VkBufferMemoryBarrier barr = {
            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
            .pNext = barr2->pNext,
            .srcAccessMask = lower_access2(barr2->srcAccessMask),
            .dstAccessMask = lower_access2(barr2->dstAccessMask),
            .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex,
            .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex,
            .buffer = barr2->buffer,
            .offset = barr2->offset,
            .size = barr2->size,
        };

        vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask),
                               lower_stage2(barr2->dstStageMask),
                               info->dependencyFlags,
                               0, NULL, 1, &barr, 0, NULL);
    } else {
        const VkImageMemoryBarrier2 *barr2 = info->pImageMemoryBarriers;
        const VkImageMemoryBarrier barr = {
            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
            .pNext = barr2->pNext,
            .srcAccessMask = lower_access2(barr2->srcAccessMask),
            .dstAccessMask = lower_access2(barr2->dstAccessMask),
            .oldLayout = barr2->oldLayout,
            .newLayout = barr2->newLayout,
            .srcQueueFamilyIndex = barr2->srcQueueFamilyIndex,
            .dstQueueFamilyIndex = barr2->dstQueueFamilyIndex,
            .image = barr2->image,
            .subresourceRange = barr2->subresourceRange,
        };

        vk->CmdPipelineBarrier(cmd->buf, lower_stage2(barr2->srcStageMask),
                               lower_stage2(barr2->dstStageMask),
                               info->dependencyFlags,
                               0, NULL, 0, NULL, 1, &barr);
    }
}
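// Illustrative sketch: routing a layout transition through vk_cmd_barrier().
// The structures are standard VK_KHR_synchronization2 types; the specific
// stage/access masks and `image` are example values only (transitioning an
// image from a transfer write to fragment shader sampling):
//
//     const VkImageMemoryBarrier2 barr = {
//         .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
//         .srcStageMask = VK_PIPELINE_STAGE_2_COPY_BIT,
//         .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
//         .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
//         .dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
//         .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
//         .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
//         .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
//         .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
//         .image = image,
//         .subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 },
//     };
//
//     vk_cmd_barrier(cmd, &(VkDependencyInfo) {
//         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
//         .imageMemoryBarrierCount = 1,
//         .pImageMemoryBarriers = &barr,
//     });
//
// Note the legacy fallback above handles exactly one buffer *or* image
// barrier per call, which this usage satisfies.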
struct vk_sync_scope vk_sem_barrier(struct vk_cmd *cmd, struct vk_sem *sem,
                                    VkPipelineStageFlags2 stage,
                                    VkAccessFlags2 access, bool is_trans)
{
    bool is_write = (access & vk_access_write) || is_trans;

    // Writes need to be synchronized against the last *read* (which is
    // transitively synchronized against the last write), reads only
    // need to be synchronized against the last write.
    struct vk_sync_scope last = sem->write;
    if (is_write && sem->read.access)
        last = sem->read;

    if (last.queue != cmd->queue) {
        if (!is_write && sem->read.queue == cmd->queue) {
            // No semaphore needed in this case because the implicit submission
            // order execution dependencies already transitively imply a wait
            // for the previous write
        } else if (last.sync.sem) {
            // Image barrier still needs to depend on this stage for implicit
            // ordering guarantees to apply properly
            vk_cmd_dep(cmd, stage, last.sync);
            last.stage = stage;
        }

        // Last access is on different queue, so no pipeline barrier needed
        last.access = 0;
    }

    if (!is_write && sem->read.queue == cmd->queue &&
        (sem->read.stage & stage) == stage &&
        (sem->read.access & access) == access)
    {
        // A past pipeline barrier already covers this access transitively, so
        // we don't need to emit another pipeline barrier at all
        last.access = 0;
    }

    if (is_write) {
        sem->write = (struct vk_sync_scope) {
            .sync = cmd->sync,
            .queue = cmd->queue,
            .stage = stage,
            .access = access,
        };

        sem->read = (struct vk_sync_scope) {
            .sync = cmd->sync,
            .queue = cmd->queue,
            // no stage or access scope, because no reads happened yet
        };
    } else if (sem->read.queue == cmd->queue) {
        // Coalesce multiple same-queue reads into a single access scope
        sem->read.sync = cmd->sync;
        sem->read.stage |= stage;
        sem->read.access |= access;
    } else {
        sem->read = (struct vk_sync_scope) {
            .sync = cmd->sync,
            .queue = cmd->queue,
            .stage = stage,
            .access = access,
        };
    }

    // We never need to include pipeline barriers for reads, only writes
    last.access &= vk_access_write;
    return last;
}

struct vk_cmdpool *vk_cmdpool_create(struct vk_ctx *vk, int qf, int qnum,
                                     VkQueueFamilyProperties props)
{
    struct vk_cmdpool *pool = pl_alloc_ptr(NULL, pool);
    *pool = (struct vk_cmdpool) {
        .vk = vk,
        .props = props,
        .qf = qf,
        .queues = pl_calloc(pool, qnum, sizeof(VkQueue)),
        .num_queues = qnum,
    };

    for (int n = 0; n < qnum; n++)
        vk->GetDeviceQueue(vk->dev, qf, n, &pool->queues[n]);

    VkCommandPoolCreateInfo cinfo = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
        .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
                 VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
        .queueFamilyIndex = qf,
    };

    VK(vk->CreateCommandPool(vk->dev, &cinfo, PL_VK_ALLOC, &pool->pool));
    return pool;

error:
    vk_cmdpool_destroy(pool);
    vk->failed = true;
    return NULL;
}

void vk_cmdpool_destroy(struct vk_cmdpool *pool)
{
    if (!pool)
        return;

    for (int i = 0; i < pool->cmds.num; i++)
        vk_cmd_destroy(pool->cmds.elem[i]);

    struct vk_ctx *vk = pool->vk;
    vk->DestroyCommandPool(vk->dev, pool->pool, PL_VK_ALLOC);
    pl_free(pool);
}
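// Illustrative sketch: creating and destroying a command pool. The queue
// family index `qf` and its VkQueueFamilyProperties `props` are assumed to
// come from the caller's device setup, which happens outside this file:
//
//     struct vk_cmdpool *pool = vk_cmdpool_create(vk, qf, props.queueCount, props);
//     if (!pool)
//         return false;
//     // ... vk_cmd_begin() / vk_cmd_submit() as needed ...
//     vk_cmdpool_destroy(pool); // also destroys all recycled vk_cmds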
struct vk_cmd *vk_cmd_begin(struct vk_cmdpool *pool, pl_debug_tag debug_tag)
{
    struct vk_ctx *vk = pool->vk;

    // Garbage collect the cmdpool first, to increase the chances of getting
    // an already-available command buffer.
    vk_poll_commands(vk, 0);

    struct vk_cmd *cmd = NULL;
    pl_mutex_lock(&vk->lock);
    if (!PL_ARRAY_POP(pool->cmds, &cmd)) {
        cmd = vk_cmd_create(pool);
        if (!cmd) {
            pl_mutex_unlock(&vk->lock);
            goto error;
        }
    }

    cmd->qindex = pool->idx_queues;
    cmd->queue = pool->queues[cmd->qindex];
    pl_mutex_unlock(&vk->lock);

    VkCommandBufferBeginInfo binfo = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
        .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
    };

    VK(vk->BeginCommandBuffer(cmd->buf, &binfo));

    debug_tag = PL_DEF(debug_tag, "vk_cmd");
    PL_VK_NAME_HANDLE(COMMAND_BUFFER, cmd->buf, debug_tag);
    PL_VK_NAME(SEMAPHORE, cmd->sync.sem, debug_tag);

    cmd->sync.value++;
    vk_cmd_sig(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, cmd->sync);
    return cmd;

error:
    // Something has to be seriously messed up if we get to this point
    vk_cmd_destroy(cmd);
    vk->failed = true;
    return NULL;
}

static VkResult vk_queue_submit2(struct vk_ctx *vk, VkQueue queue,
                                 const VkSubmitInfo2 *info2, VkFence fence)
{
    if (vk->QueueSubmit2KHR)
        return vk->QueueSubmit2KHR(queue, 1, info2, fence);

    const uint32_t num_deps = info2->waitSemaphoreInfoCount;
    const uint32_t num_sigs = info2->signalSemaphoreInfoCount;
    const uint32_t num_cmds = info2->commandBufferInfoCount;

    void *tmp = pl_tmp(NULL);
    VkSemaphore *deps = pl_calloc_ptr(tmp, num_deps, deps);
    VkPipelineStageFlags *masks = pl_calloc_ptr(tmp, num_deps, masks);
    uint64_t *depvals = pl_calloc_ptr(tmp, num_deps, depvals);
    VkSemaphore *sigs = pl_calloc_ptr(tmp, num_sigs, sigs);
    uint64_t *sigvals = pl_calloc_ptr(tmp, num_sigs, sigvals);
    VkCommandBuffer *cmds = pl_calloc_ptr(tmp, num_cmds, cmds);

    for (int i = 0; i < num_deps; i++) {
        deps[i] = info2->pWaitSemaphoreInfos[i].semaphore;
        masks[i] = info2->pWaitSemaphoreInfos[i].stageMask;
        depvals[i] = info2->pWaitSemaphoreInfos[i].value;
    }

    for (int i = 0; i < num_sigs; i++) {
        sigs[i] = info2->pSignalSemaphoreInfos[i].semaphore;
        sigvals[i] = info2->pSignalSemaphoreInfos[i].value;
    }

    for (int i = 0; i < num_cmds; i++)
        cmds[i] = info2->pCommandBufferInfos[i].commandBuffer;

    const VkTimelineSemaphoreSubmitInfo tinfo = {
        .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
        .pNext = info2->pNext,
        .waitSemaphoreValueCount = num_deps,
        .pWaitSemaphoreValues = depvals,
        .signalSemaphoreValueCount = num_sigs,
        .pSignalSemaphoreValues = sigvals,
    };

    const VkSubmitInfo info = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
        .pNext = &tinfo,
        .waitSemaphoreCount = num_deps,
        .pWaitSemaphores = deps,
        .pWaitDstStageMask = masks,
        .commandBufferCount = num_cmds,
        .pCommandBuffers = cmds,
        .signalSemaphoreCount = num_sigs,
        .pSignalSemaphores = sigs,
    };

    VkResult res = vk->QueueSubmit(queue, 1, &info, fence);
    pl_free(tmp);
    return res;
}
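// Note on the fallback path above (illustrative summary, not new logic):
// each VkSemaphoreSubmitInfo is split into its legacy components, i.e.
//
//     { .semaphore, .value, .stageMask }
//       -> pWaitSemaphores[i] + pWaitDstStageMask[i] (or pSignalSemaphores[i]),
//          with the timeline values carried via VkTimelineSemaphoreSubmitInfo
//
// The wait stage masks are copied directly rather than through
// lower_stage2(), which implicitly assumes that semaphore waits only ever
// use stage bits that also exist in the 32-bit VkPipelineStageFlags type.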
bool vk_cmd_submit(struct vk_cmd **pcmd)
{
    struct vk_cmd *cmd = *pcmd;
    if (!cmd)
        return true;

    *pcmd = NULL;
    struct vk_cmdpool *pool = cmd->pool;
    struct vk_ctx *vk = pool->vk;

    VK(vk->EndCommandBuffer(cmd->buf));

    VkSubmitInfo2 sinfo = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
        .waitSemaphoreInfoCount = cmd->deps.num,
        .pWaitSemaphoreInfos = cmd->deps.elem,
        .signalSemaphoreInfoCount = cmd->sigs.num,
        .pSignalSemaphoreInfos = cmd->sigs.elem,
        .commandBufferInfoCount = 1,
        .pCommandBufferInfos = &(VkCommandBufferSubmitInfo) {
            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
            .commandBuffer = cmd->buf,
        },
    };

    if (pl_msg_test(vk->log, PL_LOG_TRACE)) {
        PL_TRACE(vk, "Submitting command %p on queue %p (QF %d):",
                 (void *) cmd->buf, (void *) cmd->queue, pool->qf);
        for (int n = 0; n < cmd->deps.num; n++) {
            PL_TRACE(vk, "    waits on semaphore 0x%"PRIx64" = %"PRIu64,
                     (uint64_t) cmd->deps.elem[n].semaphore,
                     cmd->deps.elem[n].value);
        }
        for (int n = 0; n < cmd->sigs.num; n++) {
            PL_TRACE(vk, "    signals semaphore 0x%"PRIx64" = %"PRIu64,
                     (uint64_t) cmd->sigs.elem[n].semaphore,
                     cmd->sigs.elem[n].value);
        }
        if (cmd->callbacks.num)
            PL_TRACE(vk, "    signals %d callbacks", cmd->callbacks.num);
    }

    vk->lock_queue(vk->queue_ctx, pool->qf, cmd->qindex);
    VkResult res = vk_queue_submit2(vk, cmd->queue, &sinfo, VK_NULL_HANDLE);
    vk->unlock_queue(vk->queue_ctx, pool->qf, cmd->qindex);
    PL_VK_ASSERT(res, "vkQueueSubmit2");

    pl_mutex_lock(&vk->lock);
    PL_ARRAY_APPEND(vk->alloc, vk->cmds_pending, cmd);
    pl_mutex_unlock(&vk->lock);
    return true;

error:
    vk_cmd_reset(cmd);
    pl_mutex_lock(&vk->lock);
    PL_ARRAY_APPEND(pool, pool->cmds, cmd);
    pl_mutex_unlock(&vk->lock);
    vk->failed = true;
    return false;
}

bool vk_poll_commands(struct vk_ctx *vk, uint64_t timeout)
{
    bool ret = false;
    pl_mutex_lock(&vk->lock);

    while (vk->cmds_pending.num) {
        struct vk_cmd *cmd = vk->cmds_pending.elem[0];
        struct vk_cmdpool *pool = cmd->pool;
        pl_mutex_unlock(&vk->lock); // don't hold mutex while blocking
        if (vk_cmd_poll(cmd, timeout) == VK_TIMEOUT)
            return ret;
        pl_mutex_lock(&vk->lock);
        if (!vk->cmds_pending.num || vk->cmds_pending.elem[0] != cmd)
            continue; // another thread modified this state while blocking

        PL_TRACE(vk, "VkSemaphore signalled: 0x%"PRIx64" = %"PRIu64,
                 (uint64_t) cmd->sync.sem, cmd->sync.value);
        PL_ARRAY_REMOVE_AT(vk->cmds_pending, 0); // remove before callbacks
        vk_cmd_reset(cmd);
        PL_ARRAY_APPEND(pool, pool->cmds, cmd);
        ret = true;

        // If we've successfully spent some time waiting for at least one
        // command, disable the timeout. This both prevents over-waiting due
        // to the timeout being applied repeatedly, and ensures we don't block
        // on future commands if we've already spent time waiting for one.
        timeout = 0;
    }

    pl_mutex_unlock(&vk->lock);
    return ret;
}

void vk_rotate_queues(struct vk_ctx *vk)
{
    pl_mutex_lock(&vk->lock);

    // Rotate the queues to ensure good parallelism across frames
    for (int i = 0; i < vk->pools.num; i++) {
        struct vk_cmdpool *pool = vk->pools.elem[i];
        pool->idx_queues = (pool->idx_queues + 1) % pool->num_queues;
        PL_TRACE(vk, "QF %d: %d/%d", pool->qf, pool->idx_queues, pool->num_queues);
    }

    pl_mutex_unlock(&vk->lock);
}

void vk_wait_idle(struct vk_ctx *vk)
{
    while (vk_poll_commands(vk, UINT64_MAX)) ;
}
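// Illustrative sketch of the overall command lifecycle implemented by this
// file, from a hypothetical caller's perspective (`ext_sem`, `cleanup_cb`,
// `priv` and `arg` are placeholders):
//
//     struct vk_cmd *cmd = vk_cmd_begin(pool, "example");
//     if (!cmd)
//         return false;
//     vk_cmd_dep(cmd, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, ext_sem);
//     // ... record Vulkan commands into cmd->buf ...
//     vk_cmd_callback(cmd, cleanup_cb, priv, arg); // runs after completion
//     if (!vk_cmd_submit(&cmd))
//         return false;
//     vk_poll_commands(vk, 0);  // non-blocking: recycle finished commands
//     vk_wait_idle(vk);         // blocking: drain all pending commands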