Diffstat
26 files changed, 28339 insertions(+), 0 deletions(-)
diff --git a/drivers/misc/habanalabs/common/Makefile b/drivers/misc/habanalabs/common/Makefile new file mode 100644 index 000000000..e6abffea9 --- /dev/null +++ b/drivers/misc/habanalabs/common/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only + +include $(src)/common/mmu/Makefile +habanalabs-y += $(HL_COMMON_MMU_FILES) + +include $(src)/common/pci/Makefile +habanalabs-y += $(HL_COMMON_PCI_FILES) + +HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \ + common/asid.o common/habanalabs_ioctl.o \ + common/command_buffer.o common/hw_queue.o common/irq.o \ + common/sysfs.o common/hwmon.o common/memory.o \ + common/command_submission.o common/firmware_if.o \ + common/security.o common/state_dump.o \ + common/memory_mgr.o common/decoder.o diff --git a/drivers/misc/habanalabs/common/asid.c b/drivers/misc/habanalabs/common/asid.c new file mode 100644 index 000000000..c9c2619cc --- /dev/null +++ b/drivers/misc/habanalabs/common/asid.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +#include <linux/slab.h> + +int hl_asid_init(struct hl_device *hdev) +{ + hdev->asid_bitmap = bitmap_zalloc(hdev->asic_prop.max_asid, GFP_KERNEL); + if (!hdev->asid_bitmap) + return -ENOMEM; + + mutex_init(&hdev->asid_mutex); + + /* ASID 0 is reserved for the kernel driver and device CPU */ + set_bit(0, hdev->asid_bitmap); + + return 0; +} + +void hl_asid_fini(struct hl_device *hdev) +{ + mutex_destroy(&hdev->asid_mutex); + bitmap_free(hdev->asid_bitmap); +} + +unsigned long hl_asid_alloc(struct hl_device *hdev) +{ + unsigned long found; + + mutex_lock(&hdev->asid_mutex); + + found = find_first_zero_bit(hdev->asid_bitmap, + hdev->asic_prop.max_asid); + if (found == hdev->asic_prop.max_asid) + found = 0; + else + set_bit(found, hdev->asid_bitmap); + + mutex_unlock(&hdev->asid_mutex); + + return found; +} + +void hl_asid_free(struct hl_device *hdev, unsigned long asid) +{ + if (asid == HL_KERNEL_ASID_ID || asid >= hdev->asic_prop.max_asid) { + dev_crit(hdev->dev, "Invalid ASID %lu", asid); + return; + } + + clear_bit(asid, hdev->asid_bitmap); +} diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c new file mode 100644 index 000000000..2b332991a --- /dev/null +++ b/drivers/misc/habanalabs/common/command_buffer.c @@ -0,0 +1,536 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. 
+ */ + +#include <uapi/misc/habanalabs.h> +#include "habanalabs.h" + +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +#define CB_VA_POOL_SIZE (4UL * SZ_1G) + +static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 page_size = prop->pmmu.page_size; + int rc; + + if (!hdev->supports_cb_mapping) { + dev_err_ratelimited(hdev->dev, + "Mapping a CB to the device's MMU is not supported\n"); + return -EINVAL; + } + + if (!hdev->mmu_enable) { + dev_err_ratelimited(hdev->dev, + "Cannot map CB because MMU is disabled\n"); + return -EINVAL; + } + + if (cb->is_mmu_mapped) + return 0; + + cb->roundup_size = roundup(cb->size, page_size); + + cb->virtual_addr = (u64) gen_pool_alloc(ctx->cb_va_pool, cb->roundup_size); + if (!cb->virtual_addr) { + dev_err(hdev->dev, "Failed to allocate device virtual address for CB\n"); + return -ENOMEM; + } + + mutex_lock(&hdev->mmu_lock); + rc = hl_mmu_map_contiguous(ctx, cb->virtual_addr, cb->bus_address, cb->roundup_size); + if (rc) { + dev_err(hdev->dev, "Failed to map VA %#llx to CB\n", cb->virtual_addr); + goto err_va_umap; + } + rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR | MMU_OP_SKIP_LOW_CACHE_INV); + mutex_unlock(&hdev->mmu_lock); + + cb->is_mmu_mapped = true; + return rc; + +err_va_umap: + mutex_unlock(&hdev->mmu_lock); + gen_pool_free(ctx->cb_va_pool, cb->virtual_addr, cb->roundup_size); + return rc; +} + +static void cb_unmap_mem(struct hl_ctx *ctx, struct hl_cb *cb) +{ + struct hl_device *hdev = ctx->hdev; + + mutex_lock(&hdev->mmu_lock); + hl_mmu_unmap_contiguous(ctx, cb->virtual_addr, cb->roundup_size); + hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); + mutex_unlock(&hdev->mmu_lock); + + gen_pool_free(ctx->cb_va_pool, cb->virtual_addr, cb->roundup_size); +} + +static void cb_fini(struct hl_device *hdev, struct hl_cb *cb) +{ + if (cb->is_internal) + gen_pool_free(hdev->internal_cb_pool, + (uintptr_t)cb->kernel_address, cb->size); + else + hl_asic_dma_free_coherent(hdev, cb->size, cb->kernel_address, cb->bus_address); + + kfree(cb); +} + +static void cb_do_release(struct hl_device *hdev, struct hl_cb *cb) +{ + if (cb->is_pool) { + spin_lock(&hdev->cb_pool_lock); + list_add(&cb->pool_list, &hdev->cb_pool); + spin_unlock(&hdev->cb_pool_lock); + } else { + cb_fini(hdev, cb); + } +} + +static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size, + int ctx_id, bool internal_cb) +{ + struct hl_cb *cb = NULL; + u32 cb_offset; + void *p; + + /* + * We use of GFP_ATOMIC here because this function can be called from + * the latency-sensitive code path for command submission. Due to H/W + * limitations in some of the ASICs, the kernel must copy the user CB + * that is designated for an external queue and actually enqueue + * the kernel's copy. Hence, we must never sleep in this code section + * and must use GFP_ATOMIC for all memory allocations. 
+ */ + if (ctx_id == HL_KERNEL_ASID_ID && !hdev->disabled) + cb = kzalloc(sizeof(*cb), GFP_ATOMIC); + + if (!cb) + cb = kzalloc(sizeof(*cb), GFP_KERNEL); + + if (!cb) + return NULL; + + if (internal_cb) { + p = (void *) gen_pool_alloc(hdev->internal_cb_pool, cb_size); + if (!p) { + kfree(cb); + return NULL; + } + + cb_offset = p - hdev->internal_cb_pool_virt_addr; + cb->is_internal = true; + cb->bus_address = hdev->internal_cb_va_base + cb_offset; + } else if (ctx_id == HL_KERNEL_ASID_ID) { + p = hl_asic_dma_alloc_coherent(hdev, cb_size, &cb->bus_address, GFP_ATOMIC); + if (!p) + p = hl_asic_dma_alloc_coherent(hdev, cb_size, &cb->bus_address, GFP_KERNEL); + } else { + p = hl_asic_dma_alloc_coherent(hdev, cb_size, &cb->bus_address, + GFP_USER | __GFP_ZERO); + } + + if (!p) { + dev_err(hdev->dev, + "failed to allocate %d of dma memory for CB\n", + cb_size); + kfree(cb); + return NULL; + } + + cb->kernel_address = p; + cb->size = cb_size; + + return cb; +} + +struct hl_cb_mmap_mem_alloc_args { + struct hl_device *hdev; + struct hl_ctx *ctx; + u32 cb_size; + bool internal_cb; + bool map_cb; +}; + +static void hl_cb_mmap_mem_release(struct hl_mmap_mem_buf *buf) +{ + struct hl_cb *cb = buf->private; + + hl_debugfs_remove_cb(cb); + + if (cb->is_mmu_mapped) + cb_unmap_mem(cb->ctx, cb); + + hl_ctx_put(cb->ctx); + + cb_do_release(cb->hdev, cb); +} + +static int hl_cb_mmap_mem_alloc(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args) +{ + struct hl_cb_mmap_mem_alloc_args *cb_args = args; + struct hl_cb *cb; + int rc, ctx_id = cb_args->ctx->asid; + bool alloc_new_cb = true; + + if (!cb_args->internal_cb) { + /* Minimum allocation must be PAGE SIZE */ + if (cb_args->cb_size < PAGE_SIZE) + cb_args->cb_size = PAGE_SIZE; + + if (ctx_id == HL_KERNEL_ASID_ID && + cb_args->cb_size <= cb_args->hdev->asic_prop.cb_pool_cb_size) { + + spin_lock(&cb_args->hdev->cb_pool_lock); + if (!list_empty(&cb_args->hdev->cb_pool)) { + cb = list_first_entry(&cb_args->hdev->cb_pool, + typeof(*cb), pool_list); + list_del(&cb->pool_list); + spin_unlock(&cb_args->hdev->cb_pool_lock); + alloc_new_cb = false; + } else { + spin_unlock(&cb_args->hdev->cb_pool_lock); + dev_dbg(cb_args->hdev->dev, "CB pool is empty\n"); + } + } + } + + if (alloc_new_cb) { + cb = hl_cb_alloc(cb_args->hdev, cb_args->cb_size, ctx_id, cb_args->internal_cb); + if (!cb) + return -ENOMEM; + } + + cb->hdev = cb_args->hdev; + cb->ctx = cb_args->ctx; + cb->buf = buf; + cb->buf->mappable_size = cb->size; + cb->buf->private = cb; + + hl_ctx_get(cb->ctx); + + if (cb_args->map_cb) { + if (ctx_id == HL_KERNEL_ASID_ID) { + dev_err(cb_args->hdev->dev, + "CB mapping is not supported for kernel context\n"); + rc = -EINVAL; + goto release_cb; + } + + rc = cb_map_mem(cb_args->ctx, cb); + if (rc) + goto release_cb; + } + + hl_debugfs_add_cb(cb); + + return 0; + +release_cb: + hl_ctx_put(cb->ctx); + cb_do_release(cb_args->hdev, cb); + + return rc; +} + +static int hl_cb_mmap(struct hl_mmap_mem_buf *buf, + struct vm_area_struct *vma, void *args) +{ + struct hl_cb *cb = buf->private; + + return cb->hdev->asic_funcs->mmap(cb->hdev, vma, cb->kernel_address, + cb->bus_address, cb->size); +} + +static struct hl_mmap_mem_buf_behavior cb_behavior = { + .topic = "CB", + .mem_id = HL_MMAP_TYPE_CB, + .alloc = hl_cb_mmap_mem_alloc, + .release = hl_cb_mmap_mem_release, + .mmap = hl_cb_mmap, +}; + +int hl_cb_create(struct hl_device *hdev, struct hl_mem_mgr *mmg, + struct hl_ctx *ctx, u32 cb_size, bool internal_cb, + bool map_cb, u64 *handle) +{ + struct hl_cb_mmap_mem_alloc_args args = { 
+ .hdev = hdev, + .ctx = ctx, + .cb_size = cb_size, + .internal_cb = internal_cb, + .map_cb = map_cb, + }; + struct hl_mmap_mem_buf *buf; + int ctx_id = ctx->asid; + + if ((hdev->disabled) || (hdev->reset_info.in_reset && (ctx_id != HL_KERNEL_ASID_ID))) { + dev_warn_ratelimited(hdev->dev, + "Device is disabled or in reset. Can't create new CBs\n"); + return -EBUSY; + } + + if (cb_size > SZ_2M) { + dev_err(hdev->dev, "CB size %d must be less than %d\n", + cb_size, SZ_2M); + return -EINVAL; + } + + buf = hl_mmap_mem_buf_alloc( + mmg, &cb_behavior, + ctx_id == HL_KERNEL_ASID_ID ? GFP_ATOMIC : GFP_KERNEL, &args); + if (!buf) + return -ENOMEM; + + *handle = buf->handle; + + return 0; +} + +int hl_cb_destroy(struct hl_mem_mgr *mmg, u64 cb_handle) +{ + int rc; + + rc = hl_mmap_mem_buf_put_handle(mmg, cb_handle); + if (rc < 0) + return rc; /* Invalid handle */ + + if (rc == 0) + dev_dbg(mmg->dev, "CB 0x%llx is destroyed while still in use\n", cb_handle); + + return 0; +} + +static int hl_cb_info(struct hl_mem_mgr *mmg, + u64 handle, u32 flags, u32 *usage_cnt, u64 *device_va) +{ + struct hl_cb *cb; + int rc = 0; + + cb = hl_cb_get(mmg, handle); + if (!cb) { + dev_err(mmg->dev, + "CB info failed, no match to handle 0x%llx\n", handle); + return -EINVAL; + } + + if (flags & HL_CB_FLAGS_GET_DEVICE_VA) { + if (cb->is_mmu_mapped) { + *device_va = cb->virtual_addr; + } else { + dev_err(mmg->dev, "CB is not mapped to the device's MMU\n"); + rc = -EINVAL; + goto out; + } + } else { + *usage_cnt = atomic_read(&cb->cs_cnt); + } + +out: + hl_cb_put(cb); + return rc; +} + +int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data) +{ + union hl_cb_args *args = data; + struct hl_device *hdev = hpriv->hdev; + u64 handle = 0, device_va = 0; + enum hl_device_status status; + u32 usage_cnt = 0; + int rc; + + if (!hl_device_operational(hdev, &status)) { + dev_warn_ratelimited(hdev->dev, + "Device is %s. 
Can't execute CB IOCTL\n", + hdev->status[status]); + return -EBUSY; + } + + switch (args->in.op) { + case HL_CB_OP_CREATE: + if (args->in.cb_size > HL_MAX_CB_SIZE) { + dev_err(hdev->dev, + "User requested CB size %d must be less than %d\n", + args->in.cb_size, HL_MAX_CB_SIZE); + rc = -EINVAL; + } else { + rc = hl_cb_create(hdev, &hpriv->mem_mgr, hpriv->ctx, + args->in.cb_size, false, + !!(args->in.flags & HL_CB_FLAGS_MAP), + &handle); + } + + memset(args, 0, sizeof(*args)); + args->out.cb_handle = handle; + break; + + case HL_CB_OP_DESTROY: + rc = hl_cb_destroy(&hpriv->mem_mgr, + args->in.cb_handle); + break; + + case HL_CB_OP_INFO: + rc = hl_cb_info(&hpriv->mem_mgr, args->in.cb_handle, + args->in.flags, + &usage_cnt, + &device_va); + if (rc) + break; + + memset(&args->out, 0, sizeof(args->out)); + + if (args->in.flags & HL_CB_FLAGS_GET_DEVICE_VA) + args->out.device_va = device_va; + else + args->out.usage_cnt = usage_cnt; + break; + + default: + rc = -EINVAL; + break; + } + + return rc; +} + +struct hl_cb *hl_cb_get(struct hl_mem_mgr *mmg, u64 handle) +{ + struct hl_mmap_mem_buf *buf; + + buf = hl_mmap_mem_buf_get(mmg, handle); + if (!buf) + return NULL; + return buf->private; + +} + +void hl_cb_put(struct hl_cb *cb) +{ + hl_mmap_mem_buf_put(cb->buf); +} + +struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size, + bool internal_cb) +{ + u64 cb_handle; + struct hl_cb *cb; + int rc; + + rc = hl_cb_create(hdev, &hdev->kernel_mem_mgr, hdev->kernel_ctx, cb_size, + internal_cb, false, &cb_handle); + if (rc) { + dev_err(hdev->dev, + "Failed to allocate CB for the kernel driver %d\n", rc); + return NULL; + } + + cb = hl_cb_get(&hdev->kernel_mem_mgr, cb_handle); + /* hl_cb_get should never fail here */ + if (!cb) { + dev_crit(hdev->dev, "Kernel CB handle invalid 0x%x\n", + (u32) cb_handle); + goto destroy_cb; + } + + return cb; + +destroy_cb: + hl_cb_destroy(&hdev->kernel_mem_mgr, cb_handle); + + return NULL; +} + +int hl_cb_pool_init(struct hl_device *hdev) +{ + struct hl_cb *cb; + int i; + + INIT_LIST_HEAD(&hdev->cb_pool); + spin_lock_init(&hdev->cb_pool_lock); + + for (i = 0 ; i < hdev->asic_prop.cb_pool_cb_cnt ; i++) { + cb = hl_cb_alloc(hdev, hdev->asic_prop.cb_pool_cb_size, + HL_KERNEL_ASID_ID, false); + if (cb) { + cb->is_pool = true; + list_add(&cb->pool_list, &hdev->cb_pool); + } else { + hl_cb_pool_fini(hdev); + return -ENOMEM; + } + } + + return 0; +} + +int hl_cb_pool_fini(struct hl_device *hdev) +{ + struct hl_cb *cb, *tmp; + + list_for_each_entry_safe(cb, tmp, &hdev->cb_pool, pool_list) { + list_del(&cb->pool_list); + cb_fini(hdev, cb); + } + + return 0; +} + +int hl_cb_va_pool_init(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + if (!hdev->supports_cb_mapping) + return 0; + + ctx->cb_va_pool = gen_pool_create(__ffs(prop->pmmu.page_size), -1); + if (!ctx->cb_va_pool) { + dev_err(hdev->dev, + "Failed to create VA gen pool for CB mapping\n"); + return -ENOMEM; + } + + ctx->cb_va_pool_base = hl_reserve_va_block(hdev, ctx, HL_VA_RANGE_TYPE_HOST, + CB_VA_POOL_SIZE, HL_MMU_VA_ALIGNMENT_NOT_NEEDED); + if (!ctx->cb_va_pool_base) { + rc = -ENOMEM; + goto err_pool_destroy; + } + rc = gen_pool_add(ctx->cb_va_pool, ctx->cb_va_pool_base, CB_VA_POOL_SIZE, -1); + if (rc) { + dev_err(hdev->dev, + "Failed to add memory to VA gen pool for CB mapping\n"); + goto err_unreserve_va_block; + } + + return 0; + +err_unreserve_va_block: + hl_unreserve_va_block(hdev, ctx, ctx->cb_va_pool_base, CB_VA_POOL_SIZE); 
+err_pool_destroy: + gen_pool_destroy(ctx->cb_va_pool); + + return rc; +} + +void hl_cb_va_pool_fini(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + + if (!hdev->supports_cb_mapping) + return; + + gen_pool_destroy(ctx->cb_va_pool); + hl_unreserve_va_block(hdev, ctx, ctx->cb_va_pool_base, CB_VA_POOL_SIZE); +} diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c new file mode 100644 index 000000000..1071bf492 --- /dev/null +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -0,0 +1,3493 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2021 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include <uapi/misc/habanalabs.h> +#include "habanalabs.h" + +#include <linux/uaccess.h> +#include <linux/slab.h> + +#define HL_CS_FLAGS_TYPE_MASK (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \ + HL_CS_FLAGS_COLLECTIVE_WAIT | HL_CS_FLAGS_RESERVE_SIGNALS_ONLY | \ + HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY | HL_CS_FLAGS_ENGINE_CORE_COMMAND) + + +#define MAX_TS_ITER_NUM 10 + +/** + * enum hl_cs_wait_status - cs wait status + * @CS_WAIT_STATUS_BUSY: cs was not completed yet + * @CS_WAIT_STATUS_COMPLETED: cs completed + * @CS_WAIT_STATUS_GONE: cs completed but fence is already gone + */ +enum hl_cs_wait_status { + CS_WAIT_STATUS_BUSY, + CS_WAIT_STATUS_COMPLETED, + CS_WAIT_STATUS_GONE +}; + +static void job_wq_completion(struct work_struct *work); +static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq, + enum hl_cs_wait_status *status, s64 *timestamp); +static void cs_do_release(struct kref *ref); + +static void hl_push_cs_outcome(struct hl_device *hdev, + struct hl_cs_outcome_store *outcome_store, + u64 seq, ktime_t ts, int error) +{ + struct hl_cs_outcome *node; + unsigned long flags; + + /* + * CS outcome store supports the following operations: + * push outcome - store a recent CS outcome in the store + * pop outcome - retrieve a SPECIFIC (by seq) CS outcome from the store + * It uses 2 lists: used list and free list. + * It has a pre-allocated amount of nodes, each node stores + * a single CS outcome. + * Initially, all the nodes are in the free list. + * On push outcome, a node (any) is taken from the free list, its + * information is filled in, and the node is moved to the used list. + * It is possible, that there are no nodes left in the free list. + * In this case, we will lose some information about old outcomes. We + * will pop the OLDEST node from the used list, and make it free. + * On pop, the node is searched for in the used list (using a search + * index). + * If found, the node is then removed from the used list, and moved + * back to the free list. The outcome data that the node contained is + * returned back to the user. 
+ */ + + spin_lock_irqsave(&outcome_store->db_lock, flags); + + if (list_empty(&outcome_store->free_list)) { + node = list_last_entry(&outcome_store->used_list, + struct hl_cs_outcome, list_link); + hash_del(&node->map_link); + dev_dbg(hdev->dev, "CS %llu outcome was lost\n", node->seq); + } else { + node = list_last_entry(&outcome_store->free_list, + struct hl_cs_outcome, list_link); + } + + list_del_init(&node->list_link); + + node->seq = seq; + node->ts = ts; + node->error = error; + + list_add(&node->list_link, &outcome_store->used_list); + hash_add(outcome_store->outcome_map, &node->map_link, node->seq); + + spin_unlock_irqrestore(&outcome_store->db_lock, flags); +} + +static bool hl_pop_cs_outcome(struct hl_cs_outcome_store *outcome_store, + u64 seq, ktime_t *ts, int *error) +{ + struct hl_cs_outcome *node; + unsigned long flags; + + spin_lock_irqsave(&outcome_store->db_lock, flags); + + hash_for_each_possible(outcome_store->outcome_map, node, map_link, seq) + if (node->seq == seq) { + *ts = node->ts; + *error = node->error; + + hash_del(&node->map_link); + list_del_init(&node->list_link); + list_add(&node->list_link, &outcome_store->free_list); + + spin_unlock_irqrestore(&outcome_store->db_lock, flags); + + return true; + } + + spin_unlock_irqrestore(&outcome_store->db_lock, flags); + + return false; +} + +static void hl_sob_reset(struct kref *ref) +{ + struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob, + kref); + struct hl_device *hdev = hw_sob->hdev; + + dev_dbg(hdev->dev, "reset sob id %u\n", hw_sob->sob_id); + + hdev->asic_funcs->reset_sob(hdev, hw_sob); + + hw_sob->need_reset = false; +} + +void hl_sob_reset_error(struct kref *ref) +{ + struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob, + kref); + struct hl_device *hdev = hw_sob->hdev; + + dev_crit(hdev->dev, + "SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n", + hw_sob->q_idx, hw_sob->sob_id); +} + +void hw_sob_put(struct hl_hw_sob *hw_sob) +{ + if (hw_sob) + kref_put(&hw_sob->kref, hl_sob_reset); +} + +static void hw_sob_put_err(struct hl_hw_sob *hw_sob) +{ + if (hw_sob) + kref_put(&hw_sob->kref, hl_sob_reset_error); +} + +void hw_sob_get(struct hl_hw_sob *hw_sob) +{ + if (hw_sob) + kref_get(&hw_sob->kref); +} + +/** + * hl_gen_sob_mask() - Generates a sob mask to be used in a monitor arm packet + * @sob_base: sob base id + * @sob_mask: sob user mask, each bit represents a sob offset from sob base + * @mask: generated mask + * + * Return: 0 if given parameters are valid + */ +int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask) +{ + int i; + + if (sob_mask == 0) + return -EINVAL; + + if (sob_mask == 0x1) { + *mask = ~(1 << (sob_base & 0x7)); + } else { + /* find msb in order to verify sob range is valid */ + for (i = BITS_PER_BYTE - 1 ; i >= 0 ; i--) + if (BIT(i) & sob_mask) + break; + + if (i > (HL_MAX_SOBS_PER_MONITOR - (sob_base & 0x7) - 1)) + return -EINVAL; + + *mask = ~sob_mask; + } + + return 0; +} + +static void hl_fence_release(struct kref *kref) +{ + struct hl_fence *fence = + container_of(kref, struct hl_fence, refcount); + struct hl_cs_compl *hl_cs_cmpl = + container_of(fence, struct hl_cs_compl, base_fence); + + kfree(hl_cs_cmpl); +} + +void hl_fence_put(struct hl_fence *fence) +{ + if (IS_ERR_OR_NULL(fence)) + return; + kref_put(&fence->refcount, hl_fence_release); +} + +void hl_fences_put(struct hl_fence **fence, int len) +{ + int i; + + for (i = 0; i < len; i++, fence++) + hl_fence_put(*fence); +} + +void hl_fence_get(struct hl_fence *fence) +{ + if (fence) + 
kref_get(&fence->refcount); +} + +static void hl_fence_init(struct hl_fence *fence, u64 sequence) +{ + kref_init(&fence->refcount); + fence->cs_sequence = sequence; + fence->error = 0; + fence->timestamp = ktime_set(0, 0); + fence->mcs_handling_done = false; + init_completion(&fence->completion); +} + +void cs_get(struct hl_cs *cs) +{ + kref_get(&cs->refcount); +} + +static int cs_get_unless_zero(struct hl_cs *cs) +{ + return kref_get_unless_zero(&cs->refcount); +} + +static void cs_put(struct hl_cs *cs) +{ + kref_put(&cs->refcount, cs_do_release); +} + +static void cs_job_do_release(struct kref *ref) +{ + struct hl_cs_job *job = container_of(ref, struct hl_cs_job, refcount); + + kfree(job); +} + +static void hl_cs_job_put(struct hl_cs_job *job) +{ + kref_put(&job->refcount, cs_job_do_release); +} + +bool cs_needs_completion(struct hl_cs *cs) +{ + /* In case this is a staged CS, only the last CS in sequence should + * get a completion, any non staged CS will always get a completion + */ + if (cs->staged_cs && !cs->staged_last) + return false; + + return true; +} + +bool cs_needs_timeout(struct hl_cs *cs) +{ + /* In case this is a staged CS, only the first CS in sequence should + * get a timeout, any non staged CS will always get a timeout + */ + if (cs->staged_cs && !cs->staged_first) + return false; + + return true; +} + +static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job) +{ + /* + * Patched CB is created for external queues jobs, and for H/W queues + * jobs if the user CB was allocated by driver and MMU is disabled. + */ + return (job->queue_type == QUEUE_TYPE_EXT || + (job->queue_type == QUEUE_TYPE_HW && + job->is_kernel_allocated_cb && + !hdev->mmu_enable)); +} + +/* + * cs_parser - parse the user command submission + * + * @hpriv : pointer to the private data of the fd + * @job : pointer to the job that holds the command submission info + * + * The function parses the command submission of the user. 
It calls the + * ASIC specific parser, which returns a list of memory blocks to send + * to the device as different command buffers + * + */ +static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_cs_parser parser; + int rc; + + parser.ctx_id = job->cs->ctx->asid; + parser.cs_sequence = job->cs->sequence; + parser.job_id = job->id; + + parser.hw_queue_id = job->hw_queue_id; + parser.job_userptr_list = &job->userptr_list; + parser.patched_cb = NULL; + parser.user_cb = job->user_cb; + parser.user_cb_size = job->user_cb_size; + parser.queue_type = job->queue_type; + parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb; + job->patched_cb = NULL; + parser.completion = cs_needs_completion(job->cs); + + rc = hdev->asic_funcs->cs_parser(hdev, &parser); + + if (is_cb_patched(hdev, job)) { + if (!rc) { + job->patched_cb = parser.patched_cb; + job->job_cb_size = parser.patched_cb_size; + job->contains_dma_pkt = parser.contains_dma_pkt; + atomic_inc(&job->patched_cb->cs_cnt); + } + + /* + * Whether the parsing worked or not, we don't need the + * original CB anymore because it was already parsed and + * won't be accessed again for this CS + */ + atomic_dec(&job->user_cb->cs_cnt); + hl_cb_put(job->user_cb); + job->user_cb = NULL; + } else if (!rc) { + job->job_cb_size = job->user_cb_size; + } + + return rc; +} + +static void hl_complete_job(struct hl_device *hdev, struct hl_cs_job *job) +{ + struct hl_cs *cs = job->cs; + + if (is_cb_patched(hdev, job)) { + hl_userptr_delete_list(hdev, &job->userptr_list); + + /* + * We might arrive here from rollback and patched CB wasn't + * created, so we need to check it's not NULL + */ + if (job->patched_cb) { + atomic_dec(&job->patched_cb->cs_cnt); + hl_cb_put(job->patched_cb); + } + } + + /* For H/W queue jobs, if a user CB was allocated by driver and MMU is + * enabled, the user CB isn't released in cs_parser() and thus should be + * released here. This is also true for INT queues jobs which were + * allocated by driver. + */ + if ((job->is_kernel_allocated_cb && + ((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) || + job->queue_type == QUEUE_TYPE_INT))) { + atomic_dec(&job->user_cb->cs_cnt); + hl_cb_put(job->user_cb); + } + + /* + * This is the only place where there can be multiple threads + * modifying the list at the same time + */ + spin_lock(&cs->job_lock); + list_del(&job->cs_node); + spin_unlock(&cs->job_lock); + + hl_debugfs_remove_job(hdev, job); + + /* We decrement reference only for a CS that gets completion + * because the reference was incremented only for this kind of CS + * right before it was scheduled. + * + * In staged submission, only the last CS marked as 'staged_last' + * gets completion, hence its release function will be called from here. + * As for all the rest CS's in the staged submission which do not get + * completion, their CS reference will be decremented by the + * 'staged_last' CS during the CS release flow. + * All relevant PQ CI counters will be incremented during the CS release + * flow by calling 'hl_hw_queue_update_ci'. 
+ */ + if (cs_needs_completion(cs) && + (job->queue_type == QUEUE_TYPE_EXT || job->queue_type == QUEUE_TYPE_HW)) + cs_put(cs); + + hl_cs_job_put(job); +} + +/* + * hl_staged_cs_find_first - locate the first CS in this staged submission + * + * @hdev: pointer to device structure + * @cs_seq: staged submission sequence number + * + * @note: This function must be called under 'hdev->cs_mirror_lock' + * + * Find and return a CS pointer with the given sequence + */ +struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq) +{ + struct hl_cs *cs; + + list_for_each_entry_reverse(cs, &hdev->cs_mirror_list, mirror_node) + if (cs->staged_cs && cs->staged_first && + cs->sequence == cs_seq) + return cs; + + return NULL; +} + +/* + * is_staged_cs_last_exists - returns true if the last CS in sequence exists + * + * @hdev: pointer to device structure + * @cs: staged submission member + * + */ +bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs) +{ + struct hl_cs *last_entry; + + last_entry = list_last_entry(&cs->staged_cs_node, struct hl_cs, + staged_cs_node); + + if (last_entry->staged_last) + return true; + + return false; +} + +/* + * staged_cs_get - get CS reference if this CS is a part of a staged CS + * + * @hdev: pointer to device structure + * @cs: current CS + * @cs_seq: staged submission sequence number + * + * Increment CS reference for every CS in this staged submission except for + * the CS which get completion. + */ +static void staged_cs_get(struct hl_device *hdev, struct hl_cs *cs) +{ + /* Only the last CS in this staged submission will get a completion. + * We must increment the reference for all other CS's in this + * staged submission. + * Once we get a completion we will release the whole staged submission. + */ + if (!cs->staged_last) + cs_get(cs); +} + +/* + * staged_cs_put - put a CS in case it is part of staged submission + * + * @hdev: pointer to device structure + * @cs: CS to put + * + * This function decrements a CS reference (for a non completion CS) + */ +static void staged_cs_put(struct hl_device *hdev, struct hl_cs *cs) +{ + /* We release all CS's in a staged submission except the last + * CS which we have never incremented its reference. + */ + if (!cs_needs_completion(cs)) + cs_put(cs); +} + +static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs) +{ + struct hl_cs *next = NULL, *iter, *first_cs; + + if (!cs_needs_timeout(cs)) + return; + + spin_lock(&hdev->cs_mirror_lock); + + /* We need to handle tdr only once for the complete staged submission. + * Hence, we choose the CS that reaches this function first which is + * the CS marked as 'staged_last'. + * In case single staged cs was submitted which has both first and last + * indications, then "cs_find_first" below will return NULL, since we + * removed the cs node from the list before getting here, + * in such cases just continue with the cs to cancel it's TDR work. 
+ */ + if (cs->staged_cs && cs->staged_last) { + first_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence); + if (first_cs) + cs = first_cs; + } + + spin_unlock(&hdev->cs_mirror_lock); + + /* Don't cancel TDR in case this CS was timedout because we might be + * running from the TDR context + */ + if (cs->timedout || hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT) + return; + + if (cs->tdr_active) + cancel_delayed_work_sync(&cs->work_tdr); + + spin_lock(&hdev->cs_mirror_lock); + + /* queue TDR for next CS */ + list_for_each_entry(iter, &hdev->cs_mirror_list, mirror_node) + if (cs_needs_timeout(iter)) { + next = iter; + break; + } + + if (next && !next->tdr_active) { + next->tdr_active = true; + schedule_delayed_work(&next->work_tdr, next->timeout_jiffies); + } + + spin_unlock(&hdev->cs_mirror_lock); +} + +/* + * force_complete_multi_cs - complete all contexts that wait on multi-CS + * + * @hdev: pointer to habanalabs device structure + */ +static void force_complete_multi_cs(struct hl_device *hdev) +{ + int i; + + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { + struct multi_cs_completion *mcs_compl; + + mcs_compl = &hdev->multi_cs_completion[i]; + + spin_lock(&mcs_compl->lock); + + if (!mcs_compl->used) { + spin_unlock(&mcs_compl->lock); + continue; + } + + /* when calling force complete no context should be waiting on + * multi-cS. + * We are calling the function as a protection for such case + * to free any pending context and print error message + */ + dev_err(hdev->dev, + "multi-CS completion context %d still waiting when calling force completion\n", + i); + complete_all(&mcs_compl->completion); + spin_unlock(&mcs_compl->lock); + } +} + +/* + * complete_multi_cs - complete all waiting entities on multi-CS + * + * @hdev: pointer to habanalabs device structure + * @cs: CS structure + * The function signals a waiting entity that has an overlapping stream masters + * with the completed CS. + * For example: + * - a completed CS worked on stream master QID 4, multi CS completion + * is actively waiting on stream master QIDs 3, 5. don't send signal as no + * common stream master QID + * - a completed CS worked on stream master QID 4, multi CS completion + * is actively waiting on stream master QIDs 3, 4. send signal as stream + * master QID 4 is common + */ +static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs) +{ + struct hl_fence *fence = cs->fence; + int i; + + /* in case of multi CS check for completion only for the first CS */ + if (cs->staged_cs && !cs->staged_first) + return; + + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { + struct multi_cs_completion *mcs_compl; + + mcs_compl = &hdev->multi_cs_completion[i]; + if (!mcs_compl->used) + continue; + + spin_lock(&mcs_compl->lock); + + /* + * complete if: + * 1. still waiting for completion + * 2. the completed CS has at least one overlapping stream + * master with the stream masters in the completion + */ + if (mcs_compl->used && + (fence->stream_master_qid_map & + mcs_compl->stream_master_qid_map)) { + /* extract the timestamp only of first completed CS */ + if (!mcs_compl->timestamp) + mcs_compl->timestamp = ktime_to_ns(fence->timestamp); + + complete_all(&mcs_compl->completion); + + /* + * Setting mcs_handling_done inside the lock ensures + * at least one fence have mcs_handling_done set to + * true before wait for mcs finish. This ensures at + * least one CS will be set as completed when polling + * mcs fences. 
+ */ + fence->mcs_handling_done = true; + } + + spin_unlock(&mcs_compl->lock); + } + /* In case CS completed without mcs completion initialized */ + fence->mcs_handling_done = true; +} + +static inline void cs_release_sob_reset_handler(struct hl_device *hdev, + struct hl_cs *cs, + struct hl_cs_compl *hl_cs_cmpl) +{ + /* Skip this handler if the cs wasn't submitted, to avoid putting + * the hw_sob twice, since this case already handled at this point, + * also skip if the hw_sob pointer wasn't set. + */ + if (!hl_cs_cmpl->hw_sob || !cs->submitted) + return; + + spin_lock(&hl_cs_cmpl->lock); + + /* + * we get refcount upon reservation of signals or signal/wait cs for the + * hw_sob object, and need to put it when the first staged cs + * (which cotains the encaps signals) or cs signal/wait is completed. + */ + if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) || + (hl_cs_cmpl->type == CS_TYPE_WAIT) || + (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) || + (!!hl_cs_cmpl->encaps_signals)) { + dev_dbg(hdev->dev, + "CS 0x%llx type %d finished, sob_id: %d, sob_val: %u\n", + hl_cs_cmpl->cs_seq, + hl_cs_cmpl->type, + hl_cs_cmpl->hw_sob->sob_id, + hl_cs_cmpl->sob_val); + + hw_sob_put(hl_cs_cmpl->hw_sob); + + if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) + hdev->asic_funcs->reset_sob_group(hdev, + hl_cs_cmpl->sob_group); + } + + spin_unlock(&hl_cs_cmpl->lock); +} + +static void cs_do_release(struct kref *ref) +{ + struct hl_cs *cs = container_of(ref, struct hl_cs, refcount); + struct hl_device *hdev = cs->ctx->hdev; + struct hl_cs_job *job, *tmp; + struct hl_cs_compl *hl_cs_cmpl = + container_of(cs->fence, struct hl_cs_compl, base_fence); + + cs->completed = true; + + /* + * Although if we reached here it means that all external jobs have + * finished, because each one of them took refcnt to CS, we still + * need to go over the internal jobs and complete them. Otherwise, we + * will have leaked memory and what's worse, the CS object (and + * potentially the CTX object) could be released, while the JOB + * still holds a pointer to them (but no reference). + */ + list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node) + hl_complete_job(hdev, job); + + if (!cs->submitted) { + /* + * In case the wait for signal CS was submitted, the fence put + * occurs in init_signal_wait_cs() or collective_wait_init_cs() + * right before hanging on the PQ. + */ + if (cs->type == CS_TYPE_WAIT || + cs->type == CS_TYPE_COLLECTIVE_WAIT) + hl_fence_put(cs->signal_fence); + + goto out; + } + + /* Need to update CI for all queue jobs that does not get completion */ + hl_hw_queue_update_ci(cs); + + /* remove CS from CS mirror list */ + spin_lock(&hdev->cs_mirror_lock); + list_del_init(&cs->mirror_node); + spin_unlock(&hdev->cs_mirror_lock); + + cs_handle_tdr(hdev, cs); + + if (cs->staged_cs) { + /* the completion CS decrements reference for the entire + * staged submission + */ + if (cs->staged_last) { + struct hl_cs *staged_cs, *tmp_cs; + + list_for_each_entry_safe(staged_cs, tmp_cs, + &cs->staged_cs_node, staged_cs_node) + staged_cs_put(hdev, staged_cs); + } + + /* A staged CS will be a member in the list only after it + * was submitted. We used 'cs_mirror_lock' when inserting + * it to list so we will use it again when removing it + */ + if (cs->submitted) { + spin_lock(&hdev->cs_mirror_lock); + list_del(&cs->staged_cs_node); + spin_unlock(&hdev->cs_mirror_lock); + } + + /* decrement refcount to handle when first staged cs + * with encaps signals is completed. 
+ */ + if (hl_cs_cmpl->encaps_signals) + kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount, + hl_encaps_handle_do_release); + } + + if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) + && cs->encaps_signals) + kref_put(&cs->encaps_sig_hdl->refcount, + hl_encaps_handle_do_release); + +out: + /* Must be called before hl_ctx_put because inside we use ctx to get + * the device + */ + hl_debugfs_remove_cs(cs); + + hdev->shadow_cs_queue[cs->sequence & (hdev->asic_prop.max_pending_cs - 1)] = NULL; + + /* We need to mark an error for not submitted because in that case + * the hl fence release flow is different. Mainly, we don't need + * to handle hw_sob for signal/wait + */ + if (cs->timedout) + cs->fence->error = -ETIMEDOUT; + else if (cs->aborted) + cs->fence->error = -EIO; + else if (!cs->submitted) + cs->fence->error = -EBUSY; + + if (unlikely(cs->skip_reset_on_timeout)) { + dev_err(hdev->dev, + "Command submission %llu completed after %llu (s)\n", + cs->sequence, + div_u64(jiffies - cs->submission_time_jiffies, HZ)); + } + + if (cs->timestamp) { + cs->fence->timestamp = ktime_get(); + hl_push_cs_outcome(hdev, &cs->ctx->outcome_store, cs->sequence, + cs->fence->timestamp, cs->fence->error); + } + + hl_ctx_put(cs->ctx); + + complete_all(&cs->fence->completion); + complete_multi_cs(hdev, cs); + + cs_release_sob_reset_handler(hdev, cs, hl_cs_cmpl); + + hl_fence_put(cs->fence); + + kfree(cs->jobs_in_queue_cnt); + kfree(cs); +} + +static void cs_timedout(struct work_struct *work) +{ + struct hl_device *hdev; + u64 event_mask; + int rc; + struct hl_cs *cs = container_of(work, struct hl_cs, + work_tdr.work); + bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false; + + rc = cs_get_unless_zero(cs); + if (!rc) + return; + + if ((!cs->submitted) || (cs->completed)) { + cs_put(cs); + return; + } + + hdev = cs->ctx->hdev; + + if (likely(!skip_reset_on_timeout)) { + if (hdev->reset_on_lockup) + device_reset = true; + else + hdev->reset_info.needs_reset = true; + + /* Mark the CS is timed out so we won't try to cancel its TDR */ + cs->timedout = true; + } + + /* Save only the first CS timeout parameters */ + rc = atomic_cmpxchg(&hdev->captured_err_info.cs_timeout.write_enable, 1, 0); + if (rc) { + hdev->captured_err_info.cs_timeout.timestamp = ktime_get(); + hdev->captured_err_info.cs_timeout.seq = cs->sequence; + + event_mask = device_reset ? 
(HL_NOTIFIER_EVENT_CS_TIMEOUT | + HL_NOTIFIER_EVENT_DEVICE_RESET) : HL_NOTIFIER_EVENT_CS_TIMEOUT; + + hl_notifier_event_send_all(hdev, event_mask); + } + + switch (cs->type) { + case CS_TYPE_SIGNAL: + dev_err(hdev->dev, + "Signal command submission %llu has not finished in time!\n", + cs->sequence); + break; + + case CS_TYPE_WAIT: + dev_err(hdev->dev, + "Wait command submission %llu has not finished in time!\n", + cs->sequence); + break; + + case CS_TYPE_COLLECTIVE_WAIT: + dev_err(hdev->dev, + "Collective Wait command submission %llu has not finished in time!\n", + cs->sequence); + break; + + default: + dev_err(hdev->dev, + "Command submission %llu has not finished in time!\n", + cs->sequence); + break; + } + + rc = hl_state_dump(hdev); + if (rc) + dev_err(hdev->dev, "Error during system state dump %d\n", rc); + + cs_put(cs); + + if (device_reset) + hl_device_reset(hdev, HL_DRV_RESET_TDR); +} + +static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, + enum hl_cs_type cs_type, u64 user_sequence, + struct hl_cs **cs_new, u32 flags, u32 timeout) +{ + struct hl_cs_counters_atomic *cntr; + struct hl_fence *other = NULL; + struct hl_cs_compl *cs_cmpl; + struct hl_cs *cs; + int rc; + + cntr = &hdev->aggregated_cs_counters; + + cs = kzalloc(sizeof(*cs), GFP_ATOMIC); + if (!cs) + cs = kzalloc(sizeof(*cs), GFP_KERNEL); + + if (!cs) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&cntr->out_of_mem_drop_cnt); + return -ENOMEM; + } + + /* increment refcnt for context */ + hl_ctx_get(ctx); + + cs->ctx = ctx; + cs->submitted = false; + cs->completed = false; + cs->type = cs_type; + cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP); + cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS); + cs->timeout_jiffies = timeout; + cs->skip_reset_on_timeout = + hdev->reset_info.skip_reset_on_timeout || + !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT); + cs->submission_time_jiffies = jiffies; + INIT_LIST_HEAD(&cs->job_list); + INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout); + kref_init(&cs->refcount); + spin_lock_init(&cs->job_lock); + + cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_ATOMIC); + if (!cs_cmpl) + cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_KERNEL); + + if (!cs_cmpl) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&cntr->out_of_mem_drop_cnt); + rc = -ENOMEM; + goto free_cs; + } + + cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues, + sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC); + if (!cs->jobs_in_queue_cnt) + cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues, + sizeof(*cs->jobs_in_queue_cnt), GFP_KERNEL); + + if (!cs->jobs_in_queue_cnt) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&cntr->out_of_mem_drop_cnt); + rc = -ENOMEM; + goto free_cs_cmpl; + } + + cs_cmpl->hdev = hdev; + cs_cmpl->type = cs->type; + spin_lock_init(&cs_cmpl->lock); + cs->fence = &cs_cmpl->base_fence; + + spin_lock(&ctx->cs_lock); + + cs_cmpl->cs_seq = ctx->cs_sequence; + other = ctx->cs_pending[cs_cmpl->cs_seq & + (hdev->asic_prop.max_pending_cs - 1)]; + + if (other && !completion_done(&other->completion)) { + /* If the following statement is true, it means we have reached + * a point in which only part of the staged submission was + * submitted and we don't have enough room in the 'cs_pending' + * array for the rest of the submission. + * This causes a deadlock because this CS will never be + * completed as it depends on future CS's for completion. 
+ */ + if (other->cs_sequence == user_sequence) + dev_crit_ratelimited(hdev->dev, + "Staged CS %llu deadlock due to lack of resources", + user_sequence); + + dev_dbg_ratelimited(hdev->dev, + "Rejecting CS because of too many in-flights CS\n"); + atomic64_inc(&ctx->cs_counters.max_cs_in_flight_drop_cnt); + atomic64_inc(&cntr->max_cs_in_flight_drop_cnt); + rc = -EAGAIN; + goto free_fence; + } + + /* init hl_fence */ + hl_fence_init(&cs_cmpl->base_fence, cs_cmpl->cs_seq); + + cs->sequence = cs_cmpl->cs_seq; + + ctx->cs_pending[cs_cmpl->cs_seq & + (hdev->asic_prop.max_pending_cs - 1)] = + &cs_cmpl->base_fence; + ctx->cs_sequence++; + + hl_fence_get(&cs_cmpl->base_fence); + + hl_fence_put(other); + + spin_unlock(&ctx->cs_lock); + + *cs_new = cs; + + return 0; + +free_fence: + spin_unlock(&ctx->cs_lock); + kfree(cs->jobs_in_queue_cnt); +free_cs_cmpl: + kfree(cs_cmpl); +free_cs: + kfree(cs); + hl_ctx_put(ctx); + return rc; +} + +static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs) +{ + struct hl_cs_job *job, *tmp; + + staged_cs_put(hdev, cs); + + list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node) + hl_complete_job(hdev, job); +} + +void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush) +{ + int i; + struct hl_cs *cs, *tmp; + + if (!skip_wq_flush) { + flush_workqueue(hdev->ts_free_obj_wq); + + /* flush all completions before iterating over the CS mirror list in + * order to avoid a race with the release functions + */ + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + flush_workqueue(hdev->cq_wq[i]); + + flush_workqueue(hdev->cs_cmplt_wq); + } + + /* Make sure we don't have leftovers in the CS mirror list */ + list_for_each_entry_safe(cs, tmp, &hdev->cs_mirror_list, mirror_node) { + cs_get(cs); + cs->aborted = true; + dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n", + cs->ctx->asid, cs->sequence); + cs_rollback(hdev, cs); + cs_put(cs); + } + + force_complete_multi_cs(hdev); +} + +static void +wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt) +{ + struct hl_user_pending_interrupt *pend, *temp; + unsigned long flags; + + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + list_for_each_entry_safe(pend, temp, &interrupt->wait_list_head, wait_list_node) { + if (pend->ts_reg_info.buf) { + list_del(&pend->wait_list_node); + hl_mmap_mem_buf_put(pend->ts_reg_info.buf); + hl_cb_put(pend->ts_reg_info.cq_cb); + } else { + pend->fence.error = -EIO; + complete_all(&pend->fence.completion); + } + } + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); +} + +void hl_release_pending_user_interrupts(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_user_interrupt *interrupt; + int i; + + if (!prop->user_interrupt_count) + return; + + /* We iterate through the user interrupt requests and waking up all + * user threads waiting for interrupt completion. We iterate the + * list under a lock, this is why all user threads, once awake, + * will wait on the same lock and will release the waiting object upon + * unlock. 
+ */ + + for (i = 0 ; i < prop->user_interrupt_count ; i++) { + interrupt = &hdev->user_interrupt[i]; + wake_pending_user_interrupt_threads(interrupt); + } + + interrupt = &hdev->common_user_cq_interrupt; + wake_pending_user_interrupt_threads(interrupt); + + interrupt = &hdev->common_decoder_interrupt; + wake_pending_user_interrupt_threads(interrupt); +} + +static void job_wq_completion(struct work_struct *work) +{ + struct hl_cs_job *job = container_of(work, struct hl_cs_job, + finish_work); + struct hl_cs *cs = job->cs; + struct hl_device *hdev = cs->ctx->hdev; + + /* job is no longer needed */ + hl_complete_job(hdev, job); +} + +static void cs_completion(struct work_struct *work) +{ + struct hl_cs *cs = container_of(work, struct hl_cs, finish_work); + struct hl_device *hdev = cs->ctx->hdev; + struct hl_cs_job *job, *tmp; + + list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node) + hl_complete_job(hdev, job); +} + +static int validate_queue_index(struct hl_device *hdev, + struct hl_cs_chunk *chunk, + enum hl_queue_type *queue_type, + bool *is_kernel_allocated_cb) +{ + struct asic_fixed_properties *asic = &hdev->asic_prop; + struct hw_queue_properties *hw_queue_prop; + + /* This must be checked here to prevent out-of-bounds access to + * hw_queues_props array + */ + if (chunk->queue_index >= asic->max_queues) { + dev_err(hdev->dev, "Queue index %d is invalid\n", + chunk->queue_index); + return -EINVAL; + } + + hw_queue_prop = &asic->hw_queues_props[chunk->queue_index]; + + if (hw_queue_prop->type == QUEUE_TYPE_NA) { + dev_err(hdev->dev, "Queue index %d is not applicable\n", + chunk->queue_index); + return -EINVAL; + } + + if (hw_queue_prop->binned) { + dev_err(hdev->dev, "Queue index %d is binned out\n", + chunk->queue_index); + return -EINVAL; + } + + if (hw_queue_prop->driver_only) { + dev_err(hdev->dev, + "Queue index %d is restricted for the kernel driver\n", + chunk->queue_index); + return -EINVAL; + } + + /* When hw queue type isn't QUEUE_TYPE_HW, + * USER_ALLOC_CB flag shall be referred as "don't care". 
+ */ + if (hw_queue_prop->type == QUEUE_TYPE_HW) { + if (chunk->cs_chunk_flags & HL_CS_CHUNK_FLAGS_USER_ALLOC_CB) { + if (!(hw_queue_prop->cb_alloc_flags & CB_ALLOC_USER)) { + dev_err(hdev->dev, + "Queue index %d doesn't support user CB\n", + chunk->queue_index); + return -EINVAL; + } + + *is_kernel_allocated_cb = false; + } else { + if (!(hw_queue_prop->cb_alloc_flags & + CB_ALLOC_KERNEL)) { + dev_err(hdev->dev, + "Queue index %d doesn't support kernel CB\n", + chunk->queue_index); + return -EINVAL; + } + + *is_kernel_allocated_cb = true; + } + } else { + *is_kernel_allocated_cb = !!(hw_queue_prop->cb_alloc_flags + & CB_ALLOC_KERNEL); + } + + *queue_type = hw_queue_prop->type; + return 0; +} + +static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev, + struct hl_mem_mgr *mmg, + struct hl_cs_chunk *chunk) +{ + struct hl_cb *cb; + + cb = hl_cb_get(mmg, chunk->cb_handle); + if (!cb) { + dev_err(hdev->dev, "CB handle 0x%llx invalid\n", chunk->cb_handle); + return NULL; + } + + if ((chunk->cb_size < 8) || (chunk->cb_size > cb->size)) { + dev_err(hdev->dev, "CB size %u invalid\n", chunk->cb_size); + goto release_cb; + } + + atomic_inc(&cb->cs_cnt); + + return cb; + +release_cb: + hl_cb_put(cb); + return NULL; +} + +struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, + enum hl_queue_type queue_type, bool is_kernel_allocated_cb) +{ + struct hl_cs_job *job; + + job = kzalloc(sizeof(*job), GFP_ATOMIC); + if (!job) + job = kzalloc(sizeof(*job), GFP_KERNEL); + + if (!job) + return NULL; + + kref_init(&job->refcount); + job->queue_type = queue_type; + job->is_kernel_allocated_cb = is_kernel_allocated_cb; + + if (is_cb_patched(hdev, job)) + INIT_LIST_HEAD(&job->userptr_list); + + if (job->queue_type == QUEUE_TYPE_EXT) + INIT_WORK(&job->finish_work, job_wq_completion); + + return job; +} + +static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags) +{ + if (cs_type_flags & HL_CS_FLAGS_SIGNAL) + return CS_TYPE_SIGNAL; + else if (cs_type_flags & HL_CS_FLAGS_WAIT) + return CS_TYPE_WAIT; + else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT) + return CS_TYPE_COLLECTIVE_WAIT; + else if (cs_type_flags & HL_CS_FLAGS_RESERVE_SIGNALS_ONLY) + return CS_RESERVE_SIGNALS; + else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY) + return CS_UNRESERVE_SIGNALS; + else if (cs_type_flags & HL_CS_FLAGS_ENGINE_CORE_COMMAND) + return CS_TYPE_ENGINE_CORE; + else + return CS_TYPE_DEFAULT; +} + +static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_ctx *ctx = hpriv->ctx; + u32 cs_type_flags, num_chunks; + enum hl_device_status status; + enum hl_cs_type cs_type; + bool is_sync_stream; + + if (!hl_device_operational(hdev, &status)) { + return -EBUSY; + } + + if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) && + !hdev->supports_staged_submission) { + dev_err(hdev->dev, "staged submission not supported"); + return -EPERM; + } + + cs_type_flags = args->in.cs_flags & HL_CS_FLAGS_TYPE_MASK; + + if (unlikely(cs_type_flags && !is_power_of_2(cs_type_flags))) { + dev_err(hdev->dev, + "CS type flags are mutually exclusive, context %d\n", + ctx->asid); + return -EINVAL; + } + + cs_type = hl_cs_get_cs_type(cs_type_flags); + num_chunks = args->in.num_chunks_execute; + + is_sync_stream = (cs_type == CS_TYPE_SIGNAL || cs_type == CS_TYPE_WAIT || + cs_type == CS_TYPE_COLLECTIVE_WAIT); + + if (unlikely(is_sync_stream && !hdev->supports_sync_stream)) { + dev_err(hdev->dev, "Sync stream CS is not supported\n"); + return -EINVAL; + } + + 
if (cs_type == CS_TYPE_DEFAULT) { + if (!num_chunks) { + dev_err(hdev->dev, "Got execute CS with 0 chunks, context %d\n", ctx->asid); + return -EINVAL; + } + } else if (is_sync_stream && num_chunks != 1) { + dev_err(hdev->dev, + "Sync stream CS mandates one chunk only, context %d\n", + ctx->asid); + return -EINVAL; + } + + return 0; +} + +static int hl_cs_copy_chunk_array(struct hl_device *hdev, + struct hl_cs_chunk **cs_chunk_array, + void __user *chunks, u32 num_chunks, + struct hl_ctx *ctx) +{ + u32 size_to_copy; + + if (num_chunks > HL_MAX_JOBS_PER_CS) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt); + dev_err(hdev->dev, + "Number of chunks can NOT be larger than %d\n", + HL_MAX_JOBS_PER_CS); + return -EINVAL; + } + + *cs_chunk_array = kmalloc_array(num_chunks, sizeof(**cs_chunk_array), + GFP_ATOMIC); + if (!*cs_chunk_array) + *cs_chunk_array = kmalloc_array(num_chunks, + sizeof(**cs_chunk_array), GFP_KERNEL); + if (!*cs_chunk_array) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt); + return -ENOMEM; + } + + size_to_copy = num_chunks * sizeof(struct hl_cs_chunk); + if (copy_from_user(*cs_chunk_array, chunks, size_to_copy)) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt); + dev_err(hdev->dev, "Failed to copy cs chunk array from user\n"); + kfree(*cs_chunk_array); + return -EFAULT; + } + + return 0; +} + +static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs, + u64 sequence, u32 flags, + u32 encaps_signal_handle) +{ + if (!(flags & HL_CS_FLAGS_STAGED_SUBMISSION)) + return 0; + + cs->staged_last = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_LAST); + cs->staged_first = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST); + + if (cs->staged_first) { + /* Staged CS sequence is the first CS sequence */ + INIT_LIST_HEAD(&cs->staged_cs_node); + cs->staged_sequence = cs->sequence; + + if (cs->encaps_signals) + cs->encaps_sig_hdl_id = encaps_signal_handle; + } else { + /* User sequence will be validated in 'hl_hw_queue_schedule_cs' + * under the cs_mirror_lock + */ + cs->staged_sequence = sequence; + } + + /* Increment CS reference if needed */ + staged_cs_get(hdev, cs); + + cs->staged_cs = true; + + return 0; +} + +static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid) +{ + int i; + + for (i = 0; i < hdev->stream_master_qid_arr_size; i++) + if (qid == hdev->stream_master_qid_arr[i]) + return BIT(i); + + return 0; +} + +static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, + u32 num_chunks, u64 *cs_seq, u32 flags, + u32 encaps_signals_handle, u32 timeout, + u16 *signal_initial_sob_count) +{ + bool staged_mid, int_queues_only = true, using_hw_queues = false; + struct hl_device *hdev = hpriv->hdev; + struct hl_cs_chunk *cs_chunk_array; + struct hl_cs_counters_atomic *cntr; + struct hl_ctx *ctx = hpriv->ctx; + struct hl_cs_job *job; + struct hl_cs *cs; + struct hl_cb *cb; + u64 user_sequence; + u8 stream_master_qid_map = 0; + int rc, i; + + cntr = &hdev->aggregated_cs_counters; + user_sequence = *cs_seq; + *cs_seq = ULLONG_MAX; + + rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks, + hpriv->ctx); + if (rc) + goto out; + + if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) && + !(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST)) + staged_mid = true; + else + staged_mid = false; + + rc = allocate_cs(hdev, hpriv->ctx, 
CS_TYPE_DEFAULT, + staged_mid ? user_sequence : ULLONG_MAX, &cs, flags, + timeout); + if (rc) + goto free_cs_chunk_array; + + *cs_seq = cs->sequence; + + hl_debugfs_add_cs(cs); + + rc = cs_staged_submission(hdev, cs, user_sequence, flags, + encaps_signals_handle); + if (rc) + goto free_cs_object; + + /* If this is a staged submission we must return the staged sequence + * rather than the internal CS sequence + */ + if (cs->staged_cs) + *cs_seq = cs->staged_sequence; + + /* Validate ALL the CS chunks before submitting the CS */ + for (i = 0 ; i < num_chunks ; i++) { + struct hl_cs_chunk *chunk = &cs_chunk_array[i]; + enum hl_queue_type queue_type; + bool is_kernel_allocated_cb; + + rc = validate_queue_index(hdev, chunk, &queue_type, + &is_kernel_allocated_cb); + if (rc) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + goto free_cs_object; + } + + if (is_kernel_allocated_cb) { + cb = get_cb_from_cs_chunk(hdev, &hpriv->mem_mgr, chunk); + if (!cb) { + atomic64_inc( + &ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + rc = -EINVAL; + goto free_cs_object; + } + } else { + cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle; + } + + if (queue_type == QUEUE_TYPE_EXT || + queue_type == QUEUE_TYPE_HW) { + int_queues_only = false; + + /* + * store which stream are being used for external/HW + * queues of this CS + */ + if (hdev->supports_wait_for_multi_cs) + stream_master_qid_map |= + get_stream_master_qid_mask(hdev, + chunk->queue_index); + } + + if (queue_type == QUEUE_TYPE_HW) + using_hw_queues = true; + + job = hl_cs_allocate_job(hdev, queue_type, + is_kernel_allocated_cb); + if (!job) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&cntr->out_of_mem_drop_cnt); + dev_err(hdev->dev, "Failed to allocate a new job\n"); + rc = -ENOMEM; + if (is_kernel_allocated_cb) + goto release_cb; + + goto free_cs_object; + } + + job->id = i + 1; + job->cs = cs; + job->user_cb = cb; + job->user_cb_size = chunk->cb_size; + job->hw_queue_id = chunk->queue_index; + + cs->jobs_in_queue_cnt[job->hw_queue_id]++; + cs->jobs_cnt++; + + list_add_tail(&job->cs_node, &cs->job_list); + + /* + * Increment CS reference. 
When CS reference is 0, CS is + * done and can be signaled to user and free all its resources + * Only increment for JOB on external or H/W queues, because + * only for those JOBs we get completion + */ + if (cs_needs_completion(cs) && + (job->queue_type == QUEUE_TYPE_EXT || + job->queue_type == QUEUE_TYPE_HW)) + cs_get(cs); + + hl_debugfs_add_job(hdev, job); + + rc = cs_parser(hpriv, job); + if (rc) { + atomic64_inc(&ctx->cs_counters.parsing_drop_cnt); + atomic64_inc(&cntr->parsing_drop_cnt); + dev_err(hdev->dev, + "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n", + cs->ctx->asid, cs->sequence, job->id, rc); + goto free_cs_object; + } + } + + /* We allow a CS with any queue type combination as long as it does + * not get a completion + */ + if (int_queues_only && cs_needs_completion(cs)) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + dev_err(hdev->dev, + "Reject CS %d.%llu since it contains only internal queues jobs and needs completion\n", + cs->ctx->asid, cs->sequence); + rc = -EINVAL; + goto free_cs_object; + } + + if (using_hw_queues) + INIT_WORK(&cs->finish_work, cs_completion); + + /* + * store the (external/HW queues) streams used by the CS in the + * fence object for multi-CS completion + */ + if (hdev->supports_wait_for_multi_cs) + cs->fence->stream_master_qid_map = stream_master_qid_map; + + rc = hl_hw_queue_schedule_cs(cs); + if (rc) { + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to submit CS %d.%llu to H/W queues, error %d\n", + cs->ctx->asid, cs->sequence, rc); + goto free_cs_object; + } + + *signal_initial_sob_count = cs->initial_sob_count; + + rc = HL_CS_STATUS_SUCCESS; + goto put_cs; + +release_cb: + atomic_dec(&cb->cs_cnt); + hl_cb_put(cb); +free_cs_object: + cs_rollback(hdev, cs); + *cs_seq = ULLONG_MAX; + /* The path below is both for good and erroneous exits */ +put_cs: + /* We finished with the CS in this function, so put the ref */ + cs_put(cs); +free_cs_chunk_array: + kfree(cs_chunk_array); +out: + return rc; +} + +static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args, + u64 *cs_seq) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_ctx *ctx = hpriv->ctx; + bool need_soft_reset = false; + int rc = 0, do_ctx_switch = 0; + void __user *chunks; + u32 num_chunks, tmp; + u16 sob_count; + int ret; + + if (hdev->supports_ctx_switch) + do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0); + + if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) { + mutex_lock(&hpriv->restore_phase_mutex); + + if (do_ctx_switch) { + rc = hdev->asic_funcs->context_switch(hdev, ctx->asid); + if (rc) { + dev_err_ratelimited(hdev->dev, + "Failed to switch to context %d, rejecting CS! %d\n", + ctx->asid, rc); + /* + * If we timedout, or if the device is not IDLE + * while we want to do context-switch (-EBUSY), + * we need to soft-reset because QMAN is + * probably stuck. 
However, we can't call to + * reset here directly because of deadlock, so + * need to do it at the very end of this + * function + */ + if ((rc == -ETIMEDOUT) || (rc == -EBUSY)) + need_soft_reset = true; + mutex_unlock(&hpriv->restore_phase_mutex); + goto out; + } + } + + hdev->asic_funcs->restore_phase_topology(hdev); + + chunks = (void __user *) (uintptr_t) args->in.chunks_restore; + num_chunks = args->in.num_chunks_restore; + + if (!num_chunks) { + dev_dbg(hdev->dev, + "Need to run restore phase but restore CS is empty\n"); + rc = 0; + } else { + rc = cs_ioctl_default(hpriv, chunks, num_chunks, + cs_seq, 0, 0, hdev->timeout_jiffies, &sob_count); + } + + mutex_unlock(&hpriv->restore_phase_mutex); + + if (rc) { + dev_err(hdev->dev, + "Failed to submit restore CS for context %d (%d)\n", + ctx->asid, rc); + goto out; + } + + /* Need to wait for restore completion before execution phase */ + if (num_chunks) { + enum hl_cs_wait_status status; +wait_again: + ret = _hl_cs_wait_ioctl(hdev, ctx, + jiffies_to_usecs(hdev->timeout_jiffies), + *cs_seq, &status, NULL); + if (ret) { + if (ret == -ERESTARTSYS) { + usleep_range(100, 200); + goto wait_again; + } + + dev_err(hdev->dev, + "Restore CS for context %d failed to complete %d\n", + ctx->asid, ret); + rc = -ENOEXEC; + goto out; + } + } + + if (hdev->supports_ctx_switch) + ctx->thread_ctx_switch_wait_token = 1; + + } else if (hdev->supports_ctx_switch && !ctx->thread_ctx_switch_wait_token) { + rc = hl_poll_timeout_memory(hdev, + &ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1), + 100, jiffies_to_usecs(hdev->timeout_jiffies), false); + + if (rc == -ETIMEDOUT) { + dev_err(hdev->dev, + "context switch phase timeout (%d)\n", tmp); + goto out; + } + } + +out: + if ((rc == -ETIMEDOUT || rc == -EBUSY) && (need_soft_reset)) + hl_device_reset(hdev, 0); + + return rc; +} + +/* + * hl_cs_signal_sob_wraparound_handler: handle SOB value wrapaound case. + * if the SOB value reaches the max value move to the other SOB reserved + * to the queue. + * @hdev: pointer to device structure + * @q_idx: stream queue index + * @hw_sob: the H/W SOB used in this signal CS. + * @count: signals count + * @encaps_sig: tells whether it's reservation for encaps signals or not. + * + * Note that this function must be called while hw_queues_lock is taken. + */ +int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx, + struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig) + +{ + struct hl_sync_stream_properties *prop; + struct hl_hw_sob *sob = *hw_sob, *other_sob; + u8 other_sob_offset; + + prop = &hdev->kernel_queues[q_idx].sync_stream_prop; + + hw_sob_get(sob); + + /* check for wraparound */ + if (prop->next_sob_val + count >= HL_MAX_SOB_VAL) { + /* + * Decrement as we reached the max value. + * The release function won't be called here as we've + * just incremented the refcount right before calling this + * function. + */ + hw_sob_put_err(sob); + + /* + * check the other sob value, if it still in use then fail + * otherwise make the switch + */ + other_sob_offset = (prop->curr_sob_offset + 1) % HL_RSVD_SOBS; + other_sob = &prop->hw_sob[other_sob_offset]; + + if (kref_read(&other_sob->kref) != 1) { + dev_err(hdev->dev, "error: Cannot switch SOBs q_idx: %d\n", + q_idx); + return -EINVAL; + } + + /* + * next_sob_val always points to the next available signal + * in the sob, so in encaps signals it will be the next one + * after reserving the required amount. 
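+		 * Since we are switching to the other SOB here, the value is
+		 * assigned rather than incremented.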
+ */ + if (encaps_sig) + prop->next_sob_val = count + 1; + else + prop->next_sob_val = count; + + /* only two SOBs are currently in use */ + prop->curr_sob_offset = other_sob_offset; + *hw_sob = other_sob; + + /* + * check if other_sob needs reset, then do it before using it + * for the reservation or the next signal cs. + * we do it here, and for both encaps and regular signal cs + * cases in order to avoid possible races of two kref_put + * of the sob which can occur at the same time if we move the + * sob reset(kref_put) to cs_do_release function. + * in addition, if we have combination of cs signal and + * encaps, and at the point we need to reset the sob there was + * no more reservations and only signal cs keep coming, + * in such case we need signal_cs to put the refcount and + * reset the sob. + */ + if (other_sob->need_reset) + hw_sob_put(other_sob); + + if (encaps_sig) { + /* set reset indication for the sob */ + sob->need_reset = true; + hw_sob_get(other_sob); + } + + dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n", + prop->curr_sob_offset, q_idx); + } else { + prop->next_sob_val += count; + } + + return 0; +} + +static int cs_ioctl_extract_signal_seq(struct hl_device *hdev, + struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx, + bool encaps_signals) +{ + u64 *signal_seq_arr = NULL; + u32 size_to_copy, signal_seq_arr_len; + int rc = 0; + + if (encaps_signals) { + *signal_seq = chunk->encaps_signal_seq; + return 0; + } + + signal_seq_arr_len = chunk->num_signal_seq_arr; + + /* currently only one signal seq is supported */ + if (signal_seq_arr_len != 1) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt); + dev_err(hdev->dev, + "Wait for signal CS supports only one signal CS seq\n"); + return -EINVAL; + } + + signal_seq_arr = kmalloc_array(signal_seq_arr_len, + sizeof(*signal_seq_arr), + GFP_ATOMIC); + if (!signal_seq_arr) + signal_seq_arr = kmalloc_array(signal_seq_arr_len, + sizeof(*signal_seq_arr), + GFP_KERNEL); + if (!signal_seq_arr) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt); + return -ENOMEM; + } + + size_to_copy = signal_seq_arr_len * sizeof(*signal_seq_arr); + if (copy_from_user(signal_seq_arr, + u64_to_user_ptr(chunk->signal_seq_arr), + size_to_copy)) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt); + dev_err(hdev->dev, + "Failed to copy signal seq array from user\n"); + rc = -EFAULT; + goto out; + } + + /* currently it is guaranteed to have only one signal seq */ + *signal_seq = signal_seq_arr[0]; + +out: + kfree(signal_seq_arr); + + return rc; +} + +static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev, + struct hl_ctx *ctx, struct hl_cs *cs, + enum hl_queue_type q_type, u32 q_idx, u32 encaps_signal_offset) +{ + struct hl_cs_counters_atomic *cntr; + struct hl_cs_job *job; + struct hl_cb *cb; + u32 cb_size; + + cntr = &hdev->aggregated_cs_counters; + + job = hl_cs_allocate_job(hdev, q_type, true); + if (!job) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&cntr->out_of_mem_drop_cnt); + dev_err(hdev->dev, "Failed to allocate a new job\n"); + return -ENOMEM; + } + + if (cs->type == CS_TYPE_WAIT) + cb_size = hdev->asic_funcs->get_wait_cb_size(hdev); + else + cb_size = hdev->asic_funcs->get_signal_cb_size(hdev); + + cb = hl_cb_kernel_create(hdev, cb_size, + q_type == QUEUE_TYPE_HW && 
hdev->mmu_enable); + if (!cb) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&cntr->out_of_mem_drop_cnt); + kfree(job); + return -EFAULT; + } + + job->id = 0; + job->cs = cs; + job->user_cb = cb; + atomic_inc(&job->user_cb->cs_cnt); + job->user_cb_size = cb_size; + job->hw_queue_id = q_idx; + + if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) + && cs->encaps_signals) + job->encaps_sig_wait_offset = encaps_signal_offset; + /* + * No need in parsing, user CB is the patched CB. + * We call hl_cb_destroy() out of two reasons - we don't need the CB in + * the CB idr anymore and to decrement its refcount as it was + * incremented inside hl_cb_kernel_create(). + */ + job->patched_cb = job->user_cb; + job->job_cb_size = job->user_cb_size; + hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle); + + /* increment refcount as for external queues we get completion */ + cs_get(cs); + + cs->jobs_in_queue_cnt[job->hw_queue_id]++; + cs->jobs_cnt++; + + list_add_tail(&job->cs_node, &cs->job_list); + + hl_debugfs_add_job(hdev, job); + + return 0; +} + +static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv, + u32 q_idx, u32 count, + u32 *handle_id, u32 *sob_addr, + u32 *signals_count) +{ + struct hw_queue_properties *hw_queue_prop; + struct hl_sync_stream_properties *prop; + struct hl_device *hdev = hpriv->hdev; + struct hl_cs_encaps_sig_handle *handle; + struct hl_encaps_signals_mgr *mgr; + struct hl_hw_sob *hw_sob; + int hdl_id; + int rc = 0; + + if (count >= HL_MAX_SOB_VAL) { + dev_err(hdev->dev, "signals count(%u) exceeds the max SOB value\n", + count); + rc = -EINVAL; + goto out; + } + + if (q_idx >= hdev->asic_prop.max_queues) { + dev_err(hdev->dev, "Queue index %d is invalid\n", + q_idx); + rc = -EINVAL; + goto out; + } + + hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx]; + + if (!hw_queue_prop->supports_sync_stream) { + dev_err(hdev->dev, + "Queue index %d does not support sync stream operations\n", + q_idx); + rc = -EINVAL; + goto out; + } + + prop = &hdev->kernel_queues[q_idx].sync_stream_prop; + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) { + rc = -ENOMEM; + goto out; + } + + handle->count = count; + + hl_ctx_get(hpriv->ctx); + handle->ctx = hpriv->ctx; + mgr = &hpriv->ctx->sig_mgr; + + spin_lock(&mgr->lock); + hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_ATOMIC); + spin_unlock(&mgr->lock); + + if (hdl_id < 0) { + dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n"); + rc = -EINVAL; + goto put_ctx; + } + + handle->id = hdl_id; + handle->q_idx = q_idx; + handle->hdev = hdev; + kref_init(&handle->refcount); + + hdev->asic_funcs->hw_queues_lock(hdev); + + hw_sob = &prop->hw_sob[prop->curr_sob_offset]; + + /* + * Increment the SOB value by count by user request + * to reserve those signals + * check if the signals amount to reserve is not exceeding the max sob + * value, if yes then switch sob. + */ + rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count, + true); + if (rc) { + dev_err(hdev->dev, "Failed to switch SOB\n"); + hdev->asic_funcs->hw_queues_unlock(hdev); + rc = -EINVAL; + goto remove_idr; + } + /* set the hw_sob to the handle after calling the sob wraparound handler + * since sob could have changed. 
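+	 * (the wraparound handler updates hw_sob in place when it switches
+	 * to the other SOB)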
+ */ + handle->hw_sob = hw_sob; + + /* store the current sob value for unreserve validity check, and + * signal offset support + */ + handle->pre_sob_val = prop->next_sob_val - handle->count; + + *signals_count = prop->next_sob_val; + hdev->asic_funcs->hw_queues_unlock(hdev); + + *sob_addr = handle->hw_sob->sob_addr; + *handle_id = hdl_id; + + dev_dbg(hdev->dev, + "Signals reserved, sob_id: %d, sob addr: 0x%x, last sob_val: %u, q_idx: %d, hdl_id: %d\n", + hw_sob->sob_id, handle->hw_sob->sob_addr, + prop->next_sob_val - 1, q_idx, hdl_id); + goto out; + +remove_idr: + spin_lock(&mgr->lock); + idr_remove(&mgr->handles, hdl_id); + spin_unlock(&mgr->lock); + +put_ctx: + hl_ctx_put(handle->ctx); + kfree(handle); + +out: + return rc; +} + +static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id) +{ + struct hl_cs_encaps_sig_handle *encaps_sig_hdl; + struct hl_sync_stream_properties *prop; + struct hl_device *hdev = hpriv->hdev; + struct hl_encaps_signals_mgr *mgr; + struct hl_hw_sob *hw_sob; + u32 q_idx, sob_addr; + int rc = 0; + + mgr = &hpriv->ctx->sig_mgr; + + spin_lock(&mgr->lock); + encaps_sig_hdl = idr_find(&mgr->handles, handle_id); + if (encaps_sig_hdl) { + dev_dbg(hdev->dev, "unreserve signals, handle: %u, SOB:0x%x, count: %u\n", + handle_id, encaps_sig_hdl->hw_sob->sob_addr, + encaps_sig_hdl->count); + + hdev->asic_funcs->hw_queues_lock(hdev); + + q_idx = encaps_sig_hdl->q_idx; + prop = &hdev->kernel_queues[q_idx].sync_stream_prop; + hw_sob = &prop->hw_sob[prop->curr_sob_offset]; + sob_addr = hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id); + + /* Check if sob_val got out of sync due to other + * signal submission requests which were handled + * between the reserve-unreserve calls or SOB switch + * upon reaching SOB max value. 
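+		 * The reservation can be undone only while it is still the last
+		 * operation performed on the current SOB.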
+ */ + if (encaps_sig_hdl->pre_sob_val + encaps_sig_hdl->count + != prop->next_sob_val || + sob_addr != encaps_sig_hdl->hw_sob->sob_addr) { + dev_err(hdev->dev, "Cannot unreserve signals, SOB val ran out of sync, expected: %u, actual val: %u\n", + encaps_sig_hdl->pre_sob_val, + (prop->next_sob_val - encaps_sig_hdl->count)); + + hdev->asic_funcs->hw_queues_unlock(hdev); + rc = -EINVAL; + goto out; + } + + /* + * Decrement the SOB value by count by user request + * to unreserve those signals + */ + prop->next_sob_val -= encaps_sig_hdl->count; + + hdev->asic_funcs->hw_queues_unlock(hdev); + + hw_sob_put(hw_sob); + + /* Release the id and free allocated memory of the handle */ + idr_remove(&mgr->handles, handle_id); + hl_ctx_put(encaps_sig_hdl->ctx); + kfree(encaps_sig_hdl); + } else { + rc = -EINVAL; + dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n"); + } +out: + spin_unlock(&mgr->lock); + + return rc; +} + +static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, + void __user *chunks, u32 num_chunks, + u64 *cs_seq, u32 flags, u32 timeout, + u32 *signal_sob_addr_offset, u16 *signal_initial_sob_count) +{ + struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL; + bool handle_found = false, is_wait_cs = false, + wait_cs_submitted = false, + cs_encaps_signals = false; + struct hl_cs_chunk *cs_chunk_array, *chunk; + bool staged_cs_with_encaps_signals = false; + struct hw_queue_properties *hw_queue_prop; + struct hl_device *hdev = hpriv->hdev; + struct hl_cs_compl *sig_waitcs_cmpl; + u32 q_idx, collective_engine_id = 0; + struct hl_cs_counters_atomic *cntr; + struct hl_fence *sig_fence = NULL; + struct hl_ctx *ctx = hpriv->ctx; + enum hl_queue_type q_type; + struct hl_cs *cs; + u64 signal_seq; + int rc; + + cntr = &hdev->aggregated_cs_counters; + *cs_seq = ULLONG_MAX; + + rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks, + ctx); + if (rc) + goto out; + + /* currently it is guaranteed to have only one chunk */ + chunk = &cs_chunk_array[0]; + + if (chunk->queue_index >= hdev->asic_prop.max_queues) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + dev_err(hdev->dev, "Queue index %d is invalid\n", + chunk->queue_index); + rc = -EINVAL; + goto free_cs_chunk_array; + } + + q_idx = chunk->queue_index; + hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx]; + q_type = hw_queue_prop->type; + + if (!hw_queue_prop->supports_sync_stream) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + dev_err(hdev->dev, + "Queue index %d does not support sync stream operations\n", + q_idx); + rc = -EINVAL; + goto free_cs_chunk_array; + } + + if (cs_type == CS_TYPE_COLLECTIVE_WAIT) { + if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + dev_err(hdev->dev, + "Queue index %d is invalid\n", q_idx); + rc = -EINVAL; + goto free_cs_chunk_array; + } + + if (!hdev->nic_ports_mask) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + dev_err(hdev->dev, + "Collective operations not supported when NIC ports are disabled"); + rc = -EINVAL; + goto free_cs_chunk_array; + } + + collective_engine_id = chunk->collective_engine_id; + } + + is_wait_cs = !!(cs_type == CS_TYPE_WAIT || + cs_type == CS_TYPE_COLLECTIVE_WAIT); + + cs_encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS); + + if (is_wait_cs) { + 
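+		/* For a wait CS, first resolve which signal CS (or encapsulated
+		 * signals reservation) this CS needs to wait on.
+		 */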
rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq, + ctx, cs_encaps_signals); + if (rc) + goto free_cs_chunk_array; + + if (cs_encaps_signals) { + /* check if cs sequence has encapsulated + * signals handle + */ + struct idr *idp; + u32 id; + + spin_lock(&ctx->sig_mgr.lock); + idp = &ctx->sig_mgr.handles; + idr_for_each_entry(idp, encaps_sig_hdl, id) { + if (encaps_sig_hdl->cs_seq == signal_seq) { + /* get refcount to protect removing this handle from idr, + * needed when multiple wait cs are used with offset + * to wait on reserved encaps signals. + * Since kref_put of this handle is executed outside the + * current lock, it is possible that the handle refcount + * is 0 but it yet to be removed from the list. In this + * case need to consider the handle as not valid. + */ + if (kref_get_unless_zero(&encaps_sig_hdl->refcount)) + handle_found = true; + break; + } + } + spin_unlock(&ctx->sig_mgr.lock); + + if (!handle_found) { + /* treat as signal CS already finished */ + dev_dbg(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n", + signal_seq); + rc = 0; + goto free_cs_chunk_array; + } + + /* validate also the signal offset value */ + if (chunk->encaps_signal_offset > + encaps_sig_hdl->count) { + dev_err(hdev->dev, "offset(%u) value exceed max reserved signals count(%u)!\n", + chunk->encaps_signal_offset, + encaps_sig_hdl->count); + rc = -EINVAL; + goto free_cs_chunk_array; + } + } + + sig_fence = hl_ctx_get_fence(ctx, signal_seq); + if (IS_ERR(sig_fence)) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + dev_err(hdev->dev, + "Failed to get signal CS with seq 0x%llx\n", + signal_seq); + rc = PTR_ERR(sig_fence); + goto free_cs_chunk_array; + } + + if (!sig_fence) { + /* signal CS already finished */ + rc = 0; + goto free_cs_chunk_array; + } + + sig_waitcs_cmpl = + container_of(sig_fence, struct hl_cs_compl, base_fence); + + staged_cs_with_encaps_signals = !! + (sig_waitcs_cmpl->type == CS_TYPE_DEFAULT && + (flags & HL_CS_FLAGS_ENCAP_SIGNALS)); + + if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL && + !staged_cs_with_encaps_signals) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + dev_err(hdev->dev, + "CS seq 0x%llx is not of a signal/encaps-signal CS\n", + signal_seq); + hl_fence_put(sig_fence); + rc = -EINVAL; + goto free_cs_chunk_array; + } + + if (completion_done(&sig_fence->completion)) { + /* signal CS already finished */ + hl_fence_put(sig_fence); + rc = 0; + goto free_cs_chunk_array; + } + } + + rc = allocate_cs(hdev, ctx, cs_type, ULLONG_MAX, &cs, flags, timeout); + if (rc) { + if (is_wait_cs) + hl_fence_put(sig_fence); + + goto free_cs_chunk_array; + } + + /* + * Save the signal CS fence for later initialization right before + * hanging the wait CS on the queue. + * for encaps signals case, we save the cs sequence and handle pointer + * for later initialization. + */ + if (is_wait_cs) { + cs->signal_fence = sig_fence; + /* store the handle pointer, so we don't have to + * look for it again, later on the flow + * when we need to set SOB info in hw_queue. 
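+		 * This is relevant only for an encaps-signals wait CS, for which
+		 * the handle refcount was taken above.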
+ */ + if (cs->encaps_signals) + cs->encaps_sig_hdl = encaps_sig_hdl; + } + + hl_debugfs_add_cs(cs); + + *cs_seq = cs->sequence; + + if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL) + rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type, + q_idx, chunk->encaps_signal_offset); + else if (cs_type == CS_TYPE_COLLECTIVE_WAIT) + rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx, + cs, q_idx, collective_engine_id, + chunk->encaps_signal_offset); + else { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + rc = -EINVAL; + } + + if (rc) + goto free_cs_object; + + if (q_type == QUEUE_TYPE_HW) + INIT_WORK(&cs->finish_work, cs_completion); + + rc = hl_hw_queue_schedule_cs(cs); + if (rc) { + /* In case wait cs failed here, it means the signal cs + * already completed. we want to free all it's related objects + * but we don't want to fail the ioctl. + */ + if (is_wait_cs) + rc = 0; + else if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to submit CS %d.%llu to H/W queues, error %d\n", + ctx->asid, cs->sequence, rc); + goto free_cs_object; + } + + *signal_sob_addr_offset = cs->sob_addr_offset; + *signal_initial_sob_count = cs->initial_sob_count; + + rc = HL_CS_STATUS_SUCCESS; + if (is_wait_cs) + wait_cs_submitted = true; + goto put_cs; + +free_cs_object: + cs_rollback(hdev, cs); + *cs_seq = ULLONG_MAX; + /* The path below is both for good and erroneous exits */ +put_cs: + /* We finished with the CS in this function, so put the ref */ + cs_put(cs); +free_cs_chunk_array: + if (!wait_cs_submitted && cs_encaps_signals && handle_found && + is_wait_cs) + kref_put(&encaps_sig_hdl->refcount, + hl_encaps_handle_do_release); + kfree(cs_chunk_array); +out: + return rc; +} + +static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores, + u32 num_engine_cores, u32 core_command) +{ + int rc; + struct hl_device *hdev = hpriv->hdev; + void __user *engine_cores_arr; + u32 *cores; + + if (!num_engine_cores || num_engine_cores > hdev->asic_prop.num_engine_cores) { + dev_err(hdev->dev, "Number of engine cores %d is invalid\n", num_engine_cores); + return -EINVAL; + } + + if (core_command != HL_ENGINE_CORE_RUN && core_command != HL_ENGINE_CORE_HALT) { + dev_err(hdev->dev, "Engine core command is invalid\n"); + return -EINVAL; + } + + engine_cores_arr = (void __user *) (uintptr_t) engine_cores; + cores = kmalloc_array(num_engine_cores, sizeof(u32), GFP_KERNEL); + if (!cores) + return -ENOMEM; + + if (copy_from_user(cores, engine_cores_arr, num_engine_cores * sizeof(u32))) { + dev_err(hdev->dev, "Failed to copy core-ids array from user\n"); + kfree(cores); + return -EFAULT; + } + + rc = hdev->asic_funcs->set_engine_cores(hdev, cores, num_engine_cores, core_command); + kfree(cores); + + return rc; +} + +int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) +{ + union hl_cs_args *args = data; + enum hl_cs_type cs_type = 0; + u64 cs_seq = ULONG_MAX; + void __user *chunks; + u32 num_chunks, flags, timeout, + signals_count = 0, sob_addr = 0, handle_id = 0; + u16 sob_initial_count = 0; + int rc; + + rc = hl_cs_sanity_checks(hpriv, args); + if (rc) + goto out; + + rc = hl_cs_ctx_switch(hpriv, args, &cs_seq); + if (rc) + goto out; + + cs_type = hl_cs_get_cs_type(args->in.cs_flags & + ~HL_CS_FLAGS_FORCE_RESTORE); + chunks = (void __user *) (uintptr_t) args->in.chunks_execute; + num_chunks = args->in.num_chunks_execute; + flags = args->in.cs_flags; + + /* In case this is a staged CS, user should supply the CS sequence */ + if ((flags & 
HL_CS_FLAGS_STAGED_SUBMISSION) && + !(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST)) + cs_seq = args->in.seq; + + timeout = flags & HL_CS_FLAGS_CUSTOM_TIMEOUT + ? msecs_to_jiffies(args->in.timeout * 1000) + : hpriv->hdev->timeout_jiffies; + + switch (cs_type) { + case CS_TYPE_SIGNAL: + case CS_TYPE_WAIT: + case CS_TYPE_COLLECTIVE_WAIT: + rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks, + &cs_seq, args->in.cs_flags, timeout, + &sob_addr, &sob_initial_count); + break; + case CS_RESERVE_SIGNALS: + rc = cs_ioctl_reserve_signals(hpriv, + args->in.encaps_signals_q_idx, + args->in.encaps_signals_count, + &handle_id, &sob_addr, &signals_count); + break; + case CS_UNRESERVE_SIGNALS: + rc = cs_ioctl_unreserve_signals(hpriv, + args->in.encaps_sig_handle_id); + break; + case CS_TYPE_ENGINE_CORE: + rc = cs_ioctl_engine_cores(hpriv, args->in.engine_cores, + args->in.num_engine_cores, args->in.core_command); + break; + default: + rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq, + args->in.cs_flags, + args->in.encaps_sig_handle_id, + timeout, &sob_initial_count); + break; + } +out: + if (rc != -EAGAIN) { + memset(args, 0, sizeof(*args)); + + switch (cs_type) { + case CS_RESERVE_SIGNALS: + args->out.handle_id = handle_id; + args->out.sob_base_addr_offset = sob_addr; + args->out.count = signals_count; + break; + case CS_TYPE_SIGNAL: + args->out.sob_base_addr_offset = sob_addr; + args->out.sob_count_before_submission = sob_initial_count; + args->out.seq = cs_seq; + break; + case CS_TYPE_DEFAULT: + args->out.sob_count_before_submission = sob_initial_count; + args->out.seq = cs_seq; + break; + default: + args->out.seq = cs_seq; + break; + } + + args->out.status = rc; + } + + return rc; +} + +static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence, + enum hl_cs_wait_status *status, u64 timeout_us, s64 *timestamp) +{ + struct hl_device *hdev = ctx->hdev; + ktime_t timestamp_kt; + long completion_rc; + int rc = 0, error; + + if (IS_ERR(fence)) { + rc = PTR_ERR(fence); + if (rc == -EINVAL) + dev_notice_ratelimited(hdev->dev, + "Can't wait on CS %llu because current CS is at seq %llu\n", + seq, ctx->cs_sequence); + return rc; + } + + if (!fence) { + if (!hl_pop_cs_outcome(&ctx->outcome_store, seq, ×tamp_kt, &error)) { + dev_dbg(hdev->dev, + "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n", + seq, ctx->cs_sequence); + *status = CS_WAIT_STATUS_GONE; + return 0; + } + + completion_rc = 1; + goto report_results; + } + + if (!timeout_us) { + completion_rc = completion_done(&fence->completion); + } else { + unsigned long timeout; + + timeout = (timeout_us == MAX_SCHEDULE_TIMEOUT) ? + timeout_us : usecs_to_jiffies(timeout_us); + completion_rc = + wait_for_completion_interruptible_timeout( + &fence->completion, timeout); + } + + error = fence->error; + timestamp_kt = fence->timestamp; + +report_results: + if (completion_rc > 0) { + *status = CS_WAIT_STATUS_COMPLETED; + if (timestamp) + *timestamp = ktime_to_ns(timestamp_kt); + } else { + *status = CS_WAIT_STATUS_BUSY; + } + + if (error == -ETIMEDOUT || error == -EIO) + rc = error; + + return rc; +} + +/* + * hl_cs_poll_fences - iterate CS fences to check for CS completion + * + * @mcs_data: multi-CS internal data + * @mcs_compl: multi-CS completion structure + * + * @return 0 on success, otherwise non 0 error code + * + * The function iterates on all CS sequence in the list and set bit in + * completion_bitmap for each completed CS. 
+ * While iterating, the function sets the stream map of each fence in the fence + * array in the completion QID stream map to be used by CSs to perform + * completion to the multi-CS context. + * This function shall be called after taking context ref + */ +static int hl_cs_poll_fences(struct multi_cs_data *mcs_data, struct multi_cs_completion *mcs_compl) +{ + struct hl_fence **fence_ptr = mcs_data->fence_arr; + struct hl_device *hdev = mcs_data->ctx->hdev; + int i, rc, arr_len = mcs_data->arr_len; + u64 *seq_arr = mcs_data->seq_arr; + ktime_t max_ktime, first_cs_time; + enum hl_cs_wait_status status; + + memset(fence_ptr, 0, arr_len * sizeof(struct hl_fence *)); + + /* get all fences under the same lock */ + rc = hl_ctx_get_fences(mcs_data->ctx, seq_arr, fence_ptr, arr_len); + if (rc) + return rc; + + /* + * re-initialize the completion here to handle 2 possible cases: + * 1. CS will complete the multi-CS prior clearing the completion. in which + * case the fence iteration is guaranteed to catch the CS completion. + * 2. the completion will occur after re-init of the completion. + * in which case we will wake up immediately in wait_for_completion. + */ + reinit_completion(&mcs_compl->completion); + + /* + * set to maximum time to verify timestamp is valid: if at the end + * this value is maintained- no timestamp was updated + */ + max_ktime = ktime_set(KTIME_SEC_MAX, 0); + first_cs_time = max_ktime; + + for (i = 0; i < arr_len; i++, fence_ptr++) { + struct hl_fence *fence = *fence_ptr; + + /* + * In order to prevent case where we wait until timeout even though a CS associated + * with the multi-CS actually completed we do things in the below order: + * 1. for each fence set it's QID map in the multi-CS completion QID map. This way + * any CS can, potentially, complete the multi CS for the specific QID (note + * that once completion is initialized, calling complete* and then wait on the + * completion will cause it to return at once) + * 2. only after allowing multi-CS completion for the specific QID we check whether + * the specific CS already completed (and thus the wait for completion part will + * be skipped). if the CS not completed it is guaranteed that completing CS will + * wake up the completion. + */ + if (fence) + mcs_compl->stream_master_qid_map |= fence->stream_master_qid_map; + + /* + * function won't sleep as it is called with timeout 0 (i.e. + * poll the fence) + */ + rc = hl_wait_for_fence(mcs_data->ctx, seq_arr[i], fence, &status, 0, NULL); + if (rc) { + dev_err(hdev->dev, + "wait_for_fence error :%d for CS seq %llu\n", + rc, seq_arr[i]); + break; + } + + switch (status) { + case CS_WAIT_STATUS_BUSY: + /* CS did not finished, QID to wait on already stored */ + break; + case CS_WAIT_STATUS_COMPLETED: + /* + * Using mcs_handling_done to avoid possibility of mcs_data + * returns to user indicating CS completed before it finished + * all of its mcs handling, to avoid race the next time the + * user waits for mcs. + * note: when reaching this case fence is definitely not NULL + * but NULL check was added to overcome static analysis + */ + if (fence && !fence->mcs_handling_done) { + /* + * in case multi CS is completed but MCS handling not done + * we "complete" the multi CS to prevent it from waiting + * until time-out and the "multi-CS handling done" will have + * another chance at the next iteration + */ + complete_all(&mcs_compl->completion); + break; + } + + mcs_data->completion_bitmap |= BIT(i); + /* + * For all completed CSs we take the earliest timestamp. 
+ * For this we have to validate that the timestamp is + * earliest of all timestamps so far. + */ + if (fence && mcs_data->update_ts && + (ktime_compare(fence->timestamp, first_cs_time) < 0)) + first_cs_time = fence->timestamp; + break; + case CS_WAIT_STATUS_GONE: + mcs_data->update_ts = false; + mcs_data->gone_cs = true; + /* + * It is possible to get an old sequence numbers from user + * which related to already completed CSs and their fences + * already gone. In this case, CS set as completed but + * no need to consider its QID for mcs completion. + */ + mcs_data->completion_bitmap |= BIT(i); + break; + default: + dev_err(hdev->dev, "Invalid fence status\n"); + return -EINVAL; + } + + } + + hl_fences_put(mcs_data->fence_arr, arr_len); + + if (mcs_data->update_ts && + (ktime_compare(first_cs_time, max_ktime) != 0)) + mcs_data->timestamp = ktime_to_ns(first_cs_time); + + return rc; +} + +static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq, + enum hl_cs_wait_status *status, s64 *timestamp) +{ + struct hl_fence *fence; + int rc = 0; + + if (timestamp) + *timestamp = 0; + + hl_ctx_get(ctx); + + fence = hl_ctx_get_fence(ctx, seq); + + rc = hl_wait_for_fence(ctx, seq, fence, status, timeout_us, timestamp); + hl_fence_put(fence); + hl_ctx_put(ctx); + + return rc; +} + +static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs) +{ + if (usecs <= U32_MAX) + return usecs_to_jiffies(usecs); + + /* + * If the value in nanoseconds is larger than 64 bit, use the largest + * 64 bit value. + */ + if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC))) + return nsecs_to_jiffies(U64_MAX); + + return nsecs_to_jiffies(usecs * NSEC_PER_USEC); +} + +/* + * hl_wait_multi_cs_completion_init - init completion structure + * + * @hdev: pointer to habanalabs device structure + * @stream_master_bitmap: stream master QIDs map, set bit indicates stream + * master QID to wait on + * + * @return valid completion struct pointer on success, otherwise error pointer + * + * up to MULTI_CS_MAX_USER_CTX calls can be done concurrently to the driver. + * the function gets the first available completion (by marking it "used") + * and initialize its values. + */ +static struct multi_cs_completion *hl_wait_multi_cs_completion_init(struct hl_device *hdev) +{ + struct multi_cs_completion *mcs_compl; + int i; + + /* find free multi_cs completion structure */ + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { + mcs_compl = &hdev->multi_cs_completion[i]; + spin_lock(&mcs_compl->lock); + if (!mcs_compl->used) { + mcs_compl->used = 1; + mcs_compl->timestamp = 0; + /* + * init QID map to 0 to avoid completion by CSs. 
the actual QID map + * to multi-CS CSs will be set incrementally at a later stage + */ + mcs_compl->stream_master_qid_map = 0; + spin_unlock(&mcs_compl->lock); + break; + } + spin_unlock(&mcs_compl->lock); + } + + if (i == MULTI_CS_MAX_USER_CTX) { + dev_err(hdev->dev, "no available multi-CS completion structure\n"); + return ERR_PTR(-ENOMEM); + } + return mcs_compl; +} + +/* + * hl_wait_multi_cs_completion_fini - return completion structure and set as + * unused + * + * @mcs_compl: pointer to the completion structure + */ +static void hl_wait_multi_cs_completion_fini( + struct multi_cs_completion *mcs_compl) +{ + /* + * free completion structure, do it under lock to be in-sync with the + * thread that signals completion + */ + spin_lock(&mcs_compl->lock); + mcs_compl->used = 0; + spin_unlock(&mcs_compl->lock); +} + +/* + * hl_wait_multi_cs_completion - wait for first CS to complete + * + * @mcs_data: multi-CS internal data + * + * @return 0 on success, otherwise non 0 error code + */ +static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data, + struct multi_cs_completion *mcs_compl) +{ + long completion_rc; + + completion_rc = wait_for_completion_interruptible_timeout(&mcs_compl->completion, + mcs_data->timeout_jiffies); + + /* update timestamp */ + if (completion_rc > 0) + mcs_data->timestamp = mcs_compl->timestamp; + + mcs_data->wait_status = completion_rc; + + return 0; +} + +/* + * hl_multi_cs_completion_init - init array of multi-CS completion structures + * + * @hdev: pointer to habanalabs device structure + */ +void hl_multi_cs_completion_init(struct hl_device *hdev) +{ + struct multi_cs_completion *mcs_cmpl; + int i; + + for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) { + mcs_cmpl = &hdev->multi_cs_completion[i]; + mcs_cmpl->used = 0; + spin_lock_init(&mcs_cmpl->lock); + init_completion(&mcs_cmpl->completion); + } +} + +/* + * hl_multi_cs_wait_ioctl - implementation of the multi-CS wait ioctl + * + * @hpriv: pointer to the private data of the fd + * @data: pointer to multi-CS wait ioctl in/out args + * + */ +static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) +{ + struct multi_cs_completion *mcs_compl; + struct hl_device *hdev = hpriv->hdev; + struct multi_cs_data mcs_data = {}; + union hl_wait_cs_args *args = data; + struct hl_ctx *ctx = hpriv->ctx; + struct hl_fence **fence_arr; + void __user *seq_arr; + u32 size_to_copy; + u64 *cs_seq_arr; + u8 seq_arr_len; + int rc; + + if (!hdev->supports_wait_for_multi_cs) { + dev_err(hdev->dev, "Wait for multi CS is not supported\n"); + return -EPERM; + } + + seq_arr_len = args->in.seq_arr_len; + + if (seq_arr_len > HL_WAIT_MULTI_CS_LIST_MAX_LEN) { + dev_err(hdev->dev, "Can wait only up to %d CSs, input sequence is of length %u\n", + HL_WAIT_MULTI_CS_LIST_MAX_LEN, seq_arr_len); + return -EINVAL; + } + + /* allocate memory for sequence array */ + cs_seq_arr = + kmalloc_array(seq_arr_len, sizeof(*cs_seq_arr), GFP_KERNEL); + if (!cs_seq_arr) + return -ENOMEM; + + /* copy CS sequence array from user */ + seq_arr = (void __user *) (uintptr_t) args->in.seq; + size_to_copy = seq_arr_len * sizeof(*cs_seq_arr); + if (copy_from_user(cs_seq_arr, seq_arr, size_to_copy)) { + dev_err(hdev->dev, "Failed to copy multi-cs sequence array from user\n"); + rc = -EFAULT; + goto free_seq_arr; + } + + /* allocate array for the fences */ + fence_arr = kmalloc_array(seq_arr_len, sizeof(struct hl_fence *), GFP_KERNEL); + if (!fence_arr) { + rc = -ENOMEM; + goto free_seq_arr; + } + + /* initialize the multi-CS internal data */ + mcs_data.ctx 
= ctx; + mcs_data.seq_arr = cs_seq_arr; + mcs_data.fence_arr = fence_arr; + mcs_data.arr_len = seq_arr_len; + + hl_ctx_get(ctx); + + /* wait (with timeout) for the first CS to be completed */ + mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us); + mcs_compl = hl_wait_multi_cs_completion_init(hdev); + if (IS_ERR(mcs_compl)) { + rc = PTR_ERR(mcs_compl); + goto put_ctx; + } + + /* poll all CS fences, extract timestamp */ + mcs_data.update_ts = true; + rc = hl_cs_poll_fences(&mcs_data, mcs_compl); + /* + * skip wait for CS completion when one of the below is true: + * - an error on the poll function + * - one or more CS in the list completed + * - the user called ioctl with timeout 0 + */ + if (rc || mcs_data.completion_bitmap || !args->in.timeout_us) + goto completion_fini; + + while (true) { + rc = hl_wait_multi_cs_completion(&mcs_data, mcs_compl); + if (rc || (mcs_data.wait_status == 0)) + break; + + /* + * poll fences once again to update the CS map. + * no timestamp should be updated this time. + */ + mcs_data.update_ts = false; + rc = hl_cs_poll_fences(&mcs_data, mcs_compl); + + if (rc || mcs_data.completion_bitmap) + break; + + /* + * if hl_wait_multi_cs_completion returned before timeout (i.e. + * it got a completion) it either got completed by CS in the multi CS list + * (in which case the indication will be non empty completion_bitmap) or it + * got completed by CS submitted to one of the shared stream master but + * not in the multi CS list (in which case we should wait again but modify + * the timeout and set timestamp as zero to let a CS related to the current + * multi-CS set a new, relevant, timestamp) + */ + mcs_data.timeout_jiffies = mcs_data.wait_status; + mcs_compl->timestamp = 0; + } + +completion_fini: + hl_wait_multi_cs_completion_fini(mcs_compl); + +put_ctx: + hl_ctx_put(ctx); + kfree(fence_arr); + +free_seq_arr: + kfree(cs_seq_arr); + + if (rc) + return rc; + + if (mcs_data.wait_status == -ERESTARTSYS) { + dev_err_ratelimited(hdev->dev, + "user process got signal while waiting for Multi-CS\n"); + return -EINTR; + } + + /* update output args */ + memset(args, 0, sizeof(*args)); + + if (mcs_data.completion_bitmap) { + args->out.status = HL_WAIT_CS_STATUS_COMPLETED; + args->out.cs_completion_map = mcs_data.completion_bitmap; + + /* if timestamp not 0- it's valid */ + if (mcs_data.timestamp) { + args->out.timestamp_nsec = mcs_data.timestamp; + args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD; + } + + /* update if some CS was gone */ + if (!mcs_data.timestamp) + args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE; + } else { + args->out.status = HL_WAIT_CS_STATUS_BUSY; + } + + return 0; +} + +static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) +{ + struct hl_device *hdev = hpriv->hdev; + union hl_wait_cs_args *args = data; + enum hl_cs_wait_status status; + u64 seq = args->in.seq; + s64 timestamp; + int rc; + + rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq, &status, ×tamp); + + if (rc == -ERESTARTSYS) { + dev_err_ratelimited(hdev->dev, + "user process got signal while waiting for CS handle %llu\n", + seq); + return -EINTR; + } + + memset(args, 0, sizeof(*args)); + + if (rc) { + if (rc == -ETIMEDOUT) { + dev_err_ratelimited(hdev->dev, + "CS %llu has timed-out while user process is waiting for it\n", + seq); + args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT; + } else if (rc == -EIO) { + dev_err_ratelimited(hdev->dev, + "CS %llu has been aborted while user process is waiting for it\n", + seq); + args->out.status = 
HL_WAIT_CS_STATUS_ABORTED; + } + return rc; + } + + if (timestamp) { + args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD; + args->out.timestamp_nsec = timestamp; + } + + switch (status) { + case CS_WAIT_STATUS_GONE: + args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE; + fallthrough; + case CS_WAIT_STATUS_COMPLETED: + args->out.status = HL_WAIT_CS_STATUS_COMPLETED; + break; + case CS_WAIT_STATUS_BUSY: + default: + args->out.status = HL_WAIT_CS_STATUS_BUSY; + break; + } + + return 0; +} + +static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf, + struct hl_cb *cq_cb, + u64 ts_offset, u64 cq_offset, u64 target_value, + spinlock_t *wait_list_lock, + struct hl_user_pending_interrupt **pend) +{ + struct hl_ts_buff *ts_buff = buf->private; + struct hl_user_pending_interrupt *requested_offset_record = + (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address + + ts_offset; + struct hl_user_pending_interrupt *cb_last = + (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address + + (ts_buff->kernel_buff_size / sizeof(struct hl_user_pending_interrupt)); + unsigned long flags, iter_counter = 0; + u64 current_cq_counter; + + /* Validate ts_offset not exceeding last max */ + if (requested_offset_record >= cb_last) { + dev_err(buf->mmg->dev, "Ts offset exceeds max CB offset(0x%llx)\n", + (u64)(uintptr_t)cb_last); + return -EINVAL; + } + +start_over: + spin_lock_irqsave(wait_list_lock, flags); + + /* Unregister only if we didn't reach the target value + * since in this case there will be no handling in irq context + * and then it's safe to delete the node out of the interrupt list + * then re-use it on other interrupt + */ + if (requested_offset_record->ts_reg_info.in_use) { + current_cq_counter = *requested_offset_record->cq_kernel_addr; + if (current_cq_counter < requested_offset_record->cq_target_value) { + list_del(&requested_offset_record->wait_list_node); + spin_unlock_irqrestore(wait_list_lock, flags); + + hl_mmap_mem_buf_put(requested_offset_record->ts_reg_info.buf); + hl_cb_put(requested_offset_record->ts_reg_info.cq_cb); + + dev_dbg(buf->mmg->dev, + "ts node removed from interrupt list now can re-use\n"); + } else { + dev_dbg(buf->mmg->dev, + "ts node in middle of irq handling\n"); + + /* irq handling in the middle give it time to finish */ + spin_unlock_irqrestore(wait_list_lock, flags); + usleep_range(1, 10); + if (++iter_counter == MAX_TS_ITER_NUM) { + dev_err(buf->mmg->dev, + "handling registration interrupt took too long!!\n"); + return -EINVAL; + } + + goto start_over; + } + } else { + /* Fill up the new registration node info */ + requested_offset_record->ts_reg_info.buf = buf; + requested_offset_record->ts_reg_info.cq_cb = cq_cb; + requested_offset_record->ts_reg_info.timestamp_kernel_addr = + (u64 *) ts_buff->user_buff_address + ts_offset; + requested_offset_record->cq_kernel_addr = + (u64 *) cq_cb->kernel_address + cq_offset; + requested_offset_record->cq_target_value = target_value; + + spin_unlock_irqrestore(wait_list_lock, flags); + } + + *pend = requested_offset_record; + + dev_dbg(buf->mmg->dev, "Found available node in TS kernel CB %p\n", + requested_offset_record); + return 0; +} + +static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, + struct hl_mem_mgr *cb_mmg, struct hl_mem_mgr *mmg, + u64 timeout_us, u64 cq_counters_handle, u64 cq_counters_offset, + u64 target_value, struct hl_user_interrupt *interrupt, + bool register_ts_record, u64 ts_handle, u64 ts_offset, + u32 *status, u64 *timestamp) +{ + struct 
hl_user_pending_interrupt *pend; + struct hl_mmap_mem_buf *buf; + struct hl_cb *cq_cb; + unsigned long timeout, flags; + long completion_rc; + int rc = 0; + + timeout = hl_usecs64_to_jiffies(timeout_us); + + hl_ctx_get(ctx); + + cq_cb = hl_cb_get(cb_mmg, cq_counters_handle); + if (!cq_cb) { + rc = -EINVAL; + goto put_ctx; + } + + /* Validate the cq offset */ + if (((u64 *) cq_cb->kernel_address + cq_counters_offset) >= + ((u64 *) cq_cb->kernel_address + (cq_cb->size / sizeof(u64)))) { + rc = -EINVAL; + goto put_cq_cb; + } + + if (register_ts_record) { + dev_dbg(hdev->dev, "Timestamp registration: interrupt id: %u, ts offset: %llu, cq_offset: %llu\n", + interrupt->interrupt_id, ts_offset, cq_counters_offset); + buf = hl_mmap_mem_buf_get(mmg, ts_handle); + if (!buf) { + rc = -EINVAL; + goto put_cq_cb; + } + + /* get ts buffer record */ + rc = ts_buff_get_kernel_ts_record(buf, cq_cb, ts_offset, + cq_counters_offset, target_value, + &interrupt->wait_list_lock, &pend); + if (rc) + goto put_ts_buff; + } else { + pend = kzalloc(sizeof(*pend), GFP_KERNEL); + if (!pend) { + rc = -ENOMEM; + goto put_cq_cb; + } + hl_fence_init(&pend->fence, ULONG_MAX); + pend->cq_kernel_addr = (u64 *) cq_cb->kernel_address + cq_counters_offset; + pend->cq_target_value = target_value; + } + + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + + /* We check for completion value as interrupt could have been received + * before we added the node to the wait list + */ + if (*pend->cq_kernel_addr >= target_value) { + if (register_ts_record) + pend->ts_reg_info.in_use = 0; + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + + *status = HL_WAIT_CS_STATUS_COMPLETED; + + if (register_ts_record) { + *pend->ts_reg_info.timestamp_kernel_addr = ktime_get_ns(); + goto put_ts_buff; + } else { + pend->fence.timestamp = ktime_get(); + goto set_timestamp; + } + } else if (!timeout_us) { + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + *status = HL_WAIT_CS_STATUS_BUSY; + pend->fence.timestamp = ktime_get(); + goto set_timestamp; + } + + /* Add pending user interrupt to relevant list for the interrupt + * handler to monitor. + * Note that we cannot have sorted list by target value, + * in order to shorten the list pass loop, since + * same list could have nodes for different cq counter handle. + * Note: + * Mark ts buff offset as in use here in the spinlock protection area + * to avoid getting in the re-use section in ts_buff_get_kernel_ts_record + * before adding the node to the list. this scenario might happen when + * multiple threads are racing on same offset and one thread could + * set the ts buff in ts_buff_get_kernel_ts_record then the other thread + * takes over and get to ts_buff_get_kernel_ts_record and then we will try + * to re-use the same ts buff offset, and will try to delete a non existing + * node from the list. 
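+	 * Hence in_use is set below while wait_list_lock is still held.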
+ */ + if (register_ts_record) + pend->ts_reg_info.in_use = 1; + + list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head); + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + + if (register_ts_record) { + rc = *status = HL_WAIT_CS_STATUS_COMPLETED; + goto ts_registration_exit; + } + + /* Wait for interrupt handler to signal completion */ + completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion, + timeout); + if (completion_rc > 0) { + *status = HL_WAIT_CS_STATUS_COMPLETED; + } else { + if (completion_rc == -ERESTARTSYS) { + dev_err_ratelimited(hdev->dev, + "user process got signal while waiting for interrupt ID %d\n", + interrupt->interrupt_id); + rc = -EINTR; + *status = HL_WAIT_CS_STATUS_ABORTED; + } else { + if (pend->fence.error == -EIO) { + dev_err_ratelimited(hdev->dev, + "interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n", + pend->fence.error); + rc = -EIO; + *status = HL_WAIT_CS_STATUS_ABORTED; + } else { + /* The wait has timed-out. We don't know anything beyond that + * because the workload wasn't submitted through the driver. + * Therefore, from driver's perspective, the workload is still + * executing. + */ + rc = 0; + *status = HL_WAIT_CS_STATUS_BUSY; + } + } + } + + /* + * We keep removing the node from list here, and not at the irq handler + * for completion timeout case. and if it's a registration + * for ts record, the node will be deleted in the irq handler after + * we reach the target value. + */ + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + list_del(&pend->wait_list_node); + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + +set_timestamp: + *timestamp = ktime_to_ns(pend->fence.timestamp); + kfree(pend); + hl_cb_put(cq_cb); +ts_registration_exit: + hl_ctx_put(ctx); + + return rc; + +put_ts_buff: + hl_mmap_mem_buf_put(buf); +put_cq_cb: + hl_cb_put(cq_cb); +put_ctx: + hl_ctx_put(ctx); + + return rc; +} + +static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_ctx *ctx, + u64 timeout_us, u64 user_address, + u64 target_value, struct hl_user_interrupt *interrupt, + u32 *status, + u64 *timestamp) +{ + struct hl_user_pending_interrupt *pend; + unsigned long timeout, flags; + u64 completion_value; + long completion_rc; + int rc = 0; + + timeout = hl_usecs64_to_jiffies(timeout_us); + + hl_ctx_get(ctx); + + pend = kzalloc(sizeof(*pend), GFP_KERNEL); + if (!pend) { + hl_ctx_put(ctx); + return -ENOMEM; + } + + hl_fence_init(&pend->fence, ULONG_MAX); + + /* Add pending user interrupt to relevant list for the interrupt + * handler to monitor + */ + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head); + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + + /* We check for completion value as interrupt could have been received + * before we added the node to the wait list + */ + if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) { + dev_err(hdev->dev, "Failed to copy completion value from user\n"); + rc = -EFAULT; + goto remove_pending_user_interrupt; + } + + if (completion_value >= target_value) { + *status = HL_WAIT_CS_STATUS_COMPLETED; + /* There was no interrupt, we assume the completion is now. 
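+		 * Hence the fence timestamp is taken here, not in the irq handler.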
*/ + pend->fence.timestamp = ktime_get(); + } else { + *status = HL_WAIT_CS_STATUS_BUSY; + } + + if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED)) + goto remove_pending_user_interrupt; + +wait_again: + /* Wait for interrupt handler to signal completion */ + completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion, + timeout); + + /* If timeout did not expire we need to perform the comparison. + * If comparison fails, keep waiting until timeout expires + */ + if (completion_rc > 0) { + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + /* reinit_completion must be called before we check for user + * completion value, otherwise, if interrupt is received after + * the comparison and before the next wait_for_completion, + * we will reach timeout and fail + */ + reinit_completion(&pend->fence.completion); + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + + if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) { + dev_err(hdev->dev, "Failed to copy completion value from user\n"); + rc = -EFAULT; + + goto remove_pending_user_interrupt; + } + + if (completion_value >= target_value) { + *status = HL_WAIT_CS_STATUS_COMPLETED; + } else if (pend->fence.error) { + dev_err_ratelimited(hdev->dev, + "interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n", + pend->fence.error); + /* set the command completion status as ABORTED */ + *status = HL_WAIT_CS_STATUS_ABORTED; + } else { + timeout = completion_rc; + goto wait_again; + } + } else if (completion_rc == -ERESTARTSYS) { + dev_err_ratelimited(hdev->dev, + "user process got signal while waiting for interrupt ID %d\n", + interrupt->interrupt_id); + rc = -EINTR; + } else { + /* The wait has timed-out. We don't know anything beyond that + * because the workload wasn't submitted through the driver. + * Therefore, from driver's perspective, the workload is still + * executing. 
+ */ + rc = 0; + *status = HL_WAIT_CS_STATUS_BUSY; + } + +remove_pending_user_interrupt: + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + list_del(&pend->wait_list_node); + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + + *timestamp = ktime_to_ns(pend->fence.timestamp); + + kfree(pend); + hl_ctx_put(ctx); + + return rc; +} + +static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) +{ + u16 interrupt_id, first_interrupt, last_interrupt; + struct hl_device *hdev = hpriv->hdev; + struct asic_fixed_properties *prop; + struct hl_user_interrupt *interrupt; + union hl_wait_cs_args *args = data; + u32 status = HL_WAIT_CS_STATUS_BUSY; + u64 timestamp = 0; + int rc, int_idx; + + prop = &hdev->asic_prop; + + if (!(prop->user_interrupt_count + prop->user_dec_intr_count)) { + dev_err(hdev->dev, "no user interrupts allowed"); + return -EPERM; + } + + interrupt_id = FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags); + + first_interrupt = prop->first_available_user_interrupt; + last_interrupt = prop->first_available_user_interrupt + prop->user_interrupt_count - 1; + + if (interrupt_id < prop->user_dec_intr_count) { + + /* Check if the requested core is enabled */ + if (!(prop->decoder_enabled_mask & BIT(interrupt_id))) { + dev_err(hdev->dev, "interrupt on a disabled core(%u) not allowed", + interrupt_id); + return -EINVAL; + } + + interrupt = &hdev->user_interrupt[interrupt_id]; + + } else if (interrupt_id >= first_interrupt && interrupt_id <= last_interrupt) { + + int_idx = interrupt_id - first_interrupt + prop->user_dec_intr_count; + interrupt = &hdev->user_interrupt[int_idx]; + + } else if (interrupt_id == HL_COMMON_USER_CQ_INTERRUPT_ID) { + interrupt = &hdev->common_user_cq_interrupt; + } else if (interrupt_id == HL_COMMON_DEC_INTERRUPT_ID) { + interrupt = &hdev->common_decoder_interrupt; + } else { + dev_err(hdev->dev, "invalid user interrupt %u", interrupt_id); + return -EINVAL; + } + + if (args->in.flags & HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ) + rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &hpriv->mem_mgr, &hpriv->mem_mgr, + args->in.interrupt_timeout_us, args->in.cq_counters_handle, + args->in.cq_counters_offset, + args->in.target, interrupt, + !!(args->in.flags & HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT), + args->in.timestamp_handle, args->in.timestamp_offset, + &status, ×tamp); + else + rc = _hl_interrupt_wait_ioctl_user_addr(hdev, hpriv->ctx, + args->in.interrupt_timeout_us, args->in.addr, + args->in.target, interrupt, &status, + ×tamp); + if (rc) + return rc; + + memset(args, 0, sizeof(*args)); + args->out.status = status; + + if (timestamp) { + args->out.timestamp_nsec = timestamp; + args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD; + } + + return 0; +} + +int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data) +{ + union hl_wait_cs_args *args = data; + u32 flags = args->in.flags; + int rc; + + /* If the device is not operational, no point in waiting for any command submission or + * user interrupt + */ + if (!hl_device_operational(hpriv->hdev, NULL)) + return -EBUSY; + + if (flags & HL_WAIT_CS_FLAGS_INTERRUPT) + rc = hl_interrupt_wait_ioctl(hpriv, data); + else if (flags & HL_WAIT_CS_FLAGS_MULTI_CS) + rc = hl_multi_cs_wait_ioctl(hpriv, data); + else + rc = hl_cs_wait_ioctl(hpriv, data); + + return rc; +} diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c new file mode 100644 index 000000000..2f4620b79 --- /dev/null +++ b/drivers/misc/habanalabs/common/context.c @@ -0,0 +1,435 @@ +// 
SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2021 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +#include <linux/slab.h> + +void hl_encaps_handle_do_release(struct kref *ref) +{ + struct hl_cs_encaps_sig_handle *handle = + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); + struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr; + + spin_lock(&mgr->lock); + idr_remove(&mgr->handles, handle->id); + spin_unlock(&mgr->lock); + + hl_ctx_put(handle->ctx); + kfree(handle); +} + +static void hl_encaps_handle_do_release_sob(struct kref *ref) +{ + struct hl_cs_encaps_sig_handle *handle = + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); + struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr; + + /* if we're here, then there was a signals reservation but cs with + * encaps signals wasn't submitted, so need to put refcount + * to hw_sob taken at the reservation. + */ + hw_sob_put(handle->hw_sob); + + spin_lock(&mgr->lock); + idr_remove(&mgr->handles, handle->id); + spin_unlock(&mgr->lock); + + hl_ctx_put(handle->ctx); + kfree(handle); +} + +static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr) +{ + spin_lock_init(&mgr->lock); + idr_init(&mgr->handles); +} + +static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, + struct hl_encaps_signals_mgr *mgr) +{ + struct hl_cs_encaps_sig_handle *handle; + struct idr *idp; + u32 id; + + idp = &mgr->handles; + + if (!idr_is_empty(idp)) { + dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n"); + idr_for_each_entry(idp, handle, id) + kref_put(&handle->refcount, + hl_encaps_handle_do_release_sob); + } + + idr_destroy(&mgr->handles); +} + +static void hl_ctx_fini(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + int i; + + /* Release all allocated HW block mapped list entries and destroy + * the mutex. + */ + hl_hw_block_mem_fini(ctx); + + /* + * If we arrived here, there are no jobs waiting for this context + * on its queues so we can safely remove it. + * This is because for each CS, we increment the ref count and for + * every CS that was finished we decrement it and we won't arrive + * to this function unless the ref count is 0 + */ + + for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++) + hl_fence_put(ctx->cs_pending[i]); + + kfree(ctx->cs_pending); + + if (ctx->asid != HL_KERNEL_ASID_ID) { + dev_dbg(hdev->dev, "closing user context %d\n", ctx->asid); + + /* The engines are stopped as there is no executing CS, but the + * Coresight might be still working by accessing addresses + * related to the stopped engines. Hence stop it explicitly. 
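+		 * Debug mode is therefore turned off here, before the ASIC
+		 * context teardown.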
+ */ + if (hdev->in_debug) + hl_device_set_debug_mode(hdev, ctx, false); + + hdev->asic_funcs->ctx_fini(ctx); + + hl_dec_ctx_fini(ctx); + + hl_cb_va_pool_fini(ctx); + hl_vm_ctx_fini(ctx); + hl_asid_free(hdev, ctx->asid); + hl_encaps_sig_mgr_fini(hdev, &ctx->sig_mgr); + } else { + dev_dbg(hdev->dev, "closing kernel context\n"); + hdev->asic_funcs->ctx_fini(ctx); + hl_vm_ctx_fini(ctx); + hl_mmu_ctx_fini(ctx); + } +} + +void hl_ctx_do_release(struct kref *ref) +{ + struct hl_ctx *ctx; + + ctx = container_of(ref, struct hl_ctx, refcount); + + hl_ctx_fini(ctx); + + if (ctx->hpriv) { + struct hl_fpriv *hpriv = ctx->hpriv; + + mutex_lock(&hpriv->ctx_lock); + hpriv->ctx = NULL; + mutex_unlock(&hpriv->ctx_lock); + + hl_hpriv_put(hpriv); + } + + kfree(ctx); +} + +int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv) +{ + struct hl_ctx_mgr *ctx_mgr = &hpriv->ctx_mgr; + struct hl_ctx *ctx; + int rc; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + rc = -ENOMEM; + goto out_err; + } + + mutex_lock(&ctx_mgr->lock); + rc = idr_alloc(&ctx_mgr->handles, ctx, 1, 0, GFP_KERNEL); + mutex_unlock(&ctx_mgr->lock); + + if (rc < 0) { + dev_err(hdev->dev, "Failed to allocate IDR for a new CTX\n"); + goto free_ctx; + } + + ctx->handle = rc; + + rc = hl_ctx_init(hdev, ctx, false); + if (rc) + goto remove_from_idr; + + hl_hpriv_get(hpriv); + ctx->hpriv = hpriv; + + /* TODO: remove for multiple contexts per process */ + hpriv->ctx = ctx; + + /* TODO: remove the following line for multiple process support */ + hdev->is_compute_ctx_active = true; + + return 0; + +remove_from_idr: + mutex_lock(&ctx_mgr->lock); + idr_remove(&ctx_mgr->handles, ctx->handle); + mutex_unlock(&ctx_mgr->lock); +free_ctx: + kfree(ctx); +out_err: + return rc; +} + +int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) +{ + int rc = 0, i; + + ctx->hdev = hdev; + + kref_init(&ctx->refcount); + + ctx->cs_sequence = 1; + spin_lock_init(&ctx->cs_lock); + atomic_set(&ctx->thread_ctx_switch_token, 1); + ctx->thread_ctx_switch_wait_token = 0; + ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs, + sizeof(struct hl_fence *), + GFP_KERNEL); + if (!ctx->cs_pending) + return -ENOMEM; + + INIT_LIST_HEAD(&ctx->outcome_store.used_list); + INIT_LIST_HEAD(&ctx->outcome_store.free_list); + hash_init(ctx->outcome_store.outcome_map); + for (i = 0; i < ARRAY_SIZE(ctx->outcome_store.nodes_pool); ++i) + list_add(&ctx->outcome_store.nodes_pool[i].list_link, + &ctx->outcome_store.free_list); + + hl_hw_block_mem_init(ctx); + + if (is_kernel_ctx) { + ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */ + rc = hl_vm_ctx_init(ctx); + if (rc) { + dev_err(hdev->dev, "Failed to init mem ctx module\n"); + rc = -ENOMEM; + goto err_hw_block_mem_fini; + } + + rc = hdev->asic_funcs->ctx_init(ctx); + if (rc) { + dev_err(hdev->dev, "ctx_init failed\n"); + goto err_vm_ctx_fini; + } + } else { + ctx->asid = hl_asid_alloc(hdev); + if (!ctx->asid) { + dev_err(hdev->dev, "No free ASID, failed to create context\n"); + rc = -ENOMEM; + goto err_hw_block_mem_fini; + } + + rc = hl_vm_ctx_init(ctx); + if (rc) { + dev_err(hdev->dev, "Failed to init mem ctx module\n"); + rc = -ENOMEM; + goto err_asid_free; + } + + rc = hl_cb_va_pool_init(ctx); + if (rc) { + dev_err(hdev->dev, + "Failed to init VA pool for mapped CB\n"); + goto err_vm_ctx_fini; + } + + rc = hdev->asic_funcs->ctx_init(ctx); + if (rc) { + dev_err(hdev->dev, "ctx_init failed\n"); + goto err_cb_va_pool_fini; + } + + hl_encaps_sig_mgr_init(&ctx->sig_mgr); + + 
dev_dbg(hdev->dev, "create user context %d\n", ctx->asid); + } + + return 0; + +err_cb_va_pool_fini: + hl_cb_va_pool_fini(ctx); +err_vm_ctx_fini: + hl_vm_ctx_fini(ctx); +err_asid_free: + if (ctx->asid != HL_KERNEL_ASID_ID) + hl_asid_free(hdev, ctx->asid); +err_hw_block_mem_fini: + hl_hw_block_mem_fini(ctx); + kfree(ctx->cs_pending); + + return rc; +} + +static int hl_ctx_get_unless_zero(struct hl_ctx *ctx) +{ + return kref_get_unless_zero(&ctx->refcount); +} + +void hl_ctx_get(struct hl_ctx *ctx) +{ + kref_get(&ctx->refcount); +} + +int hl_ctx_put(struct hl_ctx *ctx) +{ + return kref_put(&ctx->refcount, hl_ctx_do_release); +} + +struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev) +{ + struct hl_ctx *ctx = NULL; + struct hl_fpriv *hpriv; + + mutex_lock(&hdev->fpriv_list_lock); + + list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) { + mutex_lock(&hpriv->ctx_lock); + ctx = hpriv->ctx; + if (ctx && !hl_ctx_get_unless_zero(ctx)) + ctx = NULL; + mutex_unlock(&hpriv->ctx_lock); + + /* There can only be a single user which has opened the compute device, so exit + * immediately once we find its context or if we see that it has been released + */ + break; + } + + mutex_unlock(&hdev->fpriv_list_lock); + + return ctx; +} + +/* + * hl_ctx_get_fence_locked - get CS fence under CS lock + * + * @ctx: pointer to the context structure. + * @seq: CS sequences number + * + * @return valid fence pointer on success, NULL if fence is gone, otherwise + * error pointer. + * + * NOTE: this function shall be called with cs_lock locked + */ +static struct hl_fence *hl_ctx_get_fence_locked(struct hl_ctx *ctx, u64 seq) +{ + struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop; + struct hl_fence *fence; + + if (seq >= ctx->cs_sequence) + return ERR_PTR(-EINVAL); + + if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) + return NULL; + + fence = ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]; + hl_fence_get(fence); + return fence; +} + +struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) +{ + struct hl_fence *fence; + + spin_lock(&ctx->cs_lock); + + fence = hl_ctx_get_fence_locked(ctx, seq); + + spin_unlock(&ctx->cs_lock); + + return fence; +} + +/* + * hl_ctx_get_fences - get multiple CS fences under the same CS lock + * + * @ctx: pointer to the context structure. + * @seq_arr: array of CS sequences to wait for + * @fence: fence array to store the CS fences + * @arr_len: length of seq_arr and fence_arr + * + * @return 0 on success, otherwise non 0 error code + */ +int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr, + struct hl_fence **fence, u32 arr_len) +{ + struct hl_fence **fence_arr_base = fence; + int i, rc = 0; + + spin_lock(&ctx->cs_lock); + + for (i = 0; i < arr_len; i++, fence++) { + u64 seq = seq_arr[i]; + + *fence = hl_ctx_get_fence_locked(ctx, seq); + + if (IS_ERR(*fence)) { + dev_err(ctx->hdev->dev, + "Failed to get fence for CS with seq 0x%llx\n", + seq); + rc = PTR_ERR(*fence); + break; + } + } + + spin_unlock(&ctx->cs_lock); + + if (rc) + hl_fences_put(fence_arr_base, i); + + return rc; +} + +/* + * hl_ctx_mgr_init - initialize the context manager + * + * @ctx_mgr: pointer to context manager structure + * + * This manager is an object inside the hpriv object of the user process. + * The function is called when a user process opens the FD. 
+ */ +void hl_ctx_mgr_init(struct hl_ctx_mgr *ctx_mgr) +{ + mutex_init(&ctx_mgr->lock); + idr_init(&ctx_mgr->handles); +} + +/* + * hl_ctx_mgr_fini - finalize the context manager + * + * @hdev: pointer to device structure + * @ctx_mgr: pointer to context manager structure + * + * This function goes over all the contexts in the manager and frees them. + * It is called when a process closes the FD. + */ +void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *ctx_mgr) +{ + struct hl_ctx *ctx; + struct idr *idp; + u32 id; + + idp = &ctx_mgr->handles; + + idr_for_each_entry(idp, ctx, id) + kref_put(&ctx->refcount, hl_ctx_do_release); + + idr_destroy(&ctx_mgr->handles); + mutex_destroy(&ctx_mgr->lock); +} diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c new file mode 100644 index 000000000..48d3ec8b5 --- /dev/null +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -0,0 +1,1943 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2021 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" +#include "../include/hw_ip/mmu/mmu_general.h" + +#include <linux/pci.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/iommu.h> + +#define MMU_ADDR_BUF_SIZE 40 +#define MMU_ASID_BUF_SIZE 10 +#define MMU_KBUF_SIZE (MMU_ADDR_BUF_SIZE + MMU_ASID_BUF_SIZE) +#define I2C_MAX_TRANSACTION_LEN 8 + +static struct dentry *hl_debug_root; + +static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, + u8 i2c_reg, u8 i2c_len, u64 *val) +{ + struct cpucp_packet pkt; + int rc; + + if (!hl_device_operational(hdev, NULL)) + return -EBUSY; + + if (i2c_len > I2C_MAX_TRANSACTION_LEN) { + dev_err(hdev->dev, "I2C transaction length %u, exceeds maximum of %u\n", + i2c_len, I2C_MAX_TRANSACTION_LEN); + return -EINVAL; + } + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_RD << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.i2c_bus = i2c_bus; + pkt.i2c_addr = i2c_addr; + pkt.i2c_reg = i2c_reg; + pkt.i2c_len = i2c_len; + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, val); + if (rc) + dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc); + + return rc; +} + +static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, + u8 i2c_reg, u8 i2c_len, u64 val) +{ + struct cpucp_packet pkt; + int rc; + + if (!hl_device_operational(hdev, NULL)) + return -EBUSY; + + if (i2c_len > I2C_MAX_TRANSACTION_LEN) { + dev_err(hdev->dev, "I2C transaction length %u, exceeds maximum of %u\n", + i2c_len, I2C_MAX_TRANSACTION_LEN); + return -EINVAL; + } + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_WR << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.i2c_bus = i2c_bus; + pkt.i2c_addr = i2c_addr; + pkt.i2c_reg = i2c_reg; + pkt.i2c_len = i2c_len; + pkt.value = cpu_to_le64(val); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, NULL); + + if (rc) + dev_err(hdev->dev, "Failed to write to I2C, error %d\n", rc); + + return rc; +} + +static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state) +{ + struct cpucp_packet pkt; + int rc; + + if (!hl_device_operational(hdev, NULL)) + return; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_LED_SET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.led_index = cpu_to_le32(led); + pkt.value = cpu_to_le64(state); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, NULL); + + if (rc) + dev_err(hdev->dev, "Failed to 
set LED %d, error %d\n", led, rc); +} + +static int command_buffers_show(struct seq_file *s, void *data) +{ + struct hl_debugfs_entry *entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + struct hl_cb *cb; + bool first = true; + + spin_lock(&dev_entry->cb_spinlock); + + list_for_each_entry(cb, &dev_entry->cb_list, debugfs_list) { + if (first) { + first = false; + seq_puts(s, "\n"); + seq_puts(s, " CB ID CTX ID CB size CB RefCnt mmap? CS counter\n"); + seq_puts(s, "---------------------------------------------------------------\n"); + } + seq_printf(s, + " %03llu %d 0x%08x %d %d %d\n", + cb->buf->handle, cb->ctx->asid, cb->size, + kref_read(&cb->buf->refcount), + atomic_read(&cb->buf->mmap), atomic_read(&cb->cs_cnt)); + } + + spin_unlock(&dev_entry->cb_spinlock); + + if (!first) + seq_puts(s, "\n"); + + return 0; +} + +static int command_submission_show(struct seq_file *s, void *data) +{ + struct hl_debugfs_entry *entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + struct hl_cs *cs; + bool first = true; + + spin_lock(&dev_entry->cs_spinlock); + + list_for_each_entry(cs, &dev_entry->cs_list, debugfs_list) { + if (first) { + first = false; + seq_puts(s, "\n"); + seq_puts(s, " CS ID CS TYPE CTX ASID CS RefCnt Submitted Completed\n"); + seq_puts(s, "----------------------------------------------------------------\n"); + } + seq_printf(s, + " %llu %d %d %d %d %d\n", + cs->sequence, cs->type, cs->ctx->asid, + kref_read(&cs->refcount), + cs->submitted, cs->completed); + } + + spin_unlock(&dev_entry->cs_spinlock); + + if (!first) + seq_puts(s, "\n"); + + return 0; +} + +static int command_submission_jobs_show(struct seq_file *s, void *data) +{ + struct hl_debugfs_entry *entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + struct hl_cs_job *job; + bool first = true; + + spin_lock(&dev_entry->cs_job_spinlock); + + list_for_each_entry(job, &dev_entry->cs_job_list, debugfs_list) { + if (first) { + first = false; + seq_puts(s, "\n"); + seq_puts(s, " JOB ID CS ID CS TYPE CTX ASID JOB RefCnt H/W Queue\n"); + seq_puts(s, "---------------------------------------------------------------\n"); + } + if (job->cs) + seq_printf(s, + " %02d %llu %d %d %d %d\n", + job->id, job->cs->sequence, job->cs->type, + job->cs->ctx->asid, kref_read(&job->refcount), + job->hw_queue_id); + else + seq_printf(s, + " %02d 0 0 %d %d %d\n", + job->id, HL_KERNEL_ASID_ID, + kref_read(&job->refcount), job->hw_queue_id); + } + + spin_unlock(&dev_entry->cs_job_spinlock); + + if (!first) + seq_puts(s, "\n"); + + return 0; +} + +static int userptr_show(struct seq_file *s, void *data) +{ + struct hl_debugfs_entry *entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + struct hl_userptr *userptr; + char dma_dir[4][30] = {"DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", + "DMA_FROM_DEVICE", "DMA_NONE"}; + bool first = true; + + spin_lock(&dev_entry->userptr_spinlock); + + list_for_each_entry(userptr, &dev_entry->userptr_list, debugfs_list) { + if (first) { + first = false; + seq_puts(s, "\n"); + seq_puts(s, " pid user virtual address size dma dir\n"); + seq_puts(s, "----------------------------------------------------------\n"); + } + seq_printf(s, " %-7d 0x%-14llx %-10llu %-30s\n", + userptr->pid, userptr->addr, userptr->size, + dma_dir[userptr->dir]); + } + + spin_unlock(&dev_entry->userptr_spinlock); + + if (!first) + seq_puts(s, "\n"); + + return 0; +} + +static int vm_show(struct seq_file *s, void *data) +{ + struct hl_debugfs_entry 
*entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + struct hl_vm_hw_block_list_node *lnode; + struct hl_ctx *ctx; + struct hl_vm *vm; + struct hl_vm_hash_node *hnode; + struct hl_userptr *userptr; + struct hl_vm_phys_pg_pack *phys_pg_pack = NULL; + struct hl_va_range *va_range; + struct hl_vm_va_block *va_block; + enum vm_type *vm_type; + bool once = true; + u64 j; + int i; + + if (!dev_entry->hdev->mmu_enable) + return 0; + + spin_lock(&dev_entry->ctx_mem_hash_spinlock); + + list_for_each_entry(ctx, &dev_entry->ctx_mem_hash_list, debugfs_list) { + once = false; + seq_puts(s, "\n\n----------------------------------------------------"); + seq_puts(s, "\n----------------------------------------------------\n\n"); + seq_printf(s, "ctx asid: %u\n", ctx->asid); + + seq_puts(s, "\nmappings:\n\n"); + seq_puts(s, " virtual address size handle\n"); + seq_puts(s, "----------------------------------------------------\n"); + mutex_lock(&ctx->mem_hash_lock); + hash_for_each(ctx->mem_hash, i, hnode, node) { + vm_type = hnode->ptr; + + if (*vm_type == VM_TYPE_USERPTR) { + userptr = hnode->ptr; + seq_printf(s, + " 0x%-14llx %-10llu\n", + hnode->vaddr, userptr->size); + } else { + phys_pg_pack = hnode->ptr; + seq_printf(s, + " 0x%-14llx %-10llu %-4u\n", + hnode->vaddr, phys_pg_pack->total_size, + phys_pg_pack->handle); + } + } + mutex_unlock(&ctx->mem_hash_lock); + + if (ctx->asid != HL_KERNEL_ASID_ID && + !list_empty(&ctx->hw_block_mem_list)) { + seq_puts(s, "\nhw_block mappings:\n\n"); + seq_puts(s, + " virtual address block size mapped size HW block id\n"); + seq_puts(s, + "---------------------------------------------------------------\n"); + mutex_lock(&ctx->hw_block_list_lock); + list_for_each_entry(lnode, &ctx->hw_block_mem_list, node) { + seq_printf(s, + " 0x%-14lx %-6u %-6u %-9u\n", + lnode->vaddr, lnode->block_size, lnode->mapped_size, + lnode->id); + } + mutex_unlock(&ctx->hw_block_list_lock); + } + + vm = &ctx->hdev->vm; + spin_lock(&vm->idr_lock); + + if (!idr_is_empty(&vm->phys_pg_pack_handles)) + seq_puts(s, "\n\nallocations:\n"); + + idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_pack, i) { + if (phys_pg_pack->asid != ctx->asid) + continue; + + seq_printf(s, "\nhandle: %u\n", phys_pg_pack->handle); + seq_printf(s, "page size: %u\n\n", + phys_pg_pack->page_size); + seq_puts(s, " physical address\n"); + seq_puts(s, "---------------------\n"); + for (j = 0 ; j < phys_pg_pack->npages ; j++) { + seq_printf(s, " 0x%-14llx\n", + phys_pg_pack->pages[j]); + } + } + spin_unlock(&vm->idr_lock); + + } + + spin_unlock(&dev_entry->ctx_mem_hash_spinlock); + + ctx = hl_get_compute_ctx(dev_entry->hdev); + if (ctx) { + seq_puts(s, "\nVA ranges:\n\n"); + for (i = HL_VA_RANGE_TYPE_HOST ; i < HL_VA_RANGE_TYPE_MAX ; ++i) { + va_range = ctx->va_range[i]; + seq_printf(s, " va_range %d\n", i); + seq_puts(s, "---------------------\n"); + mutex_lock(&va_range->lock); + list_for_each_entry(va_block, &va_range->list, node) { + seq_printf(s, "%#16llx - %#16llx (%#llx)\n", + va_block->start, va_block->end, + va_block->size); + } + mutex_unlock(&va_range->lock); + seq_puts(s, "\n"); + } + hl_ctx_put(ctx); + } + + if (!once) + seq_puts(s, "\n"); + + return 0; +} + +static int userptr_lookup_show(struct seq_file *s, void *data) +{ + struct hl_debugfs_entry *entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + struct scatterlist *sg; + struct hl_userptr *userptr; + bool first = true; + u64 total_npages, npages, sg_start, sg_end; + dma_addr_t dma_addr; + int 
i; + + spin_lock(&dev_entry->userptr_spinlock); + + list_for_each_entry(userptr, &dev_entry->userptr_list, debugfs_list) { + if (dev_entry->userptr_lookup >= userptr->addr && + dev_entry->userptr_lookup < userptr->addr + userptr->size) { + total_npages = 0; + for_each_sgtable_dma_sg(userptr->sgt, sg, i) { + npages = hl_get_sg_info(sg, &dma_addr); + sg_start = userptr->addr + + total_npages * PAGE_SIZE; + sg_end = userptr->addr + + (total_npages + npages) * PAGE_SIZE; + + if (dev_entry->userptr_lookup >= sg_start && + dev_entry->userptr_lookup < sg_end) { + dma_addr += (dev_entry->userptr_lookup - + sg_start); + if (first) { + first = false; + seq_puts(s, "\n"); + seq_puts(s, " user virtual address dma address pid region start region size\n"); + seq_puts(s, "---------------------------------------------------------------------------------------\n"); + } + seq_printf(s, " 0x%-18llx 0x%-16llx %-8u 0x%-16llx %-12llu\n", + dev_entry->userptr_lookup, + (u64)dma_addr, userptr->pid, + userptr->addr, userptr->size); + } + total_npages += npages; + } + } + } + + spin_unlock(&dev_entry->userptr_spinlock); + + if (!first) + seq_puts(s, "\n"); + + return 0; +} + +static ssize_t userptr_lookup_write(struct file *file, const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct seq_file *s = file->private_data; + struct hl_debugfs_entry *entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + ssize_t rc; + u64 value; + + rc = kstrtoull_from_user(buf, count, 16, &value); + if (rc) + return rc; + + dev_entry->userptr_lookup = value; + + return count; +} + +static int mmu_show(struct seq_file *s, void *data) +{ + struct hl_debugfs_entry *entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + struct hl_device *hdev = dev_entry->hdev; + struct hl_ctx *ctx; + struct hl_mmu_hop_info hops_info = {0}; + u64 virt_addr = dev_entry->mmu_addr, phys_addr; + int i; + + if (!hdev->mmu_enable) + return 0; + + if (dev_entry->mmu_asid == HL_KERNEL_ASID_ID) + ctx = hdev->kernel_ctx; + else + ctx = hl_get_compute_ctx(hdev); + + if (!ctx) { + dev_err(hdev->dev, "no ctx available\n"); + return 0; + } + + if (hl_mmu_get_tlb_info(ctx, virt_addr, &hops_info)) { + dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n", + virt_addr); + goto put_ctx; + } + + hl_mmu_va_to_pa(ctx, virt_addr, &phys_addr); + + if (hops_info.scrambled_vaddr && + (dev_entry->mmu_addr != hops_info.scrambled_vaddr)) + seq_printf(s, + "asid: %u, virt_addr: 0x%llx, scrambled virt_addr: 0x%llx,\nphys_addr: 0x%llx, scrambled_phys_addr: 0x%llx\n", + dev_entry->mmu_asid, dev_entry->mmu_addr, + hops_info.scrambled_vaddr, + hops_info.unscrambled_paddr, phys_addr); + else + seq_printf(s, + "asid: %u, virt_addr: 0x%llx, phys_addr: 0x%llx\n", + dev_entry->mmu_asid, dev_entry->mmu_addr, phys_addr); + + for (i = 0 ; i < hops_info.used_hops ; i++) { + seq_printf(s, "hop%d_addr: 0x%llx\n", + i, hops_info.hop_info[i].hop_addr); + seq_printf(s, "hop%d_pte_addr: 0x%llx\n", + i, hops_info.hop_info[i].hop_pte_addr); + seq_printf(s, "hop%d_pte: 0x%llx\n", + i, hops_info.hop_info[i].hop_pte_val); + } + +put_ctx: + if (dev_entry->mmu_asid != HL_KERNEL_ASID_ID) + hl_ctx_put(ctx); + + return 0; +} + +static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct seq_file *s = file->private_data; + struct hl_debugfs_entry *entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + struct hl_device *hdev = dev_entry->hdev; + char 
kbuf[MMU_KBUF_SIZE]; + char *c; + ssize_t rc; + + if (!hdev->mmu_enable) + return count; + + if (count > sizeof(kbuf) - 1) + goto err; + if (copy_from_user(kbuf, buf, count)) + goto err; + kbuf[count] = 0; + + c = strchr(kbuf, ' '); + if (!c) + goto err; + *c = '\0'; + + rc = kstrtouint(kbuf, 10, &dev_entry->mmu_asid); + if (rc) + goto err; + + if (strncmp(c+1, "0x", 2)) + goto err; + rc = kstrtoull(c+3, 16, &dev_entry->mmu_addr); + if (rc) + goto err; + + return count; + +err: + dev_err(hdev->dev, "usage: echo <asid> <0xaddr> > mmu\n"); + + return -EINVAL; +} + +static int mmu_ack_error(struct seq_file *s, void *data) +{ + struct hl_debugfs_entry *entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + struct hl_device *hdev = dev_entry->hdev; + int rc; + + if (!hdev->mmu_enable) + return 0; + + if (!dev_entry->mmu_cap_mask) { + dev_err(hdev->dev, "mmu_cap_mask is not set\n"); + goto err; + } + + rc = hdev->asic_funcs->ack_mmu_errors(hdev, dev_entry->mmu_cap_mask); + if (rc) + goto err; + + return 0; +err: + return -EINVAL; +} + +static ssize_t mmu_ack_error_value_write(struct file *file, + const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct seq_file *s = file->private_data; + struct hl_debugfs_entry *entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + struct hl_device *hdev = dev_entry->hdev; + char kbuf[MMU_KBUF_SIZE]; + ssize_t rc; + + if (!hdev->mmu_enable) + return count; + + if (count > sizeof(kbuf) - 1) + goto err; + + if (copy_from_user(kbuf, buf, count)) + goto err; + + kbuf[count] = 0; + + if (strncmp(kbuf, "0x", 2)) + goto err; + + rc = kstrtoull(kbuf, 16, &dev_entry->mmu_cap_mask); + if (rc) + goto err; + + return count; +err: + dev_err(hdev->dev, "usage: echo <0xmmu_cap_mask > > mmu_error\n"); + + return -EINVAL; +} + +static int engines_show(struct seq_file *s, void *data) +{ + struct hl_debugfs_entry *entry = s->private; + struct hl_dbg_device_entry *dev_entry = entry->dev_entry; + struct hl_device *hdev = dev_entry->hdev; + struct engines_data eng_data; + + if (hdev->reset_info.in_reset) { + dev_warn_ratelimited(hdev->dev, + "Can't check device idle during reset\n"); + return 0; + } + + eng_data.actual_size = 0; + eng_data.allocated_buf_size = HL_ENGINES_DATA_MAX_SIZE; + eng_data.buf = vmalloc(eng_data.allocated_buf_size); + if (!eng_data.buf) + return -ENOMEM; + + hdev->asic_funcs->is_device_idle(hdev, NULL, 0, &eng_data); + + if (eng_data.actual_size > eng_data.allocated_buf_size) { + dev_err(hdev->dev, + "Engines data size (%d Bytes) is bigger than allocated size (%u Bytes)\n", + eng_data.actual_size, eng_data.allocated_buf_size); + vfree(eng_data.buf); + return -ENOMEM; + } + + seq_write(s, eng_data.buf, eng_data.actual_size); + + vfree(eng_data.buf); + + return 0; +} + +static ssize_t hl_memory_scrub(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u64 val = hdev->memory_scrub_val; + int rc; + + if (!hl_device_operational(hdev, NULL)) { + dev_warn_ratelimited(hdev->dev, "Can't scrub memory, device is not operational\n"); + return -EIO; + } + + mutex_lock(&hdev->fpriv_list_lock); + if (hdev->is_compute_ctx_active) { + mutex_unlock(&hdev->fpriv_list_lock); + dev_err(hdev->dev, "can't scrub dram, context exist\n"); + return -EBUSY; + } + hdev->is_in_dram_scrub = true; + mutex_unlock(&hdev->fpriv_list_lock); + + rc = hdev->asic_funcs->scrub_device_dram(hdev, val); + + 
mutex_lock(&hdev->fpriv_list_lock); + hdev->is_in_dram_scrub = false; + mutex_unlock(&hdev->fpriv_list_lock); + + if (rc) + return rc; + return count; +} + +static bool hl_is_device_va(struct hl_device *hdev, u64 addr) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + + if (!hdev->mmu_enable) + goto out; + + if (prop->dram_supports_virtual_memory && + (addr >= prop->dmmu.start_addr && addr < prop->dmmu.end_addr)) + return true; + + if (addr >= prop->pmmu.start_addr && + addr < prop->pmmu.end_addr) + return true; + + if (addr >= prop->pmmu_huge.start_addr && + addr < prop->pmmu_huge.end_addr) + return true; +out: + return false; +} + +static bool hl_is_device_internal_memory_va(struct hl_device *hdev, u64 addr, + u32 size) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 dram_start_addr, dram_end_addr; + + if (!hdev->mmu_enable) + return false; + + if (prop->dram_supports_virtual_memory) { + dram_start_addr = prop->dmmu.start_addr; + dram_end_addr = prop->dmmu.end_addr; + } else { + dram_start_addr = prop->dram_base_address; + dram_end_addr = prop->dram_end_address; + } + + if (hl_mem_area_inside_range(addr, size, dram_start_addr, + dram_end_addr)) + return true; + + if (hl_mem_area_inside_range(addr, size, prop->sram_base_address, + prop->sram_end_address)) + return true; + + return false; +} + +static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr, u32 size, + u64 *phys_addr) +{ + struct hl_vm_phys_pg_pack *phys_pg_pack; + struct hl_ctx *ctx; + struct hl_vm_hash_node *hnode; + u64 end_address, range_size; + struct hl_userptr *userptr; + enum vm_type *vm_type; + bool valid = false; + int i, rc = 0; + + ctx = hl_get_compute_ctx(hdev); + + if (!ctx) { + dev_err(hdev->dev, "no ctx available\n"); + return -EINVAL; + } + + /* Verify address is mapped */ + mutex_lock(&ctx->mem_hash_lock); + hash_for_each(ctx->mem_hash, i, hnode, node) { + vm_type = hnode->ptr; + + if (*vm_type == VM_TYPE_USERPTR) { + userptr = hnode->ptr; + range_size = userptr->size; + } else { + phys_pg_pack = hnode->ptr; + range_size = phys_pg_pack->total_size; + } + + end_address = virt_addr + size; + if ((virt_addr >= hnode->vaddr) && + (end_address <= hnode->vaddr + range_size)) { + valid = true; + break; + } + } + mutex_unlock(&ctx->mem_hash_lock); + + if (!valid) { + dev_err(hdev->dev, + "virt addr 0x%llx is not mapped\n", + virt_addr); + rc = -EINVAL; + goto put_ctx; + } + + rc = hl_mmu_va_to_pa(ctx, virt_addr, phys_addr); + if (rc) { + dev_err(hdev->dev, + "virt addr 0x%llx is not mapped to phys addr\n", + virt_addr); + rc = -EINVAL; + } + +put_ctx: + hl_ctx_put(ctx); + + return rc; +} + +static int hl_access_dev_mem_by_region(struct hl_device *hdev, u64 addr, + u64 *val, enum debugfs_access_type acc_type, bool *found) +{ + size_t acc_size = (acc_type == DEBUGFS_READ64 || acc_type == DEBUGFS_WRITE64) ? 
+ sizeof(u64) : sizeof(u32); + struct pci_mem_region *mem_reg; + int i; + + for (i = 0; i < PCI_REGION_NUMBER; i++) { + mem_reg = &hdev->pci_mem_region[i]; + if (!mem_reg->used) + continue; + if (addr >= mem_reg->region_base && + addr <= mem_reg->region_base + mem_reg->region_size - acc_size) { + *found = true; + return hdev->asic_funcs->access_dev_mem(hdev, i, addr, val, acc_type); + } + } + return 0; +} + +static void hl_access_host_mem(struct hl_device *hdev, u64 addr, u64 *val, + enum debugfs_access_type acc_type) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 offset = prop->device_dma_offset_for_host_access; + + switch (acc_type) { + case DEBUGFS_READ32: + *val = *(u32 *) phys_to_virt(addr - offset); + break; + case DEBUGFS_WRITE32: + *(u32 *) phys_to_virt(addr - offset) = *val; + break; + case DEBUGFS_READ64: + *val = *(u64 *) phys_to_virt(addr - offset); + break; + case DEBUGFS_WRITE64: + *(u64 *) phys_to_virt(addr - offset) = *val; + break; + default: + dev_err(hdev->dev, "hostmem access-type %d id not supported\n", acc_type); + break; + } +} + +static int hl_access_mem(struct hl_device *hdev, u64 addr, u64 *val, + enum debugfs_access_type acc_type) +{ + size_t acc_size = (acc_type == DEBUGFS_READ64 || acc_type == DEBUGFS_WRITE64) ? + sizeof(u64) : sizeof(u32); + u64 host_start = hdev->asic_prop.host_base_address; + u64 host_end = hdev->asic_prop.host_end_address; + bool user_address, found = false; + int rc; + + user_address = hl_is_device_va(hdev, addr); + if (user_address) { + rc = device_va_to_pa(hdev, addr, acc_size, &addr); + if (rc) + return rc; + } + + rc = hl_access_dev_mem_by_region(hdev, addr, val, acc_type, &found); + if (rc) { + dev_err(hdev->dev, + "Failed reading addr %#llx from dev mem (%d)\n", + addr, rc); + return rc; + } + + if (found) + return 0; + + if (!user_address || device_iommu_mapped(&hdev->pdev->dev)) { + rc = -EINVAL; + goto err; + } + + if (addr >= host_start && addr <= host_end - acc_size) { + hl_access_host_mem(hdev, addr, val, acc_type); + } else { + rc = -EINVAL; + goto err; + } + + return 0; +err: + dev_err(hdev->dev, "invalid addr %#llx\n", addr); + return rc; +} + +static ssize_t hl_data_read32(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u64 value64, addr = entry->addr; + char tmp_buf[32]; + ssize_t rc; + u32 val; + + if (hdev->reset_info.in_reset) { + dev_warn_ratelimited(hdev->dev, "Can't read during reset\n"); + return 0; + } + + if (*ppos) + return 0; + + rc = hl_access_mem(hdev, addr, &value64, DEBUGFS_READ32); + if (rc) + return rc; + + val = value64; /* downcast back to 32 */ + + sprintf(tmp_buf, "0x%08x\n", val); + return simple_read_from_buffer(buf, count, ppos, tmp_buf, + strlen(tmp_buf)); +} + +static ssize_t hl_data_write32(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u64 value64, addr = entry->addr; + u32 value; + ssize_t rc; + + if (hdev->reset_info.in_reset) { + dev_warn_ratelimited(hdev->dev, "Can't write during reset\n"); + return 0; + } + + rc = kstrtouint_from_user(buf, count, 16, &value); + if (rc) + return rc; + + value64 = value; + rc = hl_access_mem(hdev, addr, &value64, DEBUGFS_WRITE32); + if (rc) + return rc; + + return count; +} + +static ssize_t hl_data_read64(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + struct 
hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u64 addr = entry->addr; + char tmp_buf[32]; + ssize_t rc; + u64 val; + + if (hdev->reset_info.in_reset) { + dev_warn_ratelimited(hdev->dev, "Can't read during reset\n"); + return 0; + } + + if (*ppos) + return 0; + + rc = hl_access_mem(hdev, addr, &val, DEBUGFS_READ64); + if (rc) + return rc; + + sprintf(tmp_buf, "0x%016llx\n", val); + return simple_read_from_buffer(buf, count, ppos, tmp_buf, + strlen(tmp_buf)); +} + +static ssize_t hl_data_write64(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u64 addr = entry->addr; + u64 value; + ssize_t rc; + + if (hdev->reset_info.in_reset) { + dev_warn_ratelimited(hdev->dev, "Can't write during reset\n"); + return 0; + } + + rc = kstrtoull_from_user(buf, count, 16, &value); + if (rc) + return rc; + + rc = hl_access_mem(hdev, addr, &value, DEBUGFS_WRITE64); + if (rc) + return rc; + + return count; +} + +static ssize_t hl_dma_size_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u64 addr = entry->addr; + ssize_t rc; + u32 size; + + if (hdev->reset_info.in_reset) { + dev_warn_ratelimited(hdev->dev, "Can't DMA during reset\n"); + return 0; + } + rc = kstrtouint_from_user(buf, count, 16, &size); + if (rc) + return rc; + + if (!size) { + dev_err(hdev->dev, "DMA read failed. size can't be 0\n"); + return -EINVAL; + } + + if (size > SZ_128M) { + dev_err(hdev->dev, + "DMA read failed. size can't be larger than 128MB\n"); + return -EINVAL; + } + + if (!hl_is_device_internal_memory_va(hdev, addr, size)) { + dev_err(hdev->dev, + "DMA read failed. 
Invalid 0x%010llx + 0x%08x\n", + addr, size); + return -EINVAL; + } + + /* Free the previous allocation, if there was any */ + entry->data_dma_blob_desc.size = 0; + vfree(entry->data_dma_blob_desc.data); + + entry->data_dma_blob_desc.data = vmalloc(size); + if (!entry->data_dma_blob_desc.data) + return -ENOMEM; + + rc = hdev->asic_funcs->debugfs_read_dma(hdev, addr, size, + entry->data_dma_blob_desc.data); + if (rc) { + dev_err(hdev->dev, "Failed to DMA from 0x%010llx\n", addr); + vfree(entry->data_dma_blob_desc.data); + entry->data_dma_blob_desc.data = NULL; + return -EIO; + } + + entry->data_dma_blob_desc.size = size; + + return count; +} + +static ssize_t hl_monitor_dump_trigger(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u32 size, trig; + ssize_t rc; + + if (hdev->reset_info.in_reset) { + dev_warn_ratelimited(hdev->dev, "Can't dump monitors during reset\n"); + return 0; + } + rc = kstrtouint_from_user(buf, count, 10, &trig); + if (rc) + return rc; + + if (trig != 1) { + dev_err(hdev->dev, "Must write 1 to trigger monitor dump\n"); + return -EINVAL; + } + + size = sizeof(struct cpucp_monitor_dump); + + /* Free the previous allocation, if there was any */ + entry->mon_dump_blob_desc.size = 0; + vfree(entry->mon_dump_blob_desc.data); + + entry->mon_dump_blob_desc.data = vmalloc(size); + if (!entry->mon_dump_blob_desc.data) + return -ENOMEM; + + rc = hdev->asic_funcs->get_monitor_dump(hdev, entry->mon_dump_blob_desc.data); + if (rc) { + dev_err(hdev->dev, "Failed to dump monitors\n"); + vfree(entry->mon_dump_blob_desc.data); + entry->mon_dump_blob_desc.data = NULL; + return -EIO; + } + + entry->mon_dump_blob_desc.size = size; + + return count; +} + +static ssize_t hl_get_power_state(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + char tmp_buf[200]; + int i; + + if (*ppos) + return 0; + + if (hdev->pdev->current_state == PCI_D0) + i = 1; + else if (hdev->pdev->current_state == PCI_D3hot) + i = 2; + else + i = 3; + + sprintf(tmp_buf, + "current power state: %d\n1 - D0\n2 - D3hot\n3 - Unknown\n", i); + return simple_read_from_buffer(buf, count, ppos, tmp_buf, + strlen(tmp_buf)); +} + +static ssize_t hl_set_power_state(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u32 value; + ssize_t rc; + + rc = kstrtouint_from_user(buf, count, 10, &value); + if (rc) + return rc; + + if (value == 1) { + pci_set_power_state(hdev->pdev, PCI_D0); + pci_restore_state(hdev->pdev); + rc = pci_enable_device(hdev->pdev); + if (rc < 0) + return rc; + } else if (value == 2) { + pci_save_state(hdev->pdev); + pci_disable_device(hdev->pdev); + pci_set_power_state(hdev->pdev, PCI_D3hot); + } else { + dev_dbg(hdev->dev, "invalid power state value %u\n", value); + return -EINVAL; + } + + return count; +} + +static ssize_t hl_i2c_data_read(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + char tmp_buf[32]; + u64 val; + ssize_t rc; + + if (*ppos) + return 0; + + rc = hl_debugfs_i2c_read(hdev, entry->i2c_bus, entry->i2c_addr, + entry->i2c_reg, entry->i2c_len, &val); + if (rc) { + dev_err(hdev->dev, + "Failed to read from 
I2C bus %d, addr %d, reg %d, len %d\n", + entry->i2c_bus, entry->i2c_addr, entry->i2c_reg, entry->i2c_len); + return rc; + } + + sprintf(tmp_buf, "%#02llx\n", val); + rc = simple_read_from_buffer(buf, count, ppos, tmp_buf, + strlen(tmp_buf)); + + return rc; +} + +static ssize_t hl_i2c_data_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u64 value; + ssize_t rc; + + rc = kstrtou64_from_user(buf, count, 16, &value); + if (rc) + return rc; + + rc = hl_debugfs_i2c_write(hdev, entry->i2c_bus, entry->i2c_addr, + entry->i2c_reg, entry->i2c_len, value); + if (rc) { + dev_err(hdev->dev, + "Failed to write %#02llx to I2C bus %d, addr %d, reg %d, len %d\n", + value, entry->i2c_bus, entry->i2c_addr, entry->i2c_reg, entry->i2c_len); + return rc; + } + + return count; +} + +static ssize_t hl_led0_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u32 value; + ssize_t rc; + + rc = kstrtouint_from_user(buf, count, 10, &value); + if (rc) + return rc; + + value = value ? 1 : 0; + + hl_debugfs_led_set(hdev, 0, value); + + return count; +} + +static ssize_t hl_led1_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u32 value; + ssize_t rc; + + rc = kstrtouint_from_user(buf, count, 10, &value); + if (rc) + return rc; + + value = value ? 1 : 0; + + hl_debugfs_led_set(hdev, 1, value); + + return count; +} + +static ssize_t hl_led2_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u32 value; + ssize_t rc; + + rc = kstrtouint_from_user(buf, count, 10, &value); + if (rc) + return rc; + + value = value ? 
1 : 0; + + hl_debugfs_led_set(hdev, 2, value); + + return count; +} + +static ssize_t hl_device_read(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + static const char *help = + "Valid values: disable, enable, suspend, resume, cpu_timeout\n"; + return simple_read_from_buffer(buf, count, ppos, help, strlen(help)); +} + +static ssize_t hl_device_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + char data[30] = {0}; + + /* don't allow partial writes */ + if (*ppos != 0) + return 0; + + simple_write_to_buffer(data, 29, ppos, buf, count); + + if (strncmp("disable", data, strlen("disable")) == 0) { + hdev->disabled = true; + } else if (strncmp("enable", data, strlen("enable")) == 0) { + hdev->disabled = false; + } else if (strncmp("suspend", data, strlen("suspend")) == 0) { + hdev->asic_funcs->suspend(hdev); + } else if (strncmp("resume", data, strlen("resume")) == 0) { + hdev->asic_funcs->resume(hdev); + } else if (strncmp("cpu_timeout", data, strlen("cpu_timeout")) == 0) { + hdev->device_cpu_disabled = true; + } else { + dev_err(hdev->dev, + "Valid values: disable, enable, suspend, resume, cpu_timeout\n"); + count = -EINVAL; + } + + return count; +} + +static ssize_t hl_clk_gate_read(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + return 0; +} + +static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + return count; +} + +static ssize_t hl_stop_on_err_read(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + char tmp_buf[200]; + ssize_t rc; + + if (!hdev->asic_prop.configurable_stop_on_err) + return -EOPNOTSUPP; + + if (*ppos) + return 0; + + sprintf(tmp_buf, "%d\n", hdev->stop_on_err); + rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf, + strlen(tmp_buf) + 1); + + return rc; +} + +static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u32 value; + ssize_t rc; + + if (!hdev->asic_prop.configurable_stop_on_err) + return -EOPNOTSUPP; + + if (hdev->reset_info.in_reset) { + dev_warn_ratelimited(hdev->dev, + "Can't change stop on error during reset\n"); + return 0; + } + + rc = kstrtouint_from_user(buf, count, 10, &value); + if (rc) + return rc; + + hdev->stop_on_err = value ? 
1 : 0; + + hl_device_reset(hdev, 0); + + return count; +} + +static ssize_t hl_security_violations_read(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + + hdev->asic_funcs->ack_protection_bits_errors(hdev); + + return 0; +} + +static ssize_t hl_state_dump_read(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + ssize_t rc; + + down_read(&entry->state_dump_sem); + if (!entry->state_dump[entry->state_dump_head]) + rc = 0; + else + rc = simple_read_from_buffer( + buf, count, ppos, + entry->state_dump[entry->state_dump_head], + strlen(entry->state_dump[entry->state_dump_head])); + up_read(&entry->state_dump_sem); + + return rc; +} + +static ssize_t hl_state_dump_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + ssize_t rc; + u32 size; + int i; + + rc = kstrtouint_from_user(buf, count, 10, &size); + if (rc) + return rc; + + if (size <= 0 || size >= ARRAY_SIZE(entry->state_dump)) { + dev_err(hdev->dev, "Invalid number of dumps to skip\n"); + return -EINVAL; + } + + if (entry->state_dump[entry->state_dump_head]) { + down_write(&entry->state_dump_sem); + for (i = 0; i < size; ++i) { + vfree(entry->state_dump[entry->state_dump_head]); + entry->state_dump[entry->state_dump_head] = NULL; + if (entry->state_dump_head > 0) + entry->state_dump_head--; + else + entry->state_dump_head = + ARRAY_SIZE(entry->state_dump) - 1; + } + up_write(&entry->state_dump_sem); + } + + return count; +} + +static ssize_t hl_timeout_locked_read(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + char tmp_buf[200]; + ssize_t rc; + + if (*ppos) + return 0; + + sprintf(tmp_buf, "%d\n", + jiffies_to_msecs(hdev->timeout_jiffies) / 1000); + rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf, + strlen(tmp_buf) + 1); + + return rc; +} + +static ssize_t hl_timeout_locked_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u32 value; + ssize_t rc; + + rc = kstrtouint_from_user(buf, count, 10, &value); + if (rc) + return rc; + + if (value) + hdev->timeout_jiffies = msecs_to_jiffies(value * 1000); + else + hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT; + + return count; +} + +static ssize_t hl_check_razwi_happened(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + + hdev->asic_funcs->check_if_razwi_happened(hdev); + + return 0; +} + +static const struct file_operations hl_mem_scrub_fops = { + .owner = THIS_MODULE, + .write = hl_memory_scrub, +}; + +static const struct file_operations hl_data32b_fops = { + .owner = THIS_MODULE, + .read = hl_data_read32, + .write = hl_data_write32 +}; + +static const struct file_operations hl_data64b_fops = { + .owner = THIS_MODULE, + .read = hl_data_read64, + .write = hl_data_write64 +}; + +static const struct file_operations hl_dma_size_fops = { + .owner = THIS_MODULE, + .write = hl_dma_size_write +}; + +static const struct file_operations hl_monitor_dump_fops = { + .owner = 
THIS_MODULE, + .write = hl_monitor_dump_trigger +}; + +static const struct file_operations hl_i2c_data_fops = { + .owner = THIS_MODULE, + .read = hl_i2c_data_read, + .write = hl_i2c_data_write +}; + +static const struct file_operations hl_power_fops = { + .owner = THIS_MODULE, + .read = hl_get_power_state, + .write = hl_set_power_state +}; + +static const struct file_operations hl_led0_fops = { + .owner = THIS_MODULE, + .write = hl_led0_write +}; + +static const struct file_operations hl_led1_fops = { + .owner = THIS_MODULE, + .write = hl_led1_write +}; + +static const struct file_operations hl_led2_fops = { + .owner = THIS_MODULE, + .write = hl_led2_write +}; + +static const struct file_operations hl_device_fops = { + .owner = THIS_MODULE, + .read = hl_device_read, + .write = hl_device_write +}; + +static const struct file_operations hl_clk_gate_fops = { + .owner = THIS_MODULE, + .read = hl_clk_gate_read, + .write = hl_clk_gate_write +}; + +static const struct file_operations hl_stop_on_err_fops = { + .owner = THIS_MODULE, + .read = hl_stop_on_err_read, + .write = hl_stop_on_err_write +}; + +static const struct file_operations hl_security_violations_fops = { + .owner = THIS_MODULE, + .read = hl_security_violations_read +}; + +static const struct file_operations hl_state_dump_fops = { + .owner = THIS_MODULE, + .read = hl_state_dump_read, + .write = hl_state_dump_write +}; + +static const struct file_operations hl_timeout_locked_fops = { + .owner = THIS_MODULE, + .read = hl_timeout_locked_read, + .write = hl_timeout_locked_write +}; + +static const struct file_operations hl_razwi_check_fops = { + .owner = THIS_MODULE, + .read = hl_check_razwi_happened +}; + +static const struct hl_info_list hl_debugfs_list[] = { + {"command_buffers", command_buffers_show, NULL}, + {"command_submission", command_submission_show, NULL}, + {"command_submission_jobs", command_submission_jobs_show, NULL}, + {"userptr", userptr_show, NULL}, + {"vm", vm_show, NULL}, + {"userptr_lookup", userptr_lookup_show, userptr_lookup_write}, + {"mmu", mmu_show, mmu_asid_va_write}, + {"mmu_error", mmu_ack_error, mmu_ack_error_value_write}, + {"engines", engines_show, NULL}, +}; + +static int hl_debugfs_open(struct inode *inode, struct file *file) +{ + struct hl_debugfs_entry *node = inode->i_private; + + return single_open(file, node->info_ent->show, node); +} + +static ssize_t hl_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct hl_debugfs_entry *node = file->f_inode->i_private; + + if (node->info_ent->write) + return node->info_ent->write(file, buf, count, f_pos); + else + return -EINVAL; + +} + +static const struct file_operations hl_debugfs_fops = { + .owner = THIS_MODULE, + .open = hl_debugfs_open, + .read = seq_read, + .write = hl_debugfs_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static void add_secured_nodes(struct hl_dbg_device_entry *dev_entry) +{ + debugfs_create_u8("i2c_bus", + 0644, + dev_entry->root, + &dev_entry->i2c_bus); + + debugfs_create_u8("i2c_addr", + 0644, + dev_entry->root, + &dev_entry->i2c_addr); + + debugfs_create_u8("i2c_reg", + 0644, + dev_entry->root, + &dev_entry->i2c_reg); + + debugfs_create_u8("i2c_len", + 0644, + dev_entry->root, + &dev_entry->i2c_len); + + debugfs_create_file("i2c_data", + 0644, + dev_entry->root, + dev_entry, + &hl_i2c_data_fops); + + debugfs_create_file("led0", + 0200, + dev_entry->root, + dev_entry, + &hl_led0_fops); + + debugfs_create_file("led1", + 0200, + dev_entry->root, + dev_entry, + 
&hl_led1_fops); + + debugfs_create_file("led2", + 0200, + dev_entry->root, + dev_entry, + &hl_led2_fops); +} + +void hl_debugfs_add_device(struct hl_device *hdev) +{ + struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; + int count = ARRAY_SIZE(hl_debugfs_list); + struct hl_debugfs_entry *entry; + int i; + + dev_entry->hdev = hdev; + dev_entry->entry_arr = kmalloc_array(count, + sizeof(struct hl_debugfs_entry), + GFP_KERNEL); + if (!dev_entry->entry_arr) + return; + + dev_entry->data_dma_blob_desc.size = 0; + dev_entry->data_dma_blob_desc.data = NULL; + dev_entry->mon_dump_blob_desc.size = 0; + dev_entry->mon_dump_blob_desc.data = NULL; + + INIT_LIST_HEAD(&dev_entry->file_list); + INIT_LIST_HEAD(&dev_entry->cb_list); + INIT_LIST_HEAD(&dev_entry->cs_list); + INIT_LIST_HEAD(&dev_entry->cs_job_list); + INIT_LIST_HEAD(&dev_entry->userptr_list); + INIT_LIST_HEAD(&dev_entry->ctx_mem_hash_list); + mutex_init(&dev_entry->file_mutex); + init_rwsem(&dev_entry->state_dump_sem); + spin_lock_init(&dev_entry->cb_spinlock); + spin_lock_init(&dev_entry->cs_spinlock); + spin_lock_init(&dev_entry->cs_job_spinlock); + spin_lock_init(&dev_entry->userptr_spinlock); + spin_lock_init(&dev_entry->ctx_mem_hash_spinlock); + + dev_entry->root = debugfs_create_dir(dev_name(hdev->dev), + hl_debug_root); + + debugfs_create_x64("memory_scrub_val", + 0644, + dev_entry->root, + &hdev->memory_scrub_val); + + debugfs_create_file("memory_scrub", + 0200, + dev_entry->root, + dev_entry, + &hl_mem_scrub_fops); + + debugfs_create_x64("addr", + 0644, + dev_entry->root, + &dev_entry->addr); + + debugfs_create_file("data32", + 0644, + dev_entry->root, + dev_entry, + &hl_data32b_fops); + + debugfs_create_file("data64", + 0644, + dev_entry->root, + dev_entry, + &hl_data64b_fops); + + debugfs_create_file("set_power_state", + 0200, + dev_entry->root, + dev_entry, + &hl_power_fops); + + debugfs_create_file("device", + 0200, + dev_entry->root, + dev_entry, + &hl_device_fops); + + debugfs_create_file("clk_gate", + 0200, + dev_entry->root, + dev_entry, + &hl_clk_gate_fops); + + debugfs_create_file("stop_on_err", + 0644, + dev_entry->root, + dev_entry, + &hl_stop_on_err_fops); + + debugfs_create_file("dump_security_violations", + 0644, + dev_entry->root, + dev_entry, + &hl_security_violations_fops); + + debugfs_create_file("dump_razwi_events", + 0644, + dev_entry->root, + dev_entry, + &hl_razwi_check_fops); + + debugfs_create_file("dma_size", + 0200, + dev_entry->root, + dev_entry, + &hl_dma_size_fops); + + debugfs_create_blob("data_dma", + 0400, + dev_entry->root, + &dev_entry->data_dma_blob_desc); + + debugfs_create_file("monitor_dump_trig", + 0200, + dev_entry->root, + dev_entry, + &hl_monitor_dump_fops); + + debugfs_create_blob("monitor_dump", + 0400, + dev_entry->root, + &dev_entry->mon_dump_blob_desc); + + debugfs_create_x8("skip_reset_on_timeout", + 0644, + dev_entry->root, + &hdev->reset_info.skip_reset_on_timeout); + + debugfs_create_file("state_dump", + 0600, + dev_entry->root, + dev_entry, + &hl_state_dump_fops); + + debugfs_create_file("timeout_locked", + 0644, + dev_entry->root, + dev_entry, + &hl_timeout_locked_fops); + + for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { + debugfs_create_file(hl_debugfs_list[i].name, + 0444, + dev_entry->root, + entry, + &hl_debugfs_fops); + entry->info_ent = &hl_debugfs_list[i]; + entry->dev_entry = dev_entry; + } + + if (!hdev->asic_prop.fw_security_enabled) + add_secured_nodes(dev_entry); +} + +void hl_debugfs_remove_device(struct hl_device *hdev) +{ + struct 
hl_dbg_device_entry *entry = &hdev->hl_debugfs; + int i; + + debugfs_remove_recursive(entry->root); + + mutex_destroy(&entry->file_mutex); + + vfree(entry->data_dma_blob_desc.data); + vfree(entry->mon_dump_blob_desc.data); + + for (i = 0; i < ARRAY_SIZE(entry->state_dump); ++i) + vfree(entry->state_dump[i]); + + kfree(entry->entry_arr); +} + +void hl_debugfs_add_file(struct hl_fpriv *hpriv) +{ + struct hl_dbg_device_entry *dev_entry = &hpriv->hdev->hl_debugfs; + + mutex_lock(&dev_entry->file_mutex); + list_add(&hpriv->debugfs_list, &dev_entry->file_list); + mutex_unlock(&dev_entry->file_mutex); +} + +void hl_debugfs_remove_file(struct hl_fpriv *hpriv) +{ + struct hl_dbg_device_entry *dev_entry = &hpriv->hdev->hl_debugfs; + + mutex_lock(&dev_entry->file_mutex); + list_del(&hpriv->debugfs_list); + mutex_unlock(&dev_entry->file_mutex); +} + +void hl_debugfs_add_cb(struct hl_cb *cb) +{ + struct hl_dbg_device_entry *dev_entry = &cb->hdev->hl_debugfs; + + spin_lock(&dev_entry->cb_spinlock); + list_add(&cb->debugfs_list, &dev_entry->cb_list); + spin_unlock(&dev_entry->cb_spinlock); +} + +void hl_debugfs_remove_cb(struct hl_cb *cb) +{ + struct hl_dbg_device_entry *dev_entry = &cb->hdev->hl_debugfs; + + spin_lock(&dev_entry->cb_spinlock); + list_del(&cb->debugfs_list); + spin_unlock(&dev_entry->cb_spinlock); +} + +void hl_debugfs_add_cs(struct hl_cs *cs) +{ + struct hl_dbg_device_entry *dev_entry = &cs->ctx->hdev->hl_debugfs; + + spin_lock(&dev_entry->cs_spinlock); + list_add(&cs->debugfs_list, &dev_entry->cs_list); + spin_unlock(&dev_entry->cs_spinlock); +} + +void hl_debugfs_remove_cs(struct hl_cs *cs) +{ + struct hl_dbg_device_entry *dev_entry = &cs->ctx->hdev->hl_debugfs; + + spin_lock(&dev_entry->cs_spinlock); + list_del(&cs->debugfs_list); + spin_unlock(&dev_entry->cs_spinlock); +} + +void hl_debugfs_add_job(struct hl_device *hdev, struct hl_cs_job *job) +{ + struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; + + spin_lock(&dev_entry->cs_job_spinlock); + list_add(&job->debugfs_list, &dev_entry->cs_job_list); + spin_unlock(&dev_entry->cs_job_spinlock); +} + +void hl_debugfs_remove_job(struct hl_device *hdev, struct hl_cs_job *job) +{ + struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; + + spin_lock(&dev_entry->cs_job_spinlock); + list_del(&job->debugfs_list); + spin_unlock(&dev_entry->cs_job_spinlock); +} + +void hl_debugfs_add_userptr(struct hl_device *hdev, struct hl_userptr *userptr) +{ + struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; + + spin_lock(&dev_entry->userptr_spinlock); + list_add(&userptr->debugfs_list, &dev_entry->userptr_list); + spin_unlock(&dev_entry->userptr_spinlock); +} + +void hl_debugfs_remove_userptr(struct hl_device *hdev, + struct hl_userptr *userptr) +{ + struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; + + spin_lock(&dev_entry->userptr_spinlock); + list_del(&userptr->debugfs_list); + spin_unlock(&dev_entry->userptr_spinlock); +} + +void hl_debugfs_add_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx) +{ + struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; + + spin_lock(&dev_entry->ctx_mem_hash_spinlock); + list_add(&ctx->debugfs_list, &dev_entry->ctx_mem_hash_list); + spin_unlock(&dev_entry->ctx_mem_hash_spinlock); +} + +void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx) +{ + struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; + + spin_lock(&dev_entry->ctx_mem_hash_spinlock); + list_del(&ctx->debugfs_list); + spin_unlock(&dev_entry->ctx_mem_hash_spinlock); +} + +/** + * 
hl_debugfs_set_state_dump - register state dump making it accessible via + * debugfs + * @hdev: pointer to the device structure + * @data: the actual dump data + * @length: the length of the data + */ +void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data, + unsigned long length) +{ + struct hl_dbg_device_entry *dev_entry = &hdev->hl_debugfs; + + down_write(&dev_entry->state_dump_sem); + + dev_entry->state_dump_head = (dev_entry->state_dump_head + 1) % + ARRAY_SIZE(dev_entry->state_dump); + vfree(dev_entry->state_dump[dev_entry->state_dump_head]); + dev_entry->state_dump[dev_entry->state_dump_head] = data; + + up_write(&dev_entry->state_dump_sem); +} + +void __init hl_debugfs_init(void) +{ + hl_debug_root = debugfs_create_dir("habanalabs", NULL); +} + +void hl_debugfs_fini(void) +{ + debugfs_remove_recursive(hl_debug_root); +} diff --git a/drivers/misc/habanalabs/common/decoder.c b/drivers/misc/habanalabs/common/decoder.c new file mode 100644 index 000000000..2aab14d74 --- /dev/null +++ b/drivers/misc/habanalabs/common/decoder.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2022 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +#define VCMD_CONTROL_OFFSET 0x40 /* SWREG16 */ +#define VCMD_IRQ_STATUS_OFFSET 0x44 /* SWREG17 */ + +#define VCMD_IRQ_STATUS_ENDCMD_MASK 0x1 +#define VCMD_IRQ_STATUS_BUSERR_MASK 0x2 +#define VCMD_IRQ_STATUS_TIMEOUT_MASK 0x4 +#define VCMD_IRQ_STATUS_CMDERR_MASK 0x8 +#define VCMD_IRQ_STATUS_ABORT_MASK 0x10 +#define VCMD_IRQ_STATUS_RESET_MASK 0x20 + +static void dec_print_abnrm_intr_source(struct hl_device *hdev, u32 irq_status) +{ + const char *format = "abnormal interrupt source:%s%s%s%s%s%s\n"; + char *intr_source[6] = {"Unknown", "", "", "", "", ""}; + int i = 0; + + if (!irq_status) + return; + + if (irq_status & VCMD_IRQ_STATUS_ENDCMD_MASK) + intr_source[i++] = " ENDCMD"; + if (irq_status & VCMD_IRQ_STATUS_BUSERR_MASK) + intr_source[i++] = " BUSERR"; + if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) + intr_source[i++] = " TIMEOUT"; + if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) + intr_source[i++] = " CMDERR"; + if (irq_status & VCMD_IRQ_STATUS_ABORT_MASK) + intr_source[i++] = " ABORT"; + if (irq_status & VCMD_IRQ_STATUS_RESET_MASK) + intr_source[i++] = " RESET"; + + dev_err(hdev->dev, format, intr_source[0], intr_source[1], + intr_source[2], intr_source[3], intr_source[4], intr_source[5]); +} + +static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id) +{ + bool reset_required = false; + u32 irq_status; + + irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET); + + dev_err(hdev->dev, "Decoder abnormal interrupt %#x, core %d\n", irq_status, core_id); + + dec_print_abnrm_intr_source(hdev, irq_status); + + if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) + reset_required = true; + + /* Clear the interrupt */ + WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status); + + /* Flush the interrupt clear */ + RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET); + + if (reset_required) + hl_device_reset(hdev, HL_DRV_RESET_HARD); +} + +static void dec_completion_abnrm(struct work_struct *work) +{ + struct hl_dec *dec = container_of(work, struct hl_dec, completion_abnrm_work); + struct hl_device *hdev = dec->hdev; + + dec_error_intr_work(hdev, dec->base_addr, dec->core_id); +} + +void hl_dec_fini(struct hl_device *hdev) +{ + kfree(hdev->dec); +} + +int hl_dec_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_dec *dec; + int rc, j; + + /* if 
max_dec is 0, there is nothing to do */ + if (!prop->max_dec) + return 0; + + hdev->dec = kcalloc(prop->max_dec, sizeof(struct hl_dec), GFP_KERNEL); + if (!hdev->dec) + return -ENOMEM; + + for (j = 0 ; j < prop->max_dec ; j++) { + dec = hdev->dec + j; + + dec->hdev = hdev; + INIT_WORK(&dec->completion_abnrm_work, dec_completion_abnrm); + dec->core_id = j; + dec->base_addr = hdev->asic_funcs->get_dec_base_addr(hdev, j); + if (!dec->base_addr) { + dev_err(hdev->dev, "Invalid base address of decoder %d\n", j); + rc = -EINVAL; + goto err_dec_fini; + } + } + + return 0; + +err_dec_fini: + hl_dec_fini(hdev); + + return rc; +} + +void hl_dec_ctx_fini(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_dec *dec; + int j; + + for (j = 0 ; j < prop->max_dec ; j++) { + if (!!(prop->decoder_enabled_mask & BIT(j))) { + dec = hdev->dec + j; + /* Stop the decoder */ + WREG32(dec->base_addr + VCMD_CONTROL_OFFSET, 0); + } + } +} diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c new file mode 100644 index 000000000..9ee1b6abd --- /dev/null +++ b/drivers/misc/habanalabs/common/device.c @@ -0,0 +1,2258 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2022 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#define pr_fmt(fmt) "habanalabs: " fmt + +#include <uapi/misc/habanalabs.h> +#include "habanalabs.h" + +#include <linux/pci.h> +#include <linux/hwmon.h> + +#include <trace/events/habanalabs.h> + +#define HL_RESET_DELAY_USEC 10000 /* 10ms */ + +enum dma_alloc_type { + DMA_ALLOC_COHERENT, + DMA_ALLOC_CPU_ACCESSIBLE, + DMA_ALLOC_POOL, +}; + +#define MEM_SCRUB_DEFAULT_VAL 0x1122334455667788 + +/* + * hl_set_dram_bar - sets the bar to allow later access to address + * + * @hdev: pointer to habanalabs device structure. + * @addr: the address the caller wants to access. + * @region: the PCI region. + * + * @return: the old BAR base address on success, U64_MAX for failure. + * The caller should set it back to the old address after use. + * + * In case the bar space does not cover the whole address space, + * the bar base address should be set to allow access to a given address. + * This function can also be called if the bar doesn't need to be set; + * in that case it just won't change the base.
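+ *
+ * As an illustration (assuming, for example, a 2 GB power-of-2 DRAM BAR):
+ * an access to address 0x123456789 moves the BAR base to
+ * bar_base_addr = addr & ~(dram_pci_bar_size - 1) = 0x100000000, while a
+ * non-power-of-2 BAR size is handled by rounding the address down to a
+ * multiple of the BAR size with DIV_ROUND_DOWN_ULL().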
+ */ +static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_region *region) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 bar_base_addr, old_base; + + if (is_power_of_2(prop->dram_pci_bar_size)) + bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull); + else + bar_base_addr = DIV_ROUND_DOWN_ULL(addr, prop->dram_pci_bar_size) * + prop->dram_pci_bar_size; + + old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr); + + /* in case of success we need to update the new BAR base */ + if (old_base != U64_MAX) + region->region_base = bar_base_addr; + + return old_base; +} + +static int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val, + enum debugfs_access_type acc_type, enum pci_region region_type) +{ + struct pci_mem_region *region = &hdev->pci_mem_region[region_type]; + void __iomem *acc_addr; + u64 old_base = 0, rc; + + if (region_type == PCI_REGION_DRAM) { + old_base = hl_set_dram_bar(hdev, addr, region); + if (old_base == U64_MAX) + return -EIO; + } + + acc_addr = hdev->pcie_bar[region->bar_id] + addr - region->region_base + + region->offset_in_bar; + switch (acc_type) { + case DEBUGFS_READ8: + *val = readb(acc_addr); + break; + case DEBUGFS_WRITE8: + writeb(*val, acc_addr); + break; + case DEBUGFS_READ32: + *val = readl(acc_addr); + break; + case DEBUGFS_WRITE32: + writel(*val, acc_addr); + break; + case DEBUGFS_READ64: + *val = readq(acc_addr); + break; + case DEBUGFS_WRITE64: + writeq(*val, acc_addr); + break; + } + + if (region_type == PCI_REGION_DRAM) { + rc = hl_set_dram_bar(hdev, old_base, region); + if (rc == U64_MAX) + return -EIO; + } + + return 0; +} + +static void *hl_dma_alloc_common(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, + gfp_t flag, enum dma_alloc_type alloc_type, + const char *caller) +{ + void *ptr = NULL; + + switch (alloc_type) { + case DMA_ALLOC_COHERENT: + ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag); + break; + case DMA_ALLOC_CPU_ACCESSIBLE: + ptr = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle); + break; + case DMA_ALLOC_POOL: + ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle); + break; + } + + if (trace_habanalabs_dma_alloc_enabled() && !ZERO_OR_NULL_PTR(ptr)) + trace_habanalabs_dma_alloc(hdev->dev, (u64) (uintptr_t) ptr, *dma_handle, size, + caller); + + return ptr; +} + +static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *cpu_addr, + dma_addr_t dma_handle, enum dma_alloc_type alloc_type, + const char *caller) +{ + switch (alloc_type) { + case DMA_ALLOC_COHERENT: + hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle); + break; + case DMA_ALLOC_CPU_ACCESSIBLE: + hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, cpu_addr); + break; + case DMA_ALLOC_POOL: + hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle); + break; + } + + trace_habanalabs_dma_free(hdev->dev, (u64) (uintptr_t) cpu_addr, dma_handle, size, caller); +} + +void *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, + gfp_t flag, const char *caller) +{ + return hl_dma_alloc_common(hdev, size, dma_handle, flag, DMA_ALLOC_COHERENT, caller); +} + +void hl_asic_dma_free_coherent_caller(struct hl_device *hdev, size_t size, void *cpu_addr, + dma_addr_t dma_handle, const char *caller) +{ + hl_asic_dma_free_common(hdev, size, cpu_addr, dma_handle, DMA_ALLOC_COHERENT, caller); +} + +void 
*hl_cpu_accessible_dma_pool_alloc_caller(struct hl_device *hdev, size_t size, + dma_addr_t *dma_handle, const char *caller) +{ + return hl_dma_alloc_common(hdev, size, dma_handle, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller); +} + +void hl_cpu_accessible_dma_pool_free_caller(struct hl_device *hdev, size_t size, void *vaddr, + const char *caller) +{ + hl_asic_dma_free_common(hdev, size, vaddr, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller); +} + +void *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags, + dma_addr_t *dma_handle, const char *caller) +{ + return hl_dma_alloc_common(hdev, size, dma_handle, mem_flags, DMA_ALLOC_POOL, caller); +} + +void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_t dma_addr, + const char *caller) +{ + hl_asic_dma_free_common(hdev, 0, vaddr, dma_addr, DMA_ALLOC_POOL, caller); +} + +int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct scatterlist *sg; + int rc, i; + + rc = dma_map_sgtable(&hdev->pdev->dev, sgt, dir, 0); + if (rc) + return rc; + + /* Shift to the device's base physical address of host memory if necessary */ + if (prop->device_dma_offset_for_host_access) + for_each_sgtable_dma_sg(sgt, sg, i) + sg->dma_address += prop->device_dma_offset_for_host_access; + + return 0; +} + +void hl_dma_unmap_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct scatterlist *sg; + int i; + + /* Cancel the device's base physical address of host memory if necessary */ + if (prop->device_dma_offset_for_host_access) + for_each_sgtable_dma_sg(sgt, sg, i) + sg->dma_address -= prop->device_dma_offset_for_host_access; + + dma_unmap_sgtable(&hdev->pdev->dev, sgt, dir, 0); +} + +/* + * hl_access_cfg_region - access the config region + * + * @hdev: pointer to habanalabs device structure + * @addr: the address to access + * @val: the value to write from or read to + * @acc_type: the type of access (read/write 64/32) + */ +int hl_access_cfg_region(struct hl_device *hdev, u64 addr, u64 *val, + enum debugfs_access_type acc_type) +{ + struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG]; + u32 val_h, val_l; + + if (!IS_ALIGNED(addr, sizeof(u32))) { + dev_err(hdev->dev, "address %#llx not a multiple of %zu\n", addr, sizeof(u32)); + return -EINVAL; + } + + switch (acc_type) { + case DEBUGFS_READ32: + *val = RREG32(addr - cfg_region->region_base); + break; + case DEBUGFS_WRITE32: + WREG32(addr - cfg_region->region_base, *val); + break; + case DEBUGFS_READ64: + val_l = RREG32(addr - cfg_region->region_base); + val_h = RREG32(addr + sizeof(u32) - cfg_region->region_base); + + *val = (((u64) val_h) << 32) | val_l; + break; + case DEBUGFS_WRITE64: + WREG32(addr - cfg_region->region_base, lower_32_bits(*val)); + WREG32(addr + sizeof(u32) - cfg_region->region_base, upper_32_bits(*val)); + break; + default: + dev_err(hdev->dev, "access type %d is not supported\n", acc_type); + return -EOPNOTSUPP; + } + + return 0; +} + +/* + * hl_access_dev_mem - access device memory + * + * @hdev: pointer to habanalabs device structure + * @region_type: the type of the region the address belongs to + * @addr: the address to access + * @val: the value to write from or read to + * @acc_type: the type of access (r/w, 32/64) + */ +int hl_access_dev_mem(struct hl_device *hdev, enum pci_region region_type, + u64 addr, u64 *val, enum 
debugfs_access_type acc_type) +{ + switch (region_type) { + case PCI_REGION_CFG: + return hl_access_cfg_region(hdev, addr, val, acc_type); + case PCI_REGION_SRAM: + case PCI_REGION_DRAM: + return hl_access_sram_dram_region(hdev, addr, val, acc_type, + region_type); + default: + return -EFAULT; + } + + return 0; +} + +void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...) +{ + va_list args; + int str_size; + + va_start(args, fmt); + /* Calculate formatted string length. Assuming each string is null terminated, hence + * increment result by 1 + */ + str_size = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + + if ((e->actual_size + str_size) < e->allocated_buf_size) { + va_start(args, fmt); + vsnprintf(e->buf + e->actual_size, str_size, fmt, args); + va_end(args); + } + + /* Need to update the size even when not updating destination buffer to get the exact size + * of all input strings + */ + e->actual_size += str_size; +} + +enum hl_device_status hl_device_status(struct hl_device *hdev) +{ + enum hl_device_status status; + + if (hdev->reset_info.in_reset) { + if (hdev->reset_info.in_compute_reset) + status = HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE; + else + status = HL_DEVICE_STATUS_IN_RESET; + } else if (hdev->reset_info.needs_reset) { + status = HL_DEVICE_STATUS_NEEDS_RESET; + } else if (hdev->disabled) { + status = HL_DEVICE_STATUS_MALFUNCTION; + } else if (!hdev->init_done) { + status = HL_DEVICE_STATUS_IN_DEVICE_CREATION; + } else { + status = HL_DEVICE_STATUS_OPERATIONAL; + } + + return status; +} + +bool hl_device_operational(struct hl_device *hdev, + enum hl_device_status *status) +{ + enum hl_device_status current_status; + + current_status = hl_device_status(hdev); + if (status) + *status = current_status; + + switch (current_status) { + case HL_DEVICE_STATUS_IN_RESET: + case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE: + case HL_DEVICE_STATUS_MALFUNCTION: + case HL_DEVICE_STATUS_NEEDS_RESET: + return false; + case HL_DEVICE_STATUS_OPERATIONAL: + case HL_DEVICE_STATUS_IN_DEVICE_CREATION: + default: + return true; + } +} + +static void hpriv_release(struct kref *ref) +{ + u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; + bool device_is_idle = true; + struct hl_fpriv *hpriv; + struct hl_device *hdev; + + hpriv = container_of(ref, struct hl_fpriv, refcount); + + hdev = hpriv->hdev; + + hdev->asic_funcs->send_device_activity(hdev, false); + + put_pid(hpriv->taskpid); + + hl_debugfs_remove_file(hpriv); + + mutex_destroy(&hpriv->ctx_lock); + mutex_destroy(&hpriv->restore_phase_mutex); + + if ((!hdev->pldm) && (hdev->pdev) && + (!hdev->asic_funcs->is_device_idle(hdev, + idle_mask, + HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) { + dev_err(hdev->dev, + "device not idle after user context is closed (0x%llx_%llx)\n", + idle_mask[1], idle_mask[0]); + + device_is_idle = false; + } + + /* We need to remove the user from the list to make sure the reset process won't + * try to kill the user process. Because, if we got here, it means there are no + * more driver/device resources that the user process is occupying so there is + * no need to kill it + * + * However, we can't set the compute_ctx to NULL at this stage. This is to prevent + * a race between the release and opening the device again. We don't want to let + * a user open the device while there a reset is about to happen. 
+ */ + mutex_lock(&hdev->fpriv_list_lock); + list_del(&hpriv->dev_node); + mutex_unlock(&hdev->fpriv_list_lock); + + if (!device_is_idle || hdev->reset_upon_device_release) { + hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE); + } else { + int rc = hdev->asic_funcs->scrub_device_mem(hdev); + + if (rc) + dev_err(hdev->dev, "failed to scrub memory from hpriv release (%d)\n", rc); + } + + /* Now we can mark the compute_ctx as not active. Even if a reset is running in a different + * thread, we don't care because the in_reset is marked so if a user will try to open + * the device it will fail on that, even if compute_ctx is false. + */ + mutex_lock(&hdev->fpriv_list_lock); + hdev->is_compute_ctx_active = false; + mutex_unlock(&hdev->fpriv_list_lock); + + hdev->compute_ctx_in_release = 0; + + /* release the eventfd */ + if (hpriv->notifier_event.eventfd) + eventfd_ctx_put(hpriv->notifier_event.eventfd); + + mutex_destroy(&hpriv->notifier_event.lock); + + kfree(hpriv); +} + +void hl_hpriv_get(struct hl_fpriv *hpriv) +{ + kref_get(&hpriv->refcount); +} + +int hl_hpriv_put(struct hl_fpriv *hpriv) +{ + return kref_put(&hpriv->refcount, hpriv_release); +} + +/* + * hl_device_release - release function for habanalabs device + * + * @inode: pointer to inode structure + * @filp: pointer to file structure + * + * Called when process closes an habanalabs device + */ +static int hl_device_release(struct inode *inode, struct file *filp) +{ + struct hl_fpriv *hpriv = filp->private_data; + struct hl_device *hdev = hpriv->hdev; + + filp->private_data = NULL; + + if (!hdev) { + pr_crit("Closing FD after device was removed. Memory leak will occur and it is advised to reboot.\n"); + put_pid(hpriv->taskpid); + return 0; + } + + /* Each pending user interrupt holds the user's context, hence we + * must release them all before calling hl_ctx_mgr_fini(). + */ + hl_release_pending_user_interrupts(hpriv->hdev); + + hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr); + hl_mem_mgr_fini(&hpriv->mem_mgr); + + hdev->compute_ctx_in_release = 1; + + if (!hl_hpriv_put(hpriv)) + dev_notice(hdev->dev, + "User process closed FD but device still in use\n"); + + hdev->last_open_session_duration_jif = + jiffies - hdev->last_successful_open_jif; + + return 0; +} + +static int hl_device_release_ctrl(struct inode *inode, struct file *filp) +{ + struct hl_fpriv *hpriv = filp->private_data; + struct hl_device *hdev = hpriv->hdev; + + filp->private_data = NULL; + + if (!hdev) { + pr_err("Closing FD after device was removed\n"); + goto out; + } + + mutex_lock(&hdev->fpriv_ctrl_list_lock); + list_del(&hpriv->dev_node); + mutex_unlock(&hdev->fpriv_ctrl_list_lock); +out: + /* release the eventfd */ + if (hpriv->notifier_event.eventfd) + eventfd_ctx_put(hpriv->notifier_event.eventfd); + + mutex_destroy(&hpriv->notifier_event.lock); + put_pid(hpriv->taskpid); + + kfree(hpriv); + + return 0; +} + +/* + * hl_mmap - mmap function for habanalabs device + * + * @*filp: pointer to file structure + * @*vma: pointer to vm_area_struct of the process + * + * Called when process does an mmap on habanalabs device. Call the relevant mmap + * function at the end of the common code. + */ +static int hl_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct hl_fpriv *hpriv = filp->private_data; + struct hl_device *hdev = hpriv->hdev; + unsigned long vm_pgoff; + + if (!hdev) { + pr_err_ratelimited("Trying to mmap after device was removed! 
Please close FD\n"); + return -ENODEV; + } + + vm_pgoff = vma->vm_pgoff; + + switch (vm_pgoff & HL_MMAP_TYPE_MASK) { + case HL_MMAP_TYPE_BLOCK: + vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff); + return hl_hw_block_mmap(hpriv, vma); + + case HL_MMAP_TYPE_CB: + case HL_MMAP_TYPE_TS_BUFF: + return hl_mem_mgr_mmap(&hpriv->mem_mgr, vma, NULL); + } + return -EINVAL; +} + +static const struct file_operations hl_ops = { + .owner = THIS_MODULE, + .open = hl_device_open, + .release = hl_device_release, + .mmap = hl_mmap, + .unlocked_ioctl = hl_ioctl, + .compat_ioctl = hl_ioctl +}; + +static const struct file_operations hl_ctrl_ops = { + .owner = THIS_MODULE, + .open = hl_device_open_ctrl, + .release = hl_device_release_ctrl, + .unlocked_ioctl = hl_ioctl_control, + .compat_ioctl = hl_ioctl_control +}; + +static void device_release_func(struct device *dev) +{ + kfree(dev); +} + +/* + * device_init_cdev - Initialize cdev and device for habanalabs device + * + * @hdev: pointer to habanalabs device structure + * @hclass: pointer to the class object of the device + * @minor: minor number of the specific device + * @fpos: file operations to install for this device + * @name: name of the device as it will appear in the filesystem + * @cdev: pointer to the char device object that will be initialized + * @dev: pointer to the device object that will be initialized + * + * Initialize a cdev and a Linux device for habanalabs's device. + */ +static int device_init_cdev(struct hl_device *hdev, struct class *hclass, + int minor, const struct file_operations *fops, + char *name, struct cdev *cdev, + struct device **dev) +{ + cdev_init(cdev, fops); + cdev->owner = THIS_MODULE; + + *dev = kzalloc(sizeof(**dev), GFP_KERNEL); + if (!*dev) + return -ENOMEM; + + device_initialize(*dev); + (*dev)->devt = MKDEV(hdev->major, minor); + (*dev)->class = hclass; + (*dev)->release = device_release_func; + dev_set_drvdata(*dev, hdev); + dev_set_name(*dev, "%s", name); + + return 0; +} + +static int device_cdev_sysfs_add(struct hl_device *hdev) +{ + int rc; + + rc = cdev_device_add(&hdev->cdev, hdev->dev); + if (rc) { + dev_err(hdev->dev, + "failed to add a char device to the system\n"); + return rc; + } + + rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl); + if (rc) { + dev_err(hdev->dev, + "failed to add a control char device to the system\n"); + goto delete_cdev_device; + } + + /* hl_sysfs_init() must be done after adding the device to the system */ + rc = hl_sysfs_init(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize sysfs\n"); + goto delete_ctrl_cdev_device; + } + + hdev->cdev_sysfs_created = true; + + return 0; + +delete_ctrl_cdev_device: + cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); +delete_cdev_device: + cdev_device_del(&hdev->cdev, hdev->dev); + return rc; +} + +static void device_cdev_sysfs_del(struct hl_device *hdev) +{ + if (!hdev->cdev_sysfs_created) + goto put_devices; + + hl_sysfs_fini(hdev); + cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); + cdev_device_del(&hdev->cdev, hdev->dev); + +put_devices: + put_device(hdev->dev); + put_device(hdev->dev_ctrl); +} + +static void device_hard_reset_pending(struct work_struct *work) +{ + struct hl_device_reset_work *device_reset_work = + container_of(work, struct hl_device_reset_work, reset_work.work); + struct hl_device *hdev = device_reset_work->hdev; + u32 flags; + int rc; + + flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR; + + rc = hl_device_reset(hdev, flags); + if ((rc == -EBUSY) && !hdev->device_fini_pending) { + 
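+ /*
+ * The reset could not run to completion yet (e.g. open user processes
+ * are still being killed), so log it and re-queue this work to retry
+ * after HL_PENDING_RESET_PER_SEC seconds.
+ */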
dev_info(hdev->dev, + "Could not reset device. will try again in %u seconds", + HL_PENDING_RESET_PER_SEC); + + queue_delayed_work(device_reset_work->wq, + &device_reset_work->reset_work, + msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000)); + } +} + +/* + * device_early_init - do some early initialization for the habanalabs device + * + * @hdev: pointer to habanalabs device structure + * + * Install the relevant function pointers and call the early_init function, + * if such a function exists + */ +static int device_early_init(struct hl_device *hdev) +{ + int i, rc; + char workq_name[32]; + + switch (hdev->asic_type) { + case ASIC_GOYA: + goya_set_asic_funcs(hdev); + strscpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name)); + break; + case ASIC_GAUDI: + gaudi_set_asic_funcs(hdev); + strscpy(hdev->asic_name, "GAUDI", sizeof(hdev->asic_name)); + break; + case ASIC_GAUDI_SEC: + gaudi_set_asic_funcs(hdev); + strscpy(hdev->asic_name, "GAUDI SEC", sizeof(hdev->asic_name)); + break; + case ASIC_GAUDI2: + gaudi2_set_asic_funcs(hdev); + strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name)); + break; + case ASIC_GAUDI2_SEC: + gaudi2_set_asic_funcs(hdev); + strscpy(hdev->asic_name, "GAUDI2 SEC", sizeof(hdev->asic_name)); + break; + default: + dev_err(hdev->dev, "Unrecognized ASIC type %d\n", + hdev->asic_type); + return -EINVAL; + } + + rc = hdev->asic_funcs->early_init(hdev); + if (rc) + return rc; + + rc = hl_asid_init(hdev); + if (rc) + goto early_fini; + + if (hdev->asic_prop.completion_queues_count) { + hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count, + sizeof(struct workqueue_struct *), + GFP_KERNEL); + if (!hdev->cq_wq) { + rc = -ENOMEM; + goto asid_fini; + } + } + + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) { + snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i); + hdev->cq_wq[i] = create_singlethread_workqueue(workq_name); + if (hdev->cq_wq[i] == NULL) { + dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); + rc = -ENOMEM; + goto free_cq_wq; + } + } + + hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0); + if (hdev->eq_wq == NULL) { + dev_err(hdev->dev, "Failed to allocate EQ workqueue\n"); + rc = -ENOMEM; + goto free_cq_wq; + } + + hdev->cs_cmplt_wq = alloc_workqueue("hl-cs-completions", WQ_UNBOUND, 0); + if (!hdev->cs_cmplt_wq) { + dev_err(hdev->dev, + "Failed to allocate CS completions workqueue\n"); + rc = -ENOMEM; + goto free_eq_wq; + } + + hdev->ts_free_obj_wq = alloc_workqueue("hl-ts-free-obj", WQ_UNBOUND, 0); + if (!hdev->ts_free_obj_wq) { + dev_err(hdev->dev, + "Failed to allocate Timestamp registration free workqueue\n"); + rc = -ENOMEM; + goto free_cs_cmplt_wq; + } + + hdev->pf_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0); + if (!hdev->pf_wq) { + dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n"); + rc = -ENOMEM; + goto free_ts_free_wq; + } + + hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), + GFP_KERNEL); + if (!hdev->hl_chip_info) { + rc = -ENOMEM; + goto free_pf_wq; + } + + rc = hl_mmu_if_set_funcs(hdev); + if (rc) + goto free_chip_info; + + hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr); + + hdev->device_reset_work.wq = + create_singlethread_workqueue("hl_device_reset"); + if (!hdev->device_reset_work.wq) { + rc = -ENOMEM; + dev_err(hdev->dev, "Failed to create device reset WQ\n"); + goto free_cb_mgr; + } + + INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, + device_hard_reset_pending); + hdev->device_reset_work.hdev = hdev; + hdev->device_fini_pending = 0; + + 
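+ /*
+ * Initialize the locks and lists that are shared across the common code,
+ * including the CS mirror list, the user FD lists and their locks, and
+ * the clock-throttling lock.
+ */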
mutex_init(&hdev->send_cpu_message_lock); + mutex_init(&hdev->debug_lock); + INIT_LIST_HEAD(&hdev->cs_mirror_list); + spin_lock_init(&hdev->cs_mirror_lock); + spin_lock_init(&hdev->reset_info.lock); + INIT_LIST_HEAD(&hdev->fpriv_list); + INIT_LIST_HEAD(&hdev->fpriv_ctrl_list); + mutex_init(&hdev->fpriv_list_lock); + mutex_init(&hdev->fpriv_ctrl_list_lock); + mutex_init(&hdev->clk_throttling.lock); + + return 0; + +free_cb_mgr: + hl_mem_mgr_fini(&hdev->kernel_mem_mgr); +free_chip_info: + kfree(hdev->hl_chip_info); +free_pf_wq: + destroy_workqueue(hdev->pf_wq); +free_ts_free_wq: + destroy_workqueue(hdev->ts_free_obj_wq); +free_cs_cmplt_wq: + destroy_workqueue(hdev->cs_cmplt_wq); +free_eq_wq: + destroy_workqueue(hdev->eq_wq); +free_cq_wq: + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + if (hdev->cq_wq[i]) + destroy_workqueue(hdev->cq_wq[i]); + kfree(hdev->cq_wq); +asid_fini: + hl_asid_fini(hdev); +early_fini: + if (hdev->asic_funcs->early_fini) + hdev->asic_funcs->early_fini(hdev); + + return rc; +} + +/* + * device_early_fini - finalize all that was done in device_early_init + * + * @hdev: pointer to habanalabs device structure + * + */ +static void device_early_fini(struct hl_device *hdev) +{ + int i; + + mutex_destroy(&hdev->debug_lock); + mutex_destroy(&hdev->send_cpu_message_lock); + + mutex_destroy(&hdev->fpriv_list_lock); + mutex_destroy(&hdev->fpriv_ctrl_list_lock); + + mutex_destroy(&hdev->clk_throttling.lock); + + hl_mem_mgr_fini(&hdev->kernel_mem_mgr); + + kfree(hdev->hl_chip_info); + + destroy_workqueue(hdev->pf_wq); + destroy_workqueue(hdev->ts_free_obj_wq); + destroy_workqueue(hdev->cs_cmplt_wq); + destroy_workqueue(hdev->eq_wq); + destroy_workqueue(hdev->device_reset_work.wq); + + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + destroy_workqueue(hdev->cq_wq[i]); + kfree(hdev->cq_wq); + + hl_asid_fini(hdev); + + if (hdev->asic_funcs->early_fini) + hdev->asic_funcs->early_fini(hdev); +} + +static bool is_pci_link_healthy(struct hl_device *hdev) +{ + u16 vendor_id; + + if (!hdev->pdev) + return false; + + pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id); + + return (vendor_id == PCI_VENDOR_ID_HABANALABS); +} + +static void hl_device_heartbeat(struct work_struct *work) +{ + struct hl_device *hdev = container_of(work, struct hl_device, + work_heartbeat.work); + + if (!hl_device_operational(hdev, NULL)) + goto reschedule; + + if (!hdev->asic_funcs->send_heartbeat(hdev)) + goto reschedule; + + if (hl_device_operational(hdev, NULL)) + dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n", + is_pci_link_healthy(hdev) ? "healthy" : "broken"); + + hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT); + + return; + +reschedule: + /* + * prev_reset_trigger tracks consecutive fatal h/w errors until first + * heartbeat immediately post reset. + * If control reached here, then at least one heartbeat work has been + * scheduled since last reset/init cycle. + * So if the device is not already in reset cycle, reset the flag + * prev_reset_trigger as no reset occurred with HL_DRV_RESET_FW_FATAL_ERR + * status for at least one heartbeat. From this point driver restarts + * tracking future consecutive fatal errors. 
+ */ + if (!hdev->reset_info.in_reset) + hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; + + schedule_delayed_work(&hdev->work_heartbeat, + usecs_to_jiffies(HL_HEARTBEAT_PER_USEC)); +} + +/* + * device_late_init - do late stuff initialization for the habanalabs device + * + * @hdev: pointer to habanalabs device structure + * + * Do stuff that either needs the device H/W queues to be active or needs + * to happen after all the rest of the initialization is finished + */ +static int device_late_init(struct hl_device *hdev) +{ + int rc; + + if (hdev->asic_funcs->late_init) { + rc = hdev->asic_funcs->late_init(hdev); + if (rc) { + dev_err(hdev->dev, + "failed late initialization for the H/W\n"); + return rc; + } + } + + hdev->high_pll = hdev->asic_prop.high_pll; + + if (hdev->heartbeat) { + INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat); + schedule_delayed_work(&hdev->work_heartbeat, + usecs_to_jiffies(HL_HEARTBEAT_PER_USEC)); + } + + hdev->late_init_done = true; + + return 0; +} + +/* + * device_late_fini - finalize all that was done in device_late_init + * + * @hdev: pointer to habanalabs device structure + * + */ +static void device_late_fini(struct hl_device *hdev) +{ + if (!hdev->late_init_done) + return; + + if (hdev->heartbeat) + cancel_delayed_work_sync(&hdev->work_heartbeat); + + if (hdev->asic_funcs->late_fini) + hdev->asic_funcs->late_fini(hdev); + + hdev->late_init_done = false; +} + +int hl_device_utilization(struct hl_device *hdev, u32 *utilization) +{ + u64 max_power, curr_power, dc_power, dividend; + int rc; + + max_power = hdev->max_power; + dc_power = hdev->asic_prop.dc_power_default; + rc = hl_fw_cpucp_power_get(hdev, &curr_power); + + if (rc) + return rc; + + curr_power = clamp(curr_power, dc_power, max_power); + + dividend = (curr_power - dc_power) * 100; + *utilization = (u32) div_u64(dividend, (max_power - dc_power)); + + return 0; +} + +int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable) +{ + int rc = 0; + + mutex_lock(&hdev->debug_lock); + + if (!enable) { + if (!hdev->in_debug) { + dev_err(hdev->dev, + "Failed to disable debug mode because device was not in debug mode\n"); + rc = -EFAULT; + goto out; + } + + if (!hdev->reset_info.hard_reset_pending) + hdev->asic_funcs->halt_coresight(hdev, ctx); + + hdev->in_debug = 0; + + goto out; + } + + if (hdev->in_debug) { + dev_err(hdev->dev, + "Failed to enable debug mode because device is already in debug mode\n"); + rc = -EFAULT; + goto out; + } + + hdev->in_debug = 1; + +out: + mutex_unlock(&hdev->debug_lock); + + return rc; +} + +static void take_release_locks(struct hl_device *hdev) +{ + /* Flush anyone that is inside the critical section of enqueue + * jobs to the H/W + */ + hdev->asic_funcs->hw_queues_lock(hdev); + hdev->asic_funcs->hw_queues_unlock(hdev); + + /* Flush processes that are sending message to CPU */ + mutex_lock(&hdev->send_cpu_message_lock); + mutex_unlock(&hdev->send_cpu_message_lock); + + /* Flush anyone that is inside device open */ + mutex_lock(&hdev->fpriv_list_lock); + mutex_unlock(&hdev->fpriv_list_lock); + mutex_lock(&hdev->fpriv_ctrl_list_lock); + mutex_unlock(&hdev->fpriv_ctrl_list_lock); +} + +static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset, + bool skip_wq_flush) +{ + if (hard_reset) + device_late_fini(hdev); + + /* + * Halt the engines and disable interrupts so we won't get any more + * completions from H/W and we won't have any accesses from the + * H/W to the host machine + */ + 
hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset); + + /* Go over all the queues, release all CS and their jobs */ + hl_cs_rollback_all(hdev, skip_wq_flush); + + /* flush the MMU prefetch workqueue */ + flush_workqueue(hdev->pf_wq); + + /* Release all pending user interrupts, each pending user interrupt + * holds a reference to user context + */ + hl_release_pending_user_interrupts(hdev); +} + +/* + * hl_device_suspend - initiate device suspend + * + * @hdev: pointer to habanalabs device structure + * + * Puts the hw in the suspend state (all asics). + * Returns 0 for success or an error on failure. + * Called at driver suspend. + */ +int hl_device_suspend(struct hl_device *hdev) +{ + int rc; + + pci_save_state(hdev->pdev); + + /* Block future CS/VM/JOB completion operations */ + spin_lock(&hdev->reset_info.lock); + if (hdev->reset_info.in_reset) { + spin_unlock(&hdev->reset_info.lock); + dev_err(hdev->dev, "Can't suspend while in reset\n"); + return -EIO; + } + hdev->reset_info.in_reset = 1; + spin_unlock(&hdev->reset_info.lock); + + /* This blocks all other stuff that is not blocked by in_reset */ + hdev->disabled = true; + + take_release_locks(hdev); + + rc = hdev->asic_funcs->suspend(hdev); + if (rc) + dev_err(hdev->dev, + "Failed to disable PCI access of device CPU\n"); + + /* Shut down the device */ + pci_disable_device(hdev->pdev); + pci_set_power_state(hdev->pdev, PCI_D3hot); + + return 0; +} + +/* + * hl_device_resume - initiate device resume + * + * @hdev: pointer to habanalabs device structure + * + * Bring the hw back to operating state (all asics). + * Returns 0 for success or an error on failure. + * Called at driver resume. + */ +int hl_device_resume(struct hl_device *hdev) +{ + int rc; + + pci_set_power_state(hdev->pdev, PCI_D0); + pci_restore_state(hdev->pdev); + rc = pci_enable_device_mem(hdev->pdev); + if (rc) { + dev_err(hdev->dev, + "Failed to enable PCI device in resume\n"); + return rc; + } + + pci_set_master(hdev->pdev); + + rc = hdev->asic_funcs->resume(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to resume device after suspend\n"); + goto disable_device; + } + + + /* 'in_reset' was set to true during suspend, now we must clear it in order + * for hard reset to be performed + */ + spin_lock(&hdev->reset_info.lock); + hdev->reset_info.in_reset = 0; + spin_unlock(&hdev->reset_info.lock); + + rc = hl_device_reset(hdev, HL_DRV_RESET_HARD); + if (rc) { + dev_err(hdev->dev, "Failed to reset device during resume\n"); + goto disable_device; + } + + return 0; + +disable_device: + pci_clear_master(hdev->pdev); + pci_disable_device(hdev->pdev); + + return rc; +} + +static int device_kill_open_processes(struct hl_device *hdev, u32 timeout, bool control_dev) +{ + struct task_struct *task = NULL; + struct list_head *fd_list; + struct hl_fpriv *hpriv; + struct mutex *fd_lock; + u32 pending_cnt; + + fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock; + fd_list = control_dev ? 
&hdev->fpriv_ctrl_list : &hdev->fpriv_list; + + /* Giving time for user to close FD, and for processes that are inside + * hl_device_open to finish + */ + if (!list_empty(fd_list)) + ssleep(1); + + if (timeout) { + pending_cnt = timeout; + } else { + if (hdev->process_kill_trial_cnt) { + /* Processes have been already killed */ + pending_cnt = 1; + goto wait_for_processes; + } else { + /* Wait a small period after process kill */ + pending_cnt = HL_PENDING_RESET_PER_SEC; + } + } + + mutex_lock(fd_lock); + + /* This section must be protected because we are dereferencing + * pointers that are freed if the process exits + */ + list_for_each_entry(hpriv, fd_list, dev_node) { + task = get_pid_task(hpriv->taskpid, PIDTYPE_PID); + if (task) { + dev_info(hdev->dev, "Killing user process pid=%d\n", + task_pid_nr(task)); + send_sig(SIGKILL, task, 1); + usleep_range(1000, 10000); + + put_task_struct(task); + } else { + /* + * If we got here, it means that process was killed from outside the driver + * right after it started looping on fd_list and before get_pid_task, thus + * we don't need to kill it. + */ + dev_dbg(hdev->dev, + "Can't get task struct for user process, assuming process was killed from outside the driver\n"); + } + } + + mutex_unlock(fd_lock); + + /* + * We killed the open users, but that doesn't mean they are closed. + * It could be that they are running a long cleanup phase in the driver + * e.g. MMU unmappings, or running other long teardown flow even before + * our cleanup. + * Therefore we need to wait again to make sure they are closed before + * continuing with the reset. + */ + +wait_for_processes: + while ((!list_empty(fd_list)) && (pending_cnt)) { + dev_dbg(hdev->dev, + "Waiting for all unmap operations to finish before hard reset\n"); + + pending_cnt--; + + ssleep(1); + } + + /* All processes exited successfully */ + if (list_empty(fd_list)) + return 0; + + /* Give up waiting for processes to exit */ + if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS) + return -ETIME; + + hdev->process_kill_trial_cnt++; + + return -EBUSY; +} + +static void device_disable_open_processes(struct hl_device *hdev, bool control_dev) +{ + struct list_head *fd_list; + struct hl_fpriv *hpriv; + struct mutex *fd_lock; + + fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock; + fd_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list; + + mutex_lock(fd_lock); + list_for_each_entry(hpriv, fd_list, dev_node) + hpriv->hdev = NULL; + mutex_unlock(fd_lock); +} + +static void handle_reset_trigger(struct hl_device *hdev, u32 flags) +{ + u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT; + + /* + * 'reset cause' is being updated here, because getting here + * means that it's the 1st time and the last time we're here + * ('in_reset' makes sure of it). This makes sure that + * 'reset_cause' will continue holding its 1st recorded reason! 
+ */ + if (flags & HL_DRV_RESET_HEARTBEAT) { + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT; + cur_reset_trigger = HL_DRV_RESET_HEARTBEAT; + } else if (flags & HL_DRV_RESET_TDR) { + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR; + cur_reset_trigger = HL_DRV_RESET_TDR; + } else if (flags & HL_DRV_RESET_FW_FATAL_ERR) { + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + cur_reset_trigger = HL_DRV_RESET_FW_FATAL_ERR; + } else { + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + } + + /* + * If the reset cause is the same twice in a row, reset_trigger_repeated + * is set, and if this reset is due to a fatal FW error the + * device is put in an unstable state. + */ + if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) { + hdev->reset_info.prev_reset_trigger = cur_reset_trigger; + hdev->reset_info.reset_trigger_repeated = 0; + } else { + hdev->reset_info.reset_trigger_repeated = 1; + } + + /* If the reset is due to heartbeat, the device CPU is not responsive, + * in which case there is no point in sending it the PCI disable message. + * + * If F/W is performing the reset, there is no need to send it a message to disable + * PCI access + */ + if ((flags & HL_DRV_RESET_HARD) && + !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) { + /* Disable PCI access from the device F/W so it won't send + * us additional interrupts. We disable MSI/MSI-X at + * the halt_engines function and we can't have the F/W + * sending us interrupts after that. We need to disable + * the access here because if the device is marked as + * disabled, the message won't be sent. Also, in case + * of heartbeat, the device CPU is marked as disabled, + * so this message won't be sent + */ + if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) + dev_warn(hdev->dev, + "Failed to disable PCI access by F/W\n"); + } +} + + /* + * hl_device_reset - reset the device + * + * @hdev: pointer to habanalabs device structure + * @flags: reset flags. + * + * Block future CS and wait for pending CS to be enqueued + * Call ASIC H/W fini + * Flush all completions + * Re-initialize all internal data structures + * Call ASIC H/W init, late_init + * Test queues + * Enable device + * + * Returns 0 for success or an error on failure.
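+ *
+ * For example, the heartbeat worker above requests a hard reset with
+ * hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT), and
+ * hpriv_release() requests a reset-upon-device-release with
+ * hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE).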
+ */ +int hl_device_reset(struct hl_device *hdev, u32 flags) +{ + bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false, + reset_upon_device_release = false, schedule_hard_reset = false, + skip_wq_flush, delay_reset; + u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; + struct hl_ctx *ctx; + int i, rc; + + if (!hdev->init_done) { + dev_err(hdev->dev, "Can't reset before initialization is done\n"); + return 0; + } + + hard_reset = !!(flags & HL_DRV_RESET_HARD); + from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR); + fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW); + skip_wq_flush = !!(flags & HL_DRV_RESET_DEV_RELEASE); + delay_reset = !!(flags & HL_DRV_RESET_DELAY); + + if (!hard_reset && !hdev->asic_prop.supports_compute_reset) { + hard_instead_soft = true; + hard_reset = true; + } + + if (hdev->reset_upon_device_release && (flags & HL_DRV_RESET_DEV_RELEASE)) { + if (hard_reset) { + dev_crit(hdev->dev, + "Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n"); + return -EINVAL; + } + + reset_upon_device_release = true; + + goto do_reset; + } + + if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) { + hard_instead_soft = true; + hard_reset = true; + } + + if (hard_instead_soft) + dev_dbg(hdev->dev, "Doing hard-reset instead of compute reset\n"); + +do_reset: + /* Re-entry of reset thread */ + if (from_hard_reset_thread && hdev->process_kill_trial_cnt) + goto kill_processes; + + /* + * Prevent concurrency in this function - only one reset should be + * done at any given time. Only need to perform this if we didn't + * get from the dedicated hard reset thread + */ + if (!from_hard_reset_thread) { + /* Block future CS/VM/JOB completion operations */ + spin_lock(&hdev->reset_info.lock); + if (hdev->reset_info.in_reset) { + /* We only allow scheduling of a hard reset during compute reset */ + if (hard_reset && hdev->reset_info.in_compute_reset) + hdev->reset_info.hard_reset_schedule_flags = flags; + spin_unlock(&hdev->reset_info.lock); + return 0; + } + + /* This still allows the completion of some KDMA ops + * Update this before in_reset because in_compute_reset implies we are in reset + */ + hdev->reset_info.in_compute_reset = !hard_reset; + + hdev->reset_info.in_reset = 1; + + spin_unlock(&hdev->reset_info.lock); + + if (delay_reset) + usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1); + + handle_reset_trigger(hdev, flags); + + /* This also blocks future CS/VM/JOB completion operations */ + hdev->disabled = true; + + take_release_locks(hdev); + + if (hard_reset) + dev_info(hdev->dev, "Going to reset device\n"); + else if (reset_upon_device_release) + dev_dbg(hdev->dev, "Going to reset device after release by user\n"); + else + dev_dbg(hdev->dev, "Going to reset engines of inference device\n"); + } + +again: + if ((hard_reset) && (!from_hard_reset_thread)) { + hdev->reset_info.hard_reset_pending = true; + + hdev->process_kill_trial_cnt = 0; + + hdev->device_reset_work.flags = flags; + + /* + * Because the reset function can't run from heartbeat work, + * we need to call the reset function from a dedicated work. + */ + queue_delayed_work(hdev->device_reset_work.wq, + &hdev->device_reset_work.reset_work, 0); + + return 0; + } + + cleanup_resources(hdev, hard_reset, fw_reset, skip_wq_flush); + +kill_processes: + if (hard_reset) { + /* Kill processes here after CS rollback. 
This is because the + * process can't really exit until all its CSs are done, which + * is what we do in cs rollback + */ + rc = device_kill_open_processes(hdev, 0, false); + + if (rc == -EBUSY) { + if (hdev->device_fini_pending) { + dev_crit(hdev->dev, + "%s Failed to kill all open processes, stopping hard reset\n", + dev_name(&(hdev)->pdev->dev)); + goto out_err; + } + + /* signal reset thread to reschedule */ + return rc; + } + + if (rc) { + dev_crit(hdev->dev, + "%s Failed to kill all open processes, stopping hard reset\n", + dev_name(&(hdev)->pdev->dev)); + goto out_err; + } + + /* Flush the Event queue workers to make sure no other thread is + * reading or writing to registers during the reset + */ + flush_workqueue(hdev->eq_wq); + } + + /* Reset the H/W. It will be in idle state after this returns */ + hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset); + + if (hard_reset) { + hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; + + /* Release kernel context */ + if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1) + hdev->kernel_ctx = NULL; + + hl_vm_fini(hdev); + hl_mmu_fini(hdev); + hl_eq_reset(hdev, &hdev->event_queue); + } + + /* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */ + hl_hw_queue_reset(hdev, hard_reset); + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + hl_cq_reset(hdev, &hdev->completion_queue[i]); + + /* Make sure the context switch phase will run again */ + ctx = hl_get_compute_ctx(hdev); + if (ctx) { + atomic_set(&ctx->thread_ctx_switch_token, 1); + ctx->thread_ctx_switch_wait_token = 0; + hl_ctx_put(ctx); + } + + /* Finished tear-down, starting to re-initialize */ + + if (hard_reset) { + hdev->device_cpu_disabled = false; + hdev->reset_info.hard_reset_pending = false; + + if (hdev->reset_info.reset_trigger_repeated && + (hdev->reset_info.prev_reset_trigger == + HL_DRV_RESET_FW_FATAL_ERR)) { + /* if there 2 back to back resets from FW, + * ensure driver puts the driver in a unusable state + */ + dev_crit(hdev->dev, + "%s Consecutive FW fatal errors received, stopping hard reset\n", + dev_name(&(hdev)->pdev->dev)); + rc = -EIO; + goto out_err; + } + + if (hdev->kernel_ctx) { + dev_crit(hdev->dev, + "%s kernel ctx was alive during hard reset, something is terribly wrong\n", + dev_name(&(hdev)->pdev->dev)); + rc = -EBUSY; + goto out_err; + } + + rc = hl_mmu_init(hdev); + if (rc) { + dev_err(hdev->dev, + "Failed to initialize MMU S/W after hard reset\n"); + goto out_err; + } + + /* Allocate the kernel context */ + hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), + GFP_KERNEL); + if (!hdev->kernel_ctx) { + rc = -ENOMEM; + hl_mmu_fini(hdev); + goto out_err; + } + + hdev->is_compute_ctx_active = false; + + rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); + if (rc) { + dev_err(hdev->dev, + "failed to init kernel ctx in hard reset\n"); + kfree(hdev->kernel_ctx); + hdev->kernel_ctx = NULL; + hl_mmu_fini(hdev); + goto out_err; + } + } + + /* Device is now enabled as part of the initialization requires + * communication with the device firmware to get information that + * is required for the initialization itself + */ + hdev->disabled = false; + + /* F/W security enabled indication might be updated after hard-reset */ + if (hard_reset) { + rc = hl_fw_read_preboot_status(hdev); + if (rc) + goto out_err; + } + + rc = hdev->asic_funcs->hw_init(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize the H/W after reset\n"); + goto out_err; + } + + /* If device is not idle fail the reset process */ + if 
(!hdev->asic_funcs->is_device_idle(hdev, idle_mask, + HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) { + dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n", + idle_mask[1], idle_mask[0]); + rc = -EIO; + goto out_err; + } + + /* Check that the communication with the device is working */ + rc = hdev->asic_funcs->test_queues(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to detect if device is alive after reset\n"); + goto out_err; + } + + if (hard_reset) { + rc = device_late_init(hdev); + if (rc) { + dev_err(hdev->dev, "Failed late init after hard reset\n"); + goto out_err; + } + + rc = hl_vm_init(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to init memory module after hard reset\n"); + goto out_err; + } + + if (!hdev->asic_prop.fw_security_enabled) + hl_fw_set_max_power(hdev); + } else { + rc = hdev->asic_funcs->compute_reset_late_init(hdev); + if (rc) { + if (reset_upon_device_release) + dev_err(hdev->dev, + "Failed late init in reset after device release\n"); + else + dev_err(hdev->dev, "Failed late init after compute reset\n"); + goto out_err; + } + } + + rc = hdev->asic_funcs->scrub_device_mem(hdev); + if (rc) { + dev_err(hdev->dev, "scrub mem failed from device reset (%d)\n", rc); + return rc; + } + + spin_lock(&hdev->reset_info.lock); + hdev->reset_info.in_compute_reset = 0; + + /* Schedule hard reset only if requested and if not already in hard reset. + * We keep 'in_reset' enabled, so no other reset can go in during the hard + * reset schedule + */ + if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags) + schedule_hard_reset = true; + else + hdev->reset_info.in_reset = 0; + + spin_unlock(&hdev->reset_info.lock); + + hdev->reset_info.needs_reset = false; + + if (hard_reset) + dev_info(hdev->dev, + "Successfully finished resetting the %s device\n", + dev_name(&(hdev)->pdev->dev)); + else + dev_dbg(hdev->dev, + "Successfully finished resetting the %s device\n", + dev_name(&(hdev)->pdev->dev)); + + if (hard_reset) { + hdev->reset_info.hard_reset_cnt++; + + /* After reset is done, we are ready to receive events from + * the F/W. We can't do it before because we will ignore events + * and if those events are fatal, we won't know about it and + * the device will be operational although it shouldn't be + */ + hdev->asic_funcs->enable_events_from_fw(hdev); + } else if (!reset_upon_device_release) { + hdev->reset_info.compute_reset_cnt++; + } + + if (schedule_hard_reset) { + dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n"); + flags = hdev->reset_info.hard_reset_schedule_flags; + hdev->reset_info.hard_reset_schedule_flags = 0; + hdev->disabled = true; + hard_reset = true; + handle_reset_trigger(hdev, flags); + goto again; + } + + return 0; + +out_err: + hdev->disabled = true; + + spin_lock(&hdev->reset_info.lock); + hdev->reset_info.in_compute_reset = 0; + + if (hard_reset) { + dev_err(hdev->dev, + "%s Failed to reset! 
Device is NOT usable\n", + dev_name(&(hdev)->pdev->dev)); + hdev->reset_info.hard_reset_cnt++; + } else if (reset_upon_device_release) { + spin_unlock(&hdev->reset_info.lock); + dev_err(hdev->dev, "Failed to reset device after user release\n"); + flags |= HL_DRV_RESET_HARD; + flags &= ~HL_DRV_RESET_DEV_RELEASE; + hard_reset = true; + goto again; + } else { + spin_unlock(&hdev->reset_info.lock); + dev_err(hdev->dev, "Failed to do compute reset\n"); + hdev->reset_info.compute_reset_cnt++; + flags |= HL_DRV_RESET_HARD; + hard_reset = true; + goto again; + } + + hdev->reset_info.in_reset = 0; + + spin_unlock(&hdev->reset_info.lock); + + return rc; +} + +static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask) +{ + mutex_lock(¬ifier_event->lock); + notifier_event->events_mask |= event_mask; + + if (notifier_event->eventfd) + eventfd_signal(notifier_event->eventfd, 1); + + mutex_unlock(¬ifier_event->lock); +} + +/* + * hl_notifier_event_send_all - notify all user processes via eventfd + * + * @hdev: pointer to habanalabs device structure + * @event_mask: the occurred event/s + * Returns 0 for success or an error on failure. + */ +void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask) +{ + struct hl_fpriv *hpriv; + + mutex_lock(&hdev->fpriv_list_lock); + + list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) + hl_notifier_event_send(&hpriv->notifier_event, event_mask); + + mutex_unlock(&hdev->fpriv_list_lock); + + /* control device */ + mutex_lock(&hdev->fpriv_ctrl_list_lock); + + list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node) + hl_notifier_event_send(&hpriv->notifier_event, event_mask); + + mutex_unlock(&hdev->fpriv_ctrl_list_lock); +} + +/* + * hl_device_init - main initialization function for habanalabs device + * + * @hdev: pointer to habanalabs device structure + * + * Allocate an id for the device, do early initialization and then call the + * ASIC specific initialization functions. Finally, create the cdev and the + * Linux device to expose it to the user + */ +int hl_device_init(struct hl_device *hdev, struct class *hclass) +{ + int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt; + char *name; + bool add_cdev_sysfs_on_err = false; + + hdev->cdev_idx = hdev->id / 2; + + name = kasprintf(GFP_KERNEL, "hl%d", hdev->cdev_idx); + if (!name) { + rc = -ENOMEM; + goto out_disabled; + } + + /* Initialize cdev and device structures */ + rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name, + &hdev->cdev, &hdev->dev); + + kfree(name); + + if (rc) + goto out_disabled; + + name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->cdev_idx); + if (!name) { + rc = -ENOMEM; + goto free_dev; + } + + /* Initialize cdev and device structures for control device */ + rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops, + name, &hdev->cdev_ctrl, &hdev->dev_ctrl); + + kfree(name); + + if (rc) + goto free_dev; + + /* Initialize ASIC function pointers and perform early init */ + rc = device_early_init(hdev); + if (rc) + goto free_dev_ctrl; + + user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count + + hdev->asic_prop.user_interrupt_count; + + if (user_interrupt_cnt) { + hdev->user_interrupt = kcalloc(user_interrupt_cnt, sizeof(*hdev->user_interrupt), + GFP_KERNEL); + if (!hdev->user_interrupt) { + rc = -ENOMEM; + goto early_fini; + } + } + + /* + * Start calling ASIC initialization. 
First S/W then H/W and finally + * late init + */ + rc = hdev->asic_funcs->sw_init(hdev); + if (rc) + goto free_usr_intr_mem; + + + /* initialize completion structure for multi CS wait */ + hl_multi_cs_completion_init(hdev); + + /* + * Initialize the H/W queues. Must be done before hw_init, because + * there the addresses of the kernel queue are being written to the + * registers of the device + */ + rc = hl_hw_queues_create(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize kernel queues\n"); + goto sw_fini; + } + + cq_cnt = hdev->asic_prop.completion_queues_count; + + /* + * Initialize the completion queues. Must be done before hw_init, + * because there the addresses of the completion queues are being + * passed as arguments to request_irq + */ + if (cq_cnt) { + hdev->completion_queue = kcalloc(cq_cnt, + sizeof(*hdev->completion_queue), + GFP_KERNEL); + + if (!hdev->completion_queue) { + dev_err(hdev->dev, + "failed to allocate completion queues\n"); + rc = -ENOMEM; + goto hw_queues_destroy; + } + } + + for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) { + rc = hl_cq_init(hdev, &hdev->completion_queue[i], + hdev->asic_funcs->get_queue_id_for_cq(hdev, i)); + if (rc) { + dev_err(hdev->dev, + "failed to initialize completion queue\n"); + goto cq_fini; + } + hdev->completion_queue[i].cq_idx = i; + } + + hdev->shadow_cs_queue = kcalloc(hdev->asic_prop.max_pending_cs, + sizeof(struct hl_cs *), GFP_KERNEL); + if (!hdev->shadow_cs_queue) { + rc = -ENOMEM; + goto cq_fini; + } + + /* + * Initialize the event queue. Must be done before hw_init, + * because there the address of the event queue is being + * passed as argument to request_irq + */ + rc = hl_eq_init(hdev, &hdev->event_queue); + if (rc) { + dev_err(hdev->dev, "failed to initialize event queue\n"); + goto free_shadow_cs_queue; + } + + /* MMU S/W must be initialized before kernel context is created */ + rc = hl_mmu_init(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n"); + goto eq_fini; + } + + /* Allocate the kernel context */ + hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL); + if (!hdev->kernel_ctx) { + rc = -ENOMEM; + goto mmu_fini; + } + + hdev->is_compute_ctx_active = false; + + hdev->asic_funcs->state_dump_init(hdev); + + hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL; + hl_debugfs_add_device(hdev); + + /* debugfs nodes are created in hl_ctx_init so it must be called after + * hl_debugfs_add_device. + */ + rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); + if (rc) { + dev_err(hdev->dev, "failed to initialize kernel context\n"); + kfree(hdev->kernel_ctx); + goto remove_device_from_debugfs; + } + + rc = hl_cb_pool_init(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize CB pool\n"); + goto release_ctx; + } + + rc = hl_dec_init(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to initialize the decoder module\n"); + goto cb_pool_fini; + } + + /* + * From this point, override rc (=0) in case of an error to allow + * debugging (by adding char devices and create sysfs nodes as part of + * the error flow). 
+ */ + add_cdev_sysfs_on_err = true; + + /* Device is now enabled as part of the initialization requires + * communication with the device firmware to get information that + * is required for the initialization itself + */ + hdev->disabled = false; + + rc = hdev->asic_funcs->hw_init(hdev); + if (rc) { + dev_err(hdev->dev, "failed to initialize the H/W\n"); + rc = 0; + goto out_disabled; + } + + /* Check that the communication with the device is working */ + rc = hdev->asic_funcs->test_queues(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to detect if device is alive\n"); + rc = 0; + goto out_disabled; + } + + rc = device_late_init(hdev); + if (rc) { + dev_err(hdev->dev, "Failed late initialization\n"); + rc = 0; + goto out_disabled; + } + + dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n", + hdev->asic_name, + hdev->asic_prop.dram_size / SZ_1G); + + rc = hl_vm_init(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to initialize memory module\n"); + rc = 0; + goto out_disabled; + } + + /* + * Expose devices and sysfs nodes to user. + * From here there is no need to add char devices and create sysfs nodes + * in case of an error. + */ + add_cdev_sysfs_on_err = false; + rc = device_cdev_sysfs_add(hdev); + if (rc) { + dev_err(hdev->dev, + "Failed to add char devices and sysfs nodes\n"); + rc = 0; + goto out_disabled; + } + + /* Need to call this again because the max power might change, + * depending on card type for certain ASICs + */ + if (hdev->asic_prop.set_max_power_on_device_init && + !hdev->asic_prop.fw_security_enabled) + hl_fw_set_max_power(hdev); + + /* + * hl_hwmon_init() must be called after device_late_init(), because only + * there we get the information from the device about which + * hwmon-related sensors the device supports. + * Furthermore, it must be done after adding the device to the system. + */ + rc = hl_hwmon_init(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to initialize hwmon\n"); + rc = 0; + goto out_disabled; + } + + dev_notice(hdev->dev, + "Successfully added device %s to habanalabs driver\n", + dev_name(&(hdev)->pdev->dev)); + + hdev->init_done = true; + + /* After initialization is done, we are ready to receive events from + * the F/W. We can't do it before because we will ignore events and if + * those events are fatal, we won't know about it and the device will + * be operational although it shouldn't be + */ + hdev->asic_funcs->enable_events_from_fw(hdev); + + return 0; + +cb_pool_fini: + hl_cb_pool_fini(hdev); +release_ctx: + if (hl_ctx_put(hdev->kernel_ctx) != 1) + dev_err(hdev->dev, + "kernel ctx is still alive on initialization failure\n"); +remove_device_from_debugfs: + hl_debugfs_remove_device(hdev); +mmu_fini: + hl_mmu_fini(hdev); +eq_fini: + hl_eq_fini(hdev, &hdev->event_queue); +free_shadow_cs_queue: + kfree(hdev->shadow_cs_queue); +cq_fini: + for (i = 0 ; i < cq_ready_cnt ; i++) + hl_cq_fini(hdev, &hdev->completion_queue[i]); + kfree(hdev->completion_queue); +hw_queues_destroy: + hl_hw_queues_destroy(hdev); +sw_fini: + hdev->asic_funcs->sw_fini(hdev); +free_usr_intr_mem: + kfree(hdev->user_interrupt); +early_fini: + device_early_fini(hdev); +free_dev_ctrl: + put_device(hdev->dev_ctrl); +free_dev: + put_device(hdev->dev); +out_disabled: + hdev->disabled = true; + if (add_cdev_sysfs_on_err) + device_cdev_sysfs_add(hdev); + if (hdev->pdev) + dev_err(&hdev->pdev->dev, + "Failed to initialize hl%d. Device %s is NOT usable !\n", + hdev->cdev_idx, dev_name(&(hdev)->pdev->dev)); + else + pr_err("Failed to initialize hl%d. 
Device %s is NOT usable !\n", + hdev->cdev_idx, dev_name(&(hdev)->pdev->dev)); + + return rc; +} + +/* + * hl_device_fini - main tear-down function for habanalabs device + * + * @hdev: pointer to habanalabs device structure + * + * Destroy the device, call ASIC fini functions and release the id + */ +void hl_device_fini(struct hl_device *hdev) +{ + bool device_in_reset; + ktime_t timeout; + u64 reset_sec; + int i, rc; + + dev_info(hdev->dev, "Removing device\n"); + + hdev->device_fini_pending = 1; + flush_delayed_work(&hdev->device_reset_work.reset_work); + + if (hdev->pldm) + reset_sec = HL_PLDM_HARD_RESET_MAX_TIMEOUT; + else + reset_sec = HL_HARD_RESET_MAX_TIMEOUT; + + /* + * This function is competing with the reset function, so try to + * take the reset atomic and if we are already in middle of reset, + * wait until reset function is finished. Reset function is designed + * to always finish. However, in Gaudi, because of all the network + * ports, the hard reset could take between 10-30 seconds + */ + + timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000); + + spin_lock(&hdev->reset_info.lock); + device_in_reset = !!hdev->reset_info.in_reset; + if (!device_in_reset) + hdev->reset_info.in_reset = 1; + spin_unlock(&hdev->reset_info.lock); + + while (device_in_reset) { + usleep_range(50, 200); + + spin_lock(&hdev->reset_info.lock); + device_in_reset = !!hdev->reset_info.in_reset; + if (!device_in_reset) + hdev->reset_info.in_reset = 1; + spin_unlock(&hdev->reset_info.lock); + + if (ktime_compare(ktime_get(), timeout) > 0) { + dev_crit(hdev->dev, + "%s Failed to remove device because reset function did not finish\n", + dev_name(&(hdev)->pdev->dev)); + return; + } + } + + /* Disable PCI access from device F/W so it won't send us additional + * interrupts. We disable MSI/MSI-X at the halt_engines function and we + * can't have the F/W sending us interrupts after that. We need to + * disable the access here because if the device is marked disable, the + * message won't be send. Also, in case of heartbeat, the device CPU is + * marked as disable so this message won't be sent + */ + hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0); + + /* Mark device as disabled */ + hdev->disabled = true; + + take_release_locks(hdev); + + hdev->reset_info.hard_reset_pending = true; + + hl_hwmon_fini(hdev); + + cleanup_resources(hdev, true, false, false); + + /* Kill processes here after CS rollback. This is because the process + * can't really exit until all its CSs are done, which is what we + * do in cs rollback + */ + dev_info(hdev->dev, + "Waiting for all processes to exit (timeout of %u seconds)", + HL_PENDING_RESET_LONG_SEC); + + rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC, false); + if (rc) { + dev_crit(hdev->dev, "Failed to kill all open processes\n"); + device_disable_open_processes(hdev, false); + } + + rc = device_kill_open_processes(hdev, 0, true); + if (rc) { + dev_crit(hdev->dev, "Failed to kill all control device open processes\n"); + device_disable_open_processes(hdev, true); + } + + hl_cb_pool_fini(hdev); + + /* Reset the H/W. 
It will be in idle state after this returns */ + hdev->asic_funcs->hw_fini(hdev, true, false); + + hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; + + /* Release kernel context */ + if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1)) + dev_err(hdev->dev, "kernel ctx is still alive\n"); + + hl_debugfs_remove_device(hdev); + + hl_dec_fini(hdev); + + hl_vm_fini(hdev); + + hl_mmu_fini(hdev); + + hl_eq_fini(hdev, &hdev->event_queue); + + kfree(hdev->shadow_cs_queue); + + for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) + hl_cq_fini(hdev, &hdev->completion_queue[i]); + kfree(hdev->completion_queue); + kfree(hdev->user_interrupt); + + hl_hw_queues_destroy(hdev); + + /* Call ASIC S/W finalize function */ + hdev->asic_funcs->sw_fini(hdev); + + device_early_fini(hdev); + + /* Hide devices and sysfs nodes from user */ + device_cdev_sysfs_del(hdev); + + pr_info("removed device successfully\n"); +} + +/* + * MMIO register access helper functions. + */ + +/* + * hl_rreg - Read an MMIO register + * + * @hdev: pointer to habanalabs device structure + * @reg: MMIO register offset (in bytes) + * + * Returns the value of the MMIO register we are asked to read + * + */ +inline u32 hl_rreg(struct hl_device *hdev, u32 reg) +{ + return readl(hdev->rmmio + reg); +} + +/* + * hl_wreg - Write to an MMIO register + * + * @hdev: pointer to habanalabs device structure + * @reg: MMIO register offset (in bytes) + * @val: 32-bit value + * + * Writes the 32-bit value into the MMIO register + * + */ +inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val) +{ + writel(val, hdev->rmmio + reg); +} diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c new file mode 100644 index 000000000..f18e53bbb --- /dev/null +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -0,0 +1,3021 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2022 HabanaLabs, Ltd. + * All Rights Reserved. 
+ */ + +#include "habanalabs.h" +#include "../include/common/hl_boot_if.h" + +#include <linux/firmware.h> +#include <linux/crc32.h> +#include <linux/slab.h> +#include <linux/ctype.h> + +#define FW_FILE_MAX_SIZE 0x1400000 /* maximum size of 20MB */ + +static char *extract_fw_ver_from_str(const char *fw_str) +{ + char *str, *fw_ver, *whitespace; + u32 ver_offset; + + fw_ver = kmalloc(VERSION_MAX_LEN, GFP_KERNEL); + if (!fw_ver) + return NULL; + + str = strnstr(fw_str, "fw-", VERSION_MAX_LEN); + if (!str) + goto free_fw_ver; + + /* Skip the fw- part */ + str += 3; + ver_offset = str - fw_str; + + /* Copy until the next whitespace */ + whitespace = strnstr(str, " ", VERSION_MAX_LEN - ver_offset); + if (!whitespace) + goto free_fw_ver; + + strscpy(fw_ver, str, whitespace - str + 1); + + return fw_ver; + +free_fw_ver: + kfree(fw_ver); + return NULL; +} + +static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver) +{ + char major[8], minor[8], *first_dot, *second_dot; + int rc; + + first_dot = strnstr(preboot_ver, ".", 10); + if (first_dot) { + strscpy(major, preboot_ver, first_dot - preboot_ver + 1); + rc = kstrtou32(major, 10, &hdev->fw_major_version); + } else { + rc = -EINVAL; + } + + if (rc) { + dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc); + goto out; + } + + /* skip the first dot */ + first_dot++; + + second_dot = strnstr(first_dot, ".", 10); + if (second_dot) { + strscpy(minor, first_dot, second_dot - first_dot + 1); + rc = kstrtou32(minor, 10, &hdev->fw_minor_version); + } else { + rc = -EINVAL; + } + + if (rc) + dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc); + +out: + kfree(preboot_ver); + return rc; +} + +static int hl_request_fw(struct hl_device *hdev, + const struct firmware **firmware_p, + const char *fw_name) +{ + size_t fw_size; + int rc; + + rc = request_firmware(firmware_p, fw_name, hdev->dev); + if (rc) { + dev_err(hdev->dev, "Firmware file %s is not found! (error %d)\n", + fw_name, rc); + goto out; + } + + fw_size = (*firmware_p)->size; + if ((fw_size % 4) != 0) { + dev_err(hdev->dev, "Illegal %s firmware size %zu\n", + fw_name, fw_size); + rc = -EINVAL; + goto release_fw; + } + + dev_dbg(hdev->dev, "%s firmware size == %zu\n", fw_name, fw_size); + + if (fw_size > FW_FILE_MAX_SIZE) { + dev_err(hdev->dev, + "FW file size %zu exceeds maximum of %u bytes\n", + fw_size, FW_FILE_MAX_SIZE); + rc = -EINVAL; + goto release_fw; + } + + return 0; + +release_fw: + release_firmware(*firmware_p); +out: + return rc; +} + +/** + * hl_release_firmware() - release FW + * + * @fw: fw descriptor + * + * note: this inline function added to serve as a comprehensive mirror for the + * hl_request_fw function. + */ +static inline void hl_release_firmware(const struct firmware *fw) +{ + release_firmware(fw); +} + +/** + * hl_fw_copy_fw_to_device() - copy FW to device + * + * @hdev: pointer to hl_device structure. 
+ * @fw: fw descriptor + * @dst: IO memory mapped address space to copy firmware to + * @src_offset: offset in src FW to copy from + * @size: amount of bytes to copy (0 to copy the whole binary) + * + * actual copy of FW binary data to device, shared by static and dynamic loaders + */ +static int hl_fw_copy_fw_to_device(struct hl_device *hdev, + const struct firmware *fw, void __iomem *dst, + u32 src_offset, u32 size) +{ + const void *fw_data; + + /* size 0 indicates to copy the whole file */ + if (!size) + size = fw->size; + + if (src_offset + size > fw->size) { + dev_err(hdev->dev, + "size to copy(%u) and offset(%u) are invalid\n", + size, src_offset); + return -EINVAL; + } + + fw_data = (const void *) fw->data; + + memcpy_toio(dst, fw_data + src_offset, size); + return 0; +} + +/** + * hl_fw_copy_msg_to_device() - copy message to device + * + * @hdev: pointer to hl_device structure. + * @msg: message + * @dst: IO memory mapped address space to copy firmware to + * @src_offset: offset in src message to copy from + * @size: amount of bytes to copy (0 to copy the whole binary) + * + * actual copy of message data to device. + */ +static int hl_fw_copy_msg_to_device(struct hl_device *hdev, + struct lkd_msg_comms *msg, void __iomem *dst, + u32 src_offset, u32 size) +{ + void *msg_data; + + /* size 0 indicates to copy the whole file */ + if (!size) + size = sizeof(struct lkd_msg_comms); + + if (src_offset + size > sizeof(struct lkd_msg_comms)) { + dev_err(hdev->dev, + "size to copy(%u) and offset(%u) are invalid\n", + size, src_offset); + return -EINVAL; + } + + msg_data = (void *) msg; + + memcpy_toio(dst, msg_data + src_offset, size); + + return 0; +} + +/** + * hl_fw_load_fw_to_device() - Load F/W code to device's memory. + * + * @hdev: pointer to hl_device structure. + * @fw_name: the firmware image name + * @dst: IO memory mapped address space to copy firmware to + * @src_offset: offset in src FW to copy from + * @size: amount of bytes to copy (0 to copy the whole binary) + * + * Copy fw code from firmware file to device memory. + * + * Return: 0 on success, non-zero for failure. 
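+ *
+ * A minimal usage sketch (the firmware path and @dst below are hypothetical;
+ * passing src_offset = 0 and size = 0 copies the whole binary):
+ *
+ *	rc = hl_fw_load_fw_to_device(hdev, "habanalabs/my_asic/boot-fit.itb",
+ *				     dst, 0, 0);
+ *	if (rc)
+ *		dev_err(hdev->dev, "failed to load boot fit to device\n");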
+ */ +int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name, + void __iomem *dst, u32 src_offset, u32 size) +{ + const struct firmware *fw; + int rc; + + rc = hl_request_fw(hdev, &fw, fw_name); + if (rc) + return rc; + + rc = hl_fw_copy_fw_to_device(hdev, fw, dst, src_offset, size); + + hl_release_firmware(fw); + return rc; +} + +int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode, u64 value) +{ + struct cpucp_packet pkt = {}; + + pkt.ctl = cpu_to_le32(opcode << CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.value = cpu_to_le64(value); + + return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); +} + +int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, + u16 len, u32 timeout, u64 *result) +{ + struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id]; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct cpucp_packet *pkt; + dma_addr_t pkt_dma_addr; + struct hl_bd *sent_bd; + u32 tmp, expected_ack_val, pi, opcode; + int rc; + + pkt = hl_cpu_accessible_dma_pool_alloc(hdev, len, &pkt_dma_addr); + if (!pkt) { + dev_err(hdev->dev, + "Failed to allocate DMA memory for packet to CPU\n"); + return -ENOMEM; + } + + memcpy(pkt, msg, len); + + mutex_lock(&hdev->send_cpu_message_lock); + + /* CPU-CP messages can be sent during soft-reset */ + if (hdev->disabled && !hdev->reset_info.in_compute_reset) { + rc = 0; + goto out; + } + + if (hdev->device_cpu_disabled) { + rc = -EIO; + goto out; + } + + /* set fence to a non valid value */ + pkt->fence = cpu_to_le32(UINT_MAX); + pi = queue->pi; + + /* + * The CPU queue is a synchronous queue with an effective depth of + * a single entry (although it is allocated with room for multiple + * entries). We lock on it using 'send_cpu_message_lock' which + * serializes accesses to the CPU queue. + * Which means that we don't need to lock the access to the entire H/W + * queues module when submitting a JOB to the CPU queue. + */ + hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), len, pkt_dma_addr); + + if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN) + expected_ack_val = queue->pi; + else + expected_ack_val = CPUCP_PACKET_FENCE_VAL; + + rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp, + (tmp == expected_ack_val), 1000, + timeout, true); + + hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id); + + if (rc == -ETIMEDOUT) { + /* If FW performed reset just before sending it a packet, we will get a timeout. + * This is expected behavior, hence no need for error message. 
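+ *
+ * For context, the completion handshake that times out here is (sketch):
+ * the driver writes UINT_MAX to pkt->fence before submitting the BD and
+ * then polls the fence; the device CPU overwrites it with either the
+ * expected PI (when PKT_PI_ACK is enabled) or CPUCP_PACKET_FENCE_VAL:
+ *
+ *	rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
+ *				    (tmp == expected_ack_val), 1000,
+ *				    timeout, true);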
+ */ + if (!hl_device_operational(hdev, NULL) && !hdev->reset_info.in_compute_reset) + dev_dbg(hdev->dev, "Device CPU packet timeout (0x%x) due to FW reset\n", + tmp); + else + dev_err(hdev->dev, "Device CPU packet timeout (0x%x)\n", tmp); + hdev->device_cpu_disabled = true; + goto out; + } + + tmp = le32_to_cpu(pkt->ctl); + + rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT; + if (rc) { + opcode = (tmp & CPUCP_PKT_CTL_OPCODE_MASK) >> CPUCP_PKT_CTL_OPCODE_SHIFT; + + if (!prop->supports_advanced_cpucp_rc) { + dev_dbg(hdev->dev, "F/W ERROR %d for CPU packet %d\n", rc, opcode); + goto scrub_descriptor; + } + + switch (rc) { + case cpucp_packet_invalid: + dev_err(hdev->dev, + "CPU packet %d is not supported by F/W\n", opcode); + break; + case cpucp_packet_fault: + dev_err(hdev->dev, + "F/W failed processing CPU packet %d\n", opcode); + break; + case cpucp_packet_invalid_pkt: + dev_dbg(hdev->dev, + "CPU packet %d is not supported by F/W\n", opcode); + break; + case cpucp_packet_invalid_params: + dev_err(hdev->dev, + "F/W reports invalid parameters for CPU packet %d\n", opcode); + break; + + default: + dev_err(hdev->dev, + "Unknown F/W ERROR %d for CPU packet %d\n", rc, opcode); + } + + /* propagate the return code from the f/w to the callers who want to check it */ + if (result) + *result = rc; + + rc = -EIO; + + } else if (result) { + *result = le64_to_cpu(pkt->result); + } + +scrub_descriptor: + /* Scrub previous buffer descriptor 'ctl' field which contains the + * previous PI value written during packet submission. + * We must do this or else F/W can read an old value upon queue wraparound. + */ + sent_bd = queue->kernel_address; + sent_bd += hl_pi_2_offset(pi); + sent_bd->ctl = cpu_to_le32(UINT_MAX); + +out: + mutex_unlock(&hdev->send_cpu_message_lock); + + hl_cpu_accessible_dma_pool_free(hdev, len, pkt); + + return rc; +} + +int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.value = cpu_to_le64(event_type); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, &result); + + if (rc) + dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type); + + return rc; +} + +int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr, + size_t irq_arr_size) +{ + struct cpucp_unmask_irq_arr_packet *pkt; + size_t total_pkt_size; + u64 result; + int rc; + + total_pkt_size = sizeof(struct cpucp_unmask_irq_arr_packet) + + irq_arr_size; + + /* data should be aligned to 8 bytes in order to CPU-CP to copy it */ + total_pkt_size = (total_pkt_size + 0x7) & ~0x7; + + /* total_pkt_size is casted to u16 later on */ + if (total_pkt_size > USHRT_MAX) { + dev_err(hdev->dev, "too many elements in IRQ array\n"); + return -EINVAL; + } + + pkt = kzalloc(total_pkt_size, GFP_KERNEL); + if (!pkt) + return -ENOMEM; + + pkt->length = cpu_to_le32(irq_arr_size / sizeof(irq_arr[0])); + memcpy(&pkt->irqs, irq_arr, irq_arr_size); + + pkt->cpucp_pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY << + CPUCP_PKT_CTL_OPCODE_SHIFT); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt, + total_pkt_size, 0, &result); + + if (rc) + dev_err(hdev->dev, "failed to unmask IRQ array\n"); + + kfree(pkt); + + return rc; +} + +int hl_fw_test_cpu_queue(struct hl_device *hdev) +{ + struct cpucp_packet test_pkt = {}; + u64 result; + int rc; + + test_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST 
<< + CPUCP_PKT_CTL_OPCODE_SHIFT); + test_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt, + sizeof(test_pkt), 0, &result); + + if (!rc) { + if (result != CPUCP_PACKET_FENCE_VAL) + dev_err(hdev->dev, + "CPU queue test failed (%#08llx)\n", result); + } else { + dev_err(hdev->dev, "CPU queue test failed, error %d\n", rc); + } + + return rc; +} + +void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, + dma_addr_t *dma_handle) +{ + u64 kernel_addr; + + kernel_addr = gen_pool_alloc(hdev->cpu_accessible_dma_pool, size); + + *dma_handle = hdev->cpu_accessible_dma_address + + (kernel_addr - (u64) (uintptr_t) hdev->cpu_accessible_dma_mem); + + return (void *) (uintptr_t) kernel_addr; +} + +void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, + void *vaddr) +{ + gen_pool_free(hdev->cpu_accessible_dma_pool, (u64) (uintptr_t) vaddr, + size); +} + +int hl_fw_send_device_activity(struct hl_device *hdev, bool open) +{ + struct cpucp_packet pkt; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + pkt.ctl = cpu_to_le32(CPUCP_PACKET_ACTIVE_STATUS_SET << CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.value = cpu_to_le64(open); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + if (rc) + dev_err(hdev->dev, "failed to send device activity msg(%u)\n", open); + + return rc; +} + +int hl_fw_send_heartbeat(struct hl_device *hdev) +{ + struct cpucp_packet hb_pkt; + u64 result; + int rc; + + memset(&hb_pkt, 0, sizeof(hb_pkt)); + hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << + CPUCP_PKT_CTL_OPCODE_SHIFT); + hb_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt, + sizeof(hb_pkt), 0, &result); + + if ((rc) || (result != CPUCP_PACKET_FENCE_VAL)) + return -EIO; + + if (le32_to_cpu(hb_pkt.status_mask) & + CPUCP_PKT_HB_STATUS_EQ_FAULT_MASK) { + dev_warn(hdev->dev, "FW reported EQ fault during heartbeat\n"); + rc = -EIO; + } + + return rc; +} + +static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, + u32 sts_val) +{ + bool err_exists = false; + + if (!(err_val & CPU_BOOT_ERR0_ENABLED)) + return false; + + if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) { + dev_err(hdev->dev, + "Device boot error - DRAM initialization failed\n"); + err_exists = true; + } + + if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) { + dev_err(hdev->dev, "Device boot error - FIT image corrupted\n"); + err_exists = true; + } + + if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) { + dev_err(hdev->dev, + "Device boot error - Thermal Sensor initialization failed\n"); + err_exists = true; + } + + if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) { + if (hdev->bmc_enable) { + dev_err(hdev->dev, + "Device boot error - Skipped waiting for BMC\n"); + err_exists = true; + } else { + dev_info(hdev->dev, + "Device boot message - Skipped waiting for BMC\n"); + /* This is an info so we don't want it to disable the + * device + */ + err_val &= ~CPU_BOOT_ERR0_BMC_WAIT_SKIPPED; + } + } + + if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) { + dev_err(hdev->dev, + "Device boot error - Serdes data from BMC not available\n"); + err_exists = true; + } + + if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) { + dev_err(hdev->dev, + "Device boot error - NIC F/W initialization failed\n"); + err_exists = true; + } + + if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) { + dev_err(hdev->dev, + "Device boot warning - security not ready\n"); + err_exists = true; + } + + if (err_val & 
CPU_BOOT_ERR0_SECURITY_FAIL) { + dev_err(hdev->dev, "Device boot error - security failure\n"); + err_exists = true; + } + + if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) { + dev_err(hdev->dev, "Device boot error - eFuse failure\n"); + err_exists = true; + } + + if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) { + dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n"); + err_exists = true; + } + + if (err_val & CPU_BOOT_ERR0_PLL_FAIL) { + dev_err(hdev->dev, "Device boot error - PLL failure\n"); + err_exists = true; + } + + if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) { + /* Ignore this bit, don't prevent driver loading */ + dev_dbg(hdev->dev, "device unusable status is set\n"); + err_val &= ~CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL; + } + + if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) { + dev_err(hdev->dev, "Device boot error - binning failure\n"); + err_exists = true; + } + + if (sts_val & CPU_BOOT_DEV_STS0_ENABLED) + dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val); + + /* All warnings should go here in order not to reach the unknown error validation */ + if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) { + dev_warn(hdev->dev, + "Device boot warning - EEPROM failure detected, default settings applied\n"); + /* This is a warning so we don't want it to disable the + * device + */ + err_val &= ~CPU_BOOT_ERR0_EEPROM_FAIL; + } + + if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) { + dev_warn(hdev->dev, + "Device boot warning - Skipped DRAM initialization\n"); + /* This is a warning so we don't want it to disable the + * device + */ + err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED; + } + + if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) { + dev_warn(hdev->dev, + "Device boot warning - Failed to load preboot primary image\n"); + /* This is a warning so we don't want it to disable the + * device as we have a secondary preboot image + */ + err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL; + } + + if (err_val & CPU_BOOT_ERR0_TPM_FAIL) { + dev_warn(hdev->dev, + "Device boot warning - TPM failure\n"); + /* This is a warning so we don't want it to disable the + * device + */ + err_val &= ~CPU_BOOT_ERR0_TPM_FAIL; + } + + if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) { + dev_err(hdev->dev, + "Device boot error - unknown ERR0 error 0x%08x\n", err_val); + err_exists = true; + } + + /* return error only if it's in the predefined mask */ + if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) & + lower_32_bits(hdev->boot_error_status_mask))) + return true; + + return false; +} + +/* placeholder for ERR1 as no errors defined there yet */ +static bool fw_report_boot_dev1(struct hl_device *hdev, u32 err_val, + u32 sts_val) +{ + /* + * keep this variable to preserve the logic of the function. 
+ * this way it would require less modifications when error will be + * added to DEV_ERR1 + */ + bool err_exists = false; + + if (!(err_val & CPU_BOOT_ERR1_ENABLED)) + return false; + + if (sts_val & CPU_BOOT_DEV_STS1_ENABLED) + dev_dbg(hdev->dev, "Device status1 %#x\n", sts_val); + + if (!err_exists && (err_val & ~CPU_BOOT_ERR1_ENABLED)) { + dev_err(hdev->dev, + "Device boot error - unknown ERR1 error 0x%08x\n", + err_val); + err_exists = true; + } + + /* return error only if it's in the predefined mask */ + if (err_exists && ((err_val & ~CPU_BOOT_ERR1_ENABLED) & + upper_32_bits(hdev->boot_error_status_mask))) + return true; + + return false; +} + +static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg, + u32 boot_err1_reg, u32 cpu_boot_dev_status0_reg, + u32 cpu_boot_dev_status1_reg) +{ + u32 err_val, status_val; + bool err_exists = false; + + /* Some of the firmware status codes are deprecated in newer f/w + * versions. In those versions, the errors are reported + * in different registers. Therefore, we need to check those + * registers and print the exact errors. Moreover, there + * may be multiple errors, so we need to report on each error + * separately. Some of the error codes might indicate a state + * that is not an error per-se, but it is an error in production + * environment + */ + err_val = RREG32(boot_err0_reg); + status_val = RREG32(cpu_boot_dev_status0_reg); + err_exists = fw_report_boot_dev0(hdev, err_val, status_val); + + err_val = RREG32(boot_err1_reg); + status_val = RREG32(cpu_boot_dev_status1_reg); + err_exists |= fw_report_boot_dev1(hdev, err_val, status_val); + + if (err_exists) + return -EIO; + + return 0; +} + +int hl_fw_cpucp_info_get(struct hl_device *hdev, + u32 sts_boot_dev_sts0_reg, + u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg, + u32 boot_err1_reg) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct cpucp_packet pkt = {}; + dma_addr_t cpucp_info_dma_addr; + void *cpucp_info_cpu_addr; + char *kernel_ver; + u64 result; + int rc; + + cpucp_info_cpu_addr = hl_cpu_accessible_dma_pool_alloc(hdev, sizeof(struct cpucp_info), + &cpucp_info_dma_addr); + if (!cpucp_info_cpu_addr) { + dev_err(hdev->dev, + "Failed to allocate DMA memory for CPU-CP info packet\n"); + return -ENOMEM; + } + + memset(cpucp_info_cpu_addr, 0, sizeof(struct cpucp_info)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_INFO_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.addr = cpu_to_le64(cpucp_info_dma_addr); + pkt.data_max_size = cpu_to_le32(sizeof(struct cpucp_info)); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP info pkt, error %d\n", rc); + goto out; + } + + rc = fw_read_errors(hdev, boot_err0_reg, boot_err1_reg, + sts_boot_dev_sts0_reg, sts_boot_dev_sts1_reg); + if (rc) { + dev_err(hdev->dev, "Errors in device boot\n"); + goto out; + } + + memcpy(&prop->cpucp_info, cpucp_info_cpu_addr, + sizeof(prop->cpucp_info)); + + rc = hl_build_hwmon_channel_info(hdev, prop->cpucp_info.sensors); + if (rc) { + dev_err(hdev->dev, + "Failed to build hwmon channel info, error %d\n", rc); + rc = -EFAULT; + goto out; + } + + kernel_ver = extract_fw_ver_from_str(prop->cpucp_info.kernel_version); + if (kernel_ver) { + dev_info(hdev->dev, "Linux version %s", kernel_ver); + kfree(kernel_ver); + } + + /* assume EQ code doesn't need to check eqe index */ + hdev->event_queue.check_eqe_index = false; + + /* Read FW application security bits again */ + if 
(prop->fw_cpu_boot_dev_sts0_valid) { + prop->fw_app_cpu_boot_dev_sts0 = RREG32(sts_boot_dev_sts0_reg); + if (prop->fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_EQ_INDEX_EN) + hdev->event_queue.check_eqe_index = true; + } + + if (prop->fw_cpu_boot_dev_sts1_valid) + prop->fw_app_cpu_boot_dev_sts1 = RREG32(sts_boot_dev_sts1_reg); + +out: + hl_cpu_accessible_dma_pool_free(hdev, sizeof(struct cpucp_info), cpucp_info_cpu_addr); + + return rc; +} + +static int hl_fw_send_msi_info_msg(struct hl_device *hdev) +{ + struct cpucp_array_data_packet *pkt; + size_t total_pkt_size, data_size; + u64 result; + int rc; + + /* skip sending this info for unsupported ASICs */ + if (!hdev->asic_funcs->get_msi_info) + return 0; + + data_size = CPUCP_NUM_OF_MSI_TYPES * sizeof(u32); + total_pkt_size = sizeof(struct cpucp_array_data_packet) + data_size; + + /* data should be aligned to 8 bytes in order to CPU-CP to copy it */ + total_pkt_size = (total_pkt_size + 0x7) & ~0x7; + + /* total_pkt_size is casted to u16 later on */ + if (total_pkt_size > USHRT_MAX) { + dev_err(hdev->dev, "CPUCP array data is too big\n"); + return -EINVAL; + } + + pkt = kzalloc(total_pkt_size, GFP_KERNEL); + if (!pkt) + return -ENOMEM; + + pkt->length = cpu_to_le32(CPUCP_NUM_OF_MSI_TYPES); + + memset((void *) &pkt->data, 0xFF, data_size); + hdev->asic_funcs->get_msi_info(pkt->data); + + pkt->cpucp_pkt.ctl = cpu_to_le32(CPUCP_PACKET_MSI_INFO_SET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *)pkt, + total_pkt_size, 0, &result); + + /* + * in case packet result is invalid it means that FW does not support + * this feature and will use default/hard coded MSI values. no reason + * to stop the boot + */ + if (rc && result == cpucp_packet_invalid) + rc = 0; + + if (rc) + dev_err(hdev->dev, "failed to send CPUCP array data\n"); + + kfree(pkt); + + return rc; +} + +int hl_fw_cpucp_handshake(struct hl_device *hdev, + u32 sts_boot_dev_sts0_reg, + u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg, + u32 boot_err1_reg) +{ + int rc; + + rc = hl_fw_cpucp_info_get(hdev, sts_boot_dev_sts0_reg, + sts_boot_dev_sts1_reg, boot_err0_reg, + boot_err1_reg); + if (rc) + return rc; + + return hl_fw_send_msi_info_msg(hdev); +} + +int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size) +{ + struct cpucp_packet pkt = {}; + void *eeprom_info_cpu_addr; + dma_addr_t eeprom_info_dma_addr; + u64 result; + int rc; + + eeprom_info_cpu_addr = hl_cpu_accessible_dma_pool_alloc(hdev, max_size, + &eeprom_info_dma_addr); + if (!eeprom_info_cpu_addr) { + dev_err(hdev->dev, + "Failed to allocate DMA memory for CPU-CP EEPROM packet\n"); + return -ENOMEM; + } + + memset(eeprom_info_cpu_addr, 0, max_size); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_EEPROM_DATA_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.addr = cpu_to_le64(eeprom_info_dma_addr); + pkt.data_max_size = cpu_to_le32(max_size); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_EEPROM_TIMEOUT_USEC, &result); + + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP EEPROM packet, error %d\n", + rc); + goto out; + } + + /* result contains the actual size */ + memcpy(data, eeprom_info_cpu_addr, min((size_t)result, max_size)); + +out: + hl_cpu_accessible_dma_pool_free(hdev, max_size, eeprom_info_cpu_addr); + + return rc; +} + +int hl_fw_get_monitor_dump(struct hl_device *hdev, void *data) +{ + struct cpucp_monitor_dump *mon_dump_cpu_addr; + dma_addr_t mon_dump_dma_addr; + struct cpucp_packet pkt = {}; + size_t data_size; + __le32 
*src_ptr; + u32 *dst_ptr; + u64 result; + int i, rc; + + data_size = sizeof(struct cpucp_monitor_dump); + mon_dump_cpu_addr = hl_cpu_accessible_dma_pool_alloc(hdev, data_size, &mon_dump_dma_addr); + if (!mon_dump_cpu_addr) { + dev_err(hdev->dev, + "Failed to allocate DMA memory for CPU-CP monitor-dump packet\n"); + return -ENOMEM; + } + + memset(mon_dump_cpu_addr, 0, data_size); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_MONITOR_DUMP_GET << CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.addr = cpu_to_le64(mon_dump_dma_addr); + pkt.data_max_size = cpu_to_le32(data_size); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_MON_DUMP_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc); + goto out; + } + + /* result contains the actual size */ + src_ptr = (__le32 *) mon_dump_cpu_addr; + dst_ptr = data; + for (i = 0; i < (data_size / sizeof(u32)); i++) { + *dst_ptr = le32_to_cpu(*src_ptr); + src_ptr++; + dst_ptr++; + } + +out: + hl_cpu_accessible_dma_pool_free(hdev, data_size, mon_dump_cpu_addr); + + return rc; +} + +int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, + struct hl_info_pci_counters *counters) +{ + struct cpucp_packet pkt = {}; + u64 result; + int rc; + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_THROUGHPUT_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + + /* Fetch PCI rx counter */ + pkt.index = cpu_to_le32(cpucp_pcie_throughput_rx); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); + return rc; + } + counters->rx_throughput = result; + + memset(&pkt, 0, sizeof(pkt)); + pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_THROUGHPUT_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + + /* Fetch PCI tx counter */ + pkt.index = cpu_to_le32(cpucp_pcie_throughput_tx); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); + return rc; + } + counters->tx_throughput = result; + + /* Fetch PCI replay counter */ + memset(&pkt, 0, sizeof(pkt)); + pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_REPLAY_CNT_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); + return rc; + } + counters->replay_cnt = (u32) result; + + return rc; +} + +int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy) +{ + struct cpucp_packet pkt = {}; + u64 result; + int rc; + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_TOTAL_ENERGY_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CpuCP total energy pkt, error %d\n", + rc); + return rc; + } + + *total_energy = result; + + return rc; +} + +int get_used_pll_index(struct hl_device *hdev, u32 input_pll_index, + enum pll_index *pll_index) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u8 pll_byte, pll_bit_off; + bool dynamic_pll; + int fw_pll_idx; + + dynamic_pll = !!(prop->fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_DYN_PLL_EN); + + if (!dynamic_pll) { + /* + * in case we are working with legacy FW (each asic has unique + * PLL numbering) use 
the driver based index as they are + * aligned with fw legacy numbering + */ + *pll_index = input_pll_index; + return 0; + } + + /* retrieve a FW compatible PLL index based on + * ASIC specific user request + */ + fw_pll_idx = hdev->asic_funcs->map_pll_idx_to_fw_idx(input_pll_index); + if (fw_pll_idx < 0) { + dev_err(hdev->dev, "Invalid PLL index (%u) error %d\n", + input_pll_index, fw_pll_idx); + return -EINVAL; + } + + /* PLL map is a u8 array */ + pll_byte = prop->cpucp_info.pll_map[fw_pll_idx >> 3]; + pll_bit_off = fw_pll_idx & 0x7; + + if (!(pll_byte & BIT(pll_bit_off))) { + dev_err(hdev->dev, "PLL index %d is not supported\n", + fw_pll_idx); + return -EINVAL; + } + + *pll_index = fw_pll_idx; + + return 0; +} + +int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index, + u16 *pll_freq_arr) +{ + struct cpucp_packet pkt; + enum pll_index used_pll_idx; + u64 result; + int rc; + + rc = get_used_pll_index(hdev, pll_index, &used_pll_idx); + if (rc) + return rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_PLL_INFO_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.pll_type = __cpu_to_le16((u16)used_pll_idx); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc); + return rc; + } + + pll_freq_arr[0] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT0_MASK, result); + pll_freq_arr[1] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT1_MASK, result); + pll_freq_arr[2] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT2_MASK, result); + pll_freq_arr[3] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT3_MASK, result); + + return 0; +} + +int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.type = cpu_to_le16(CPUCP_POWER_INPUT); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, "Failed to read power, error %d\n", rc); + return rc; + } + + *power = result; + + return rc; +} + +int hl_fw_dram_replaced_row_get(struct hl_device *hdev, + struct cpucp_hbm_row_info *info) +{ + struct cpucp_hbm_row_info *cpucp_repl_rows_info_cpu_addr; + dma_addr_t cpucp_repl_rows_info_dma_addr; + struct cpucp_packet pkt = {}; + u64 result; + int rc; + + cpucp_repl_rows_info_cpu_addr = hl_cpu_accessible_dma_pool_alloc(hdev, + sizeof(struct cpucp_hbm_row_info), + &cpucp_repl_rows_info_dma_addr); + if (!cpucp_repl_rows_info_cpu_addr) { + dev_err(hdev->dev, + "Failed to allocate DMA memory for CPU-CP replaced rows info packet\n"); + return -ENOMEM; + } + + memset(cpucp_repl_rows_info_cpu_addr, 0, sizeof(struct cpucp_hbm_row_info)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.addr = cpu_to_le64(cpucp_repl_rows_info_dma_addr); + pkt.data_max_size = cpu_to_le32(sizeof(struct cpucp_hbm_row_info)); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc); + goto out; + } + + memcpy(info, cpucp_repl_rows_info_cpu_addr, sizeof(*info)); + +out: + hl_cpu_accessible_dma_pool_free(hdev, sizeof(struct cpucp_hbm_row_info), + cpucp_repl_rows_info_cpu_addr); + + return rc; +} + +int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 
*pend_rows_num) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_PENDING_ROWS_STATUS << CPUCP_PKT_CTL_OPCODE_SHIFT); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP pending rows info pkt, error %d\n", rc); + goto out; + } + + *pend_rows_num = (u32) result; +out: + return rc; +} + +int hl_fw_cpucp_engine_core_asid_set(struct hl_device *hdev, u32 asid) +{ + struct cpucp_packet pkt; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_ENGINE_CORE_ASID_SET << CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.value = cpu_to_le64(asid); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, NULL); + if (rc) + dev_err(hdev->dev, + "Failed on ASID configuration request for engine core, error %d\n", + rc); + + return rc; +} + +void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev) +{ + struct static_fw_load_mgr *static_loader = + &hdev->fw_loader.static_loader; + int rc; + + if (hdev->asic_prop.dynamic_fw_load) { + rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader, + COMMS_RST_DEV, 0, false, + hdev->fw_loader.cpu_timeout); + if (rc) + dev_warn(hdev->dev, "Failed sending COMMS_RST_DEV\n"); + } else { + WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_RST_DEV); + } +} + +void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev) +{ + struct static_fw_load_mgr *static_loader = + &hdev->fw_loader.static_loader; + int rc; + + if (hdev->device_cpu_is_halted) + return; + + /* Stop device CPU to make sure nothing bad happens */ + if (hdev->asic_prop.dynamic_fw_load) { + rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader, + COMMS_GOTO_WFE, 0, true, + hdev->fw_loader.cpu_timeout); + if (rc) + dev_warn(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); + } else { + WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE); + msleep(static_loader->cpu_reset_wait_msec); + + /* Must clear this register in order to prevent preboot + * from reading WFE after reboot + */ + WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_NA); + } + + hdev->device_cpu_is_halted = true; +} + +static void detect_cpu_boot_status(struct hl_device *hdev, u32 status) +{ + /* Some of the status codes below are deprecated in newer f/w + * versions but we keep them here for backward compatibility + */ + switch (status) { + case CPU_BOOT_STATUS_NA: + dev_err(hdev->dev, + "Device boot progress - BTL/ROM did NOT run\n"); + break; + case CPU_BOOT_STATUS_IN_WFE: + dev_err(hdev->dev, + "Device boot progress - Stuck inside WFE loop\n"); + break; + case CPU_BOOT_STATUS_IN_BTL: + dev_err(hdev->dev, + "Device boot progress - Stuck in BTL\n"); + break; + case CPU_BOOT_STATUS_IN_PREBOOT: + dev_err(hdev->dev, + "Device boot progress - Stuck in Preboot\n"); + break; + case CPU_BOOT_STATUS_IN_SPL: + dev_err(hdev->dev, + "Device boot progress - Stuck in SPL\n"); + break; + case CPU_BOOT_STATUS_IN_UBOOT: + dev_err(hdev->dev, + "Device boot progress - Stuck in u-boot\n"); + break; + case CPU_BOOT_STATUS_DRAM_INIT_FAIL: + dev_err(hdev->dev, + "Device boot progress - DRAM initialization failed\n"); + break; + case CPU_BOOT_STATUS_UBOOT_NOT_READY: + dev_err(hdev->dev, + "Device boot progress - Cannot boot\n"); + break; + case CPU_BOOT_STATUS_TS_INIT_FAIL: + dev_err(hdev->dev, + "Device boot progress - Thermal Sensor initialization failed\n"); + break; + case 
CPU_BOOT_STATUS_SECURITY_READY: + dev_err(hdev->dev, + "Device boot progress - Stuck in preboot after security initialization\n"); + break; + default: + dev_err(hdev->dev, + "Device boot progress - Invalid status code %d\n", + status); + break; + } +} + +static int hl_fw_wait_preboot_ready(struct hl_device *hdev) +{ + struct pre_fw_load_props *pre_fw_load = &hdev->fw_loader.pre_fw_load; + u32 status; + int rc; + + /* Need to check two possible scenarios: + * + * CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT - for newer firmwares where + * the preboot is waiting for the boot fit + * + * All other status values - for older firmwares where the uboot was + * loaded from the FLASH + */ + rc = hl_poll_timeout( + hdev, + pre_fw_load->cpu_boot_status_reg, + status, + (status == CPU_BOOT_STATUS_NIC_FW_RDY) || + (status == CPU_BOOT_STATUS_READY_TO_BOOT) || + (status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT), + hdev->fw_poll_interval_usec, + pre_fw_load->wait_for_preboot_timeout); + + if (rc) { + dev_err(hdev->dev, "CPU boot ready status timeout\n"); + detect_cpu_boot_status(hdev, status); + + /* If we read all FF, then something is totally wrong, no point + * of reading specific errors + */ + if (status != -1) + fw_read_errors(hdev, pre_fw_load->boot_err0_reg, + pre_fw_load->boot_err1_reg, + pre_fw_load->sts_boot_dev_sts0_reg, + pre_fw_load->sts_boot_dev_sts1_reg); + return -EIO; + } + + hdev->fw_loader.fw_comp_loaded |= FW_TYPE_PREBOOT_CPU; + + return 0; +} + +static int hl_fw_read_preboot_caps(struct hl_device *hdev) +{ + struct pre_fw_load_props *pre_fw_load; + struct asic_fixed_properties *prop; + u32 reg_val; + int rc; + + prop = &hdev->asic_prop; + pre_fw_load = &hdev->fw_loader.pre_fw_load; + + rc = hl_fw_wait_preboot_ready(hdev); + if (rc) + return rc; + + /* + * the registers DEV_STS* contain FW capabilities/features. + * We can rely on this registers only if bit CPU_BOOT_DEV_STS*_ENABLED + * is set. + * In the first read of this register we store the value of this + * register ONLY if the register is enabled (which will be propagated + * to next stages) and also mark the register as valid. + * In case it is not enabled the stored value will be left 0- all + * caps/features are off + */ + reg_val = RREG32(pre_fw_load->sts_boot_dev_sts0_reg); + if (reg_val & CPU_BOOT_DEV_STS0_ENABLED) { + prop->fw_cpu_boot_dev_sts0_valid = true; + prop->fw_preboot_cpu_boot_dev_sts0 = reg_val; + } + + reg_val = RREG32(pre_fw_load->sts_boot_dev_sts1_reg); + if (reg_val & CPU_BOOT_DEV_STS1_ENABLED) { + prop->fw_cpu_boot_dev_sts1_valid = true; + prop->fw_preboot_cpu_boot_dev_sts1 = reg_val; + } + + prop->dynamic_fw_load = !!(prop->fw_preboot_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_FW_LD_COM_EN); + + /* initialize FW loader once we know what load protocol is used */ + hdev->asic_funcs->init_firmware_loader(hdev); + + dev_dbg(hdev->dev, "Attempting %s FW load\n", + prop->dynamic_fw_load ? 
"dynamic" : "legacy"); + return 0; +} + +static int hl_fw_static_read_device_fw_version(struct hl_device *hdev, + enum hl_fw_component fwc) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct fw_load_mgr *fw_loader = &hdev->fw_loader; + struct static_fw_load_mgr *static_loader; + char *dest, *boot_ver, *preboot_ver; + u32 ver_off, limit; + const char *name; + char btl_ver[32]; + + static_loader = &hdev->fw_loader.static_loader; + + switch (fwc) { + case FW_COMP_BOOT_FIT: + ver_off = RREG32(static_loader->boot_fit_version_offset_reg); + dest = prop->uboot_ver; + name = "Boot-fit"; + limit = static_loader->boot_fit_version_max_off; + break; + case FW_COMP_PREBOOT: + ver_off = RREG32(static_loader->preboot_version_offset_reg); + dest = prop->preboot_ver; + name = "Preboot"; + limit = static_loader->preboot_version_max_off; + break; + default: + dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc); + return -EIO; + } + + ver_off &= static_loader->sram_offset_mask; + + if (ver_off < limit) { + memcpy_fromio(dest, + hdev->pcie_bar[fw_loader->sram_bar_id] + ver_off, + VERSION_MAX_LEN); + } else { + dev_err(hdev->dev, "%s version offset (0x%x) is above SRAM\n", + name, ver_off); + strscpy(dest, "unavailable", VERSION_MAX_LEN); + return -EIO; + } + + if (fwc == FW_COMP_BOOT_FIT) { + boot_ver = extract_fw_ver_from_str(prop->uboot_ver); + if (boot_ver) { + dev_info(hdev->dev, "boot-fit version %s\n", boot_ver); + kfree(boot_ver); + } + } else if (fwc == FW_COMP_PREBOOT) { + preboot_ver = strnstr(prop->preboot_ver, "Preboot", + VERSION_MAX_LEN); + if (preboot_ver && preboot_ver != prop->preboot_ver) { + strscpy(btl_ver, prop->preboot_ver, + min((int) (preboot_ver - prop->preboot_ver), + 31)); + dev_info(hdev->dev, "%s\n", btl_ver); + } + + preboot_ver = extract_fw_ver_from_str(prop->preboot_ver); + if (preboot_ver) { + dev_info(hdev->dev, "preboot version %s\n", + preboot_ver); + kfree(preboot_ver); + } + } + + return 0; +} + +/** + * hl_fw_preboot_update_state - update internal data structures during + * handshake with preboot + * + * + * @hdev: pointer to the habanalabs device structure + * + * @return 0 on success, otherwise non-zero error code + */ +static void hl_fw_preboot_update_state(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 cpu_boot_dev_sts0, cpu_boot_dev_sts1; + + cpu_boot_dev_sts0 = prop->fw_preboot_cpu_boot_dev_sts0; + cpu_boot_dev_sts1 = prop->fw_preboot_cpu_boot_dev_sts1; + + /* We read boot_dev_sts registers multiple times during boot: + * 1. preboot - a. Check whether the security status bits are valid + * b. Check whether fw security is enabled + * c. Check whether hard reset is done by preboot + * 2. boot cpu - a. Fetch boot cpu security status + * b. Check whether hard reset is done by boot cpu + * 3. FW application - a. Fetch fw application security status + * b. Check whether hard reset is done by fw app + */ + prop->hard_reset_done_by_fw = !!(cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN); + + prop->fw_security_enabled = !!(cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_SECURITY_EN); + + dev_dbg(hdev->dev, "Firmware preboot boot device status0 %#x\n", + cpu_boot_dev_sts0); + + dev_dbg(hdev->dev, "Firmware preboot boot device status1 %#x\n", + cpu_boot_dev_sts1); + + dev_dbg(hdev->dev, "Firmware preboot hard-reset is %s\n", + prop->hard_reset_done_by_fw ? "enabled" : "disabled"); + + dev_dbg(hdev->dev, "firmware-level security is %s\n", + prop->fw_security_enabled ? 
"enabled" : "disabled"); + + dev_dbg(hdev->dev, "GIC controller is %s\n", + prop->gic_interrupts_enable ? "enabled" : "disabled"); +} + +static int hl_fw_static_read_preboot_status(struct hl_device *hdev) +{ + int rc; + + rc = hl_fw_static_read_device_fw_version(hdev, FW_COMP_PREBOOT); + if (rc) + return rc; + + return 0; +} + +int hl_fw_read_preboot_status(struct hl_device *hdev) +{ + int rc; + + if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU)) + return 0; + + /* get FW pre-load parameters */ + hdev->asic_funcs->init_firmware_preload_params(hdev); + + /* + * In order to determine boot method (static VS dynamic) we need to + * read the boot caps register + */ + rc = hl_fw_read_preboot_caps(hdev); + if (rc) + return rc; + + hl_fw_preboot_update_state(hdev); + + /* no need to read preboot status in dynamic load */ + if (hdev->asic_prop.dynamic_fw_load) + return 0; + + return hl_fw_static_read_preboot_status(hdev); +} + +/* associate string with COMM status */ +static char *hl_dynamic_fw_status_str[COMMS_STS_INVLD_LAST] = { + [COMMS_STS_NOOP] = "NOOP", + [COMMS_STS_ACK] = "ACK", + [COMMS_STS_OK] = "OK", + [COMMS_STS_ERR] = "ERR", + [COMMS_STS_VALID_ERR] = "VALID_ERR", + [COMMS_STS_TIMEOUT_ERR] = "TIMEOUT_ERR", +}; + +/** + * hl_fw_dynamic_report_error_status - report error status + * + * @hdev: pointer to the habanalabs device structure + * @status: value of FW status register + * @expected_status: the expected status + */ +static void hl_fw_dynamic_report_error_status(struct hl_device *hdev, + u32 status, + enum comms_sts expected_status) +{ + enum comms_sts comm_status = + FIELD_GET(COMMS_STATUS_STATUS_MASK, status); + + if (comm_status < COMMS_STS_INVLD_LAST) + dev_err(hdev->dev, "Device status %s, expected status: %s\n", + hl_dynamic_fw_status_str[comm_status], + hl_dynamic_fw_status_str[expected_status]); + else + dev_err(hdev->dev, "Device status unknown %d, expected status: %s\n", + comm_status, + hl_dynamic_fw_status_str[expected_status]); +} + +/** + * hl_fw_dynamic_send_cmd - send LKD to FW cmd + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * @cmd: LKD to FW cmd code + * @size: size of next FW component to be loaded (0 if not necessary) + * + * LDK to FW exact command layout is defined at struct comms_command. + * note: the size argument is used only when the next FW component should be + * loaded, otherwise it shall be 0. the size is used by the FW in later + * protocol stages and when sending only indicating the amount of memory + * to be allocated by the FW to receive the next boot component. 
+ */ +static void hl_fw_dynamic_send_cmd(struct hl_device *hdev, + struct fw_load_mgr *fw_loader, + enum comms_cmd cmd, unsigned int size) +{ + struct cpu_dyn_regs *dyn_regs; + u32 val; + + dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs; + + val = FIELD_PREP(COMMS_COMMAND_CMD_MASK, cmd); + val |= FIELD_PREP(COMMS_COMMAND_SIZE_MASK, size); + + WREG32(le32_to_cpu(dyn_regs->kmd_msg_to_cpu), val); +} + +/** + * hl_fw_dynamic_extract_fw_response - update the FW response + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * @response: FW response + * @status: the status read from CPU status register + * + * @return 0 on success, otherwise non-zero error code + */ +static int hl_fw_dynamic_extract_fw_response(struct hl_device *hdev, + struct fw_load_mgr *fw_loader, + struct fw_response *response, + u32 status) +{ + response->status = FIELD_GET(COMMS_STATUS_STATUS_MASK, status); + response->ram_offset = FIELD_GET(COMMS_STATUS_OFFSET_MASK, status) << + COMMS_STATUS_OFFSET_ALIGN_SHIFT; + response->ram_type = FIELD_GET(COMMS_STATUS_RAM_TYPE_MASK, status); + + if ((response->ram_type != COMMS_SRAM) && + (response->ram_type != COMMS_DRAM)) { + dev_err(hdev->dev, "FW status: invalid RAM type %u\n", + response->ram_type); + return -EIO; + } + + return 0; +} + +/** + * hl_fw_dynamic_wait_for_status - wait for status in dynamic FW load + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * @expected_status: expected status to wait for + * @timeout: timeout for status wait + * + * @return 0 on success, otherwise non-zero error code + * + * waiting for status from FW include polling the FW status register until + * expected status is received or timeout occurs (whatever occurs first). + */ +static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev, + struct fw_load_mgr *fw_loader, + enum comms_sts expected_status, + u32 timeout) +{ + struct cpu_dyn_regs *dyn_regs; + u32 status; + int rc; + + dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs; + + /* Wait for expected status */ + rc = hl_poll_timeout( + hdev, + le32_to_cpu(dyn_regs->cpu_cmd_status_to_host), + status, + FIELD_GET(COMMS_STATUS_STATUS_MASK, status) == expected_status, + hdev->fw_comms_poll_interval_usec, + timeout); + + if (rc) { + hl_fw_dynamic_report_error_status(hdev, status, + expected_status); + return -EIO; + } + + /* + * skip storing FW response for NOOP to preserve the actual desired + * FW status + */ + if (expected_status == COMMS_STS_NOOP) + return 0; + + rc = hl_fw_dynamic_extract_fw_response(hdev, fw_loader, + &fw_loader->dynamic_loader.response, + status); + return rc; +} + +/** + * hl_fw_dynamic_send_clear_cmd - send clear command to FW + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * + * @return 0 on success, otherwise non-zero error code + * + * after command cycle between LKD to FW CPU (i.e. LKD got an expected status + * from FW) we need to clear the CPU status register in order to avoid garbage + * between command cycles. 
+ * This is done by sending clear command and polling the CPU to LKD status + * register to hold the status NOOP + */ +static int hl_fw_dynamic_send_clear_cmd(struct hl_device *hdev, + struct fw_load_mgr *fw_loader) +{ + hl_fw_dynamic_send_cmd(hdev, fw_loader, COMMS_CLR_STS, 0); + + return hl_fw_dynamic_wait_for_status(hdev, fw_loader, COMMS_STS_NOOP, + fw_loader->cpu_timeout); +} + +/** + * hl_fw_dynamic_send_protocol_cmd - send LKD to FW cmd and wait for ACK + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * @cmd: LKD to FW cmd code + * @size: size of next FW component to be loaded (0 if not necessary) + * @wait_ok: if true also wait for OK response from FW + * @timeout: timeout for status wait + * + * @return 0 on success, otherwise non-zero error code + * + * brief: + * when sending protocol command we have the following steps: + * - send clear (clear command and verify clear status register) + * - send the actual protocol command + * - wait for ACK on the protocol command + * - send clear + * - send NOOP + * if, in addition, the specific protocol command should wait for OK then: + * - wait for OK + * - send clear + * - send NOOP + * + * NOTES: + * send clear: this is necessary in order to clear the status register to avoid + * leftovers between command + * NOOP command: necessary to avoid loop on the clear command by the FW + */ +int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, + struct fw_load_mgr *fw_loader, + enum comms_cmd cmd, unsigned int size, + bool wait_ok, u32 timeout) +{ + int rc; + + /* first send clear command to clean former commands */ + rc = hl_fw_dynamic_send_clear_cmd(hdev, fw_loader); + + /* send the actual command */ + hl_fw_dynamic_send_cmd(hdev, fw_loader, cmd, size); + + /* wait for ACK for the command */ + rc = hl_fw_dynamic_wait_for_status(hdev, fw_loader, COMMS_STS_ACK, + timeout); + if (rc) + return rc; + + /* clear command to prepare for NOOP command */ + rc = hl_fw_dynamic_send_clear_cmd(hdev, fw_loader); + if (rc) + return rc; + + /* send the actual NOOP command */ + hl_fw_dynamic_send_cmd(hdev, fw_loader, COMMS_NOOP, 0); + + if (!wait_ok) + return 0; + + rc = hl_fw_dynamic_wait_for_status(hdev, fw_loader, COMMS_STS_OK, + timeout); + if (rc) + return rc; + + /* clear command to prepare for NOOP command */ + rc = hl_fw_dynamic_send_clear_cmd(hdev, fw_loader); + if (rc) + return rc; + + /* send the actual NOOP command */ + hl_fw_dynamic_send_cmd(hdev, fw_loader, COMMS_NOOP, 0); + + return 0; +} + +/** + * hl_fw_compat_crc32 - CRC compatible with FW + * + * @data: pointer to the data + * @size: size of the data + * + * @return the CRC32 result + * + * NOTE: kernel's CRC32 differs from standard CRC32 calculation. + * in order to be aligned we need to flip the bits of both the input + * initial CRC and kernel's CRC32 result. 
+ * in addition both sides use initial CRC of 0, + */ +static u32 hl_fw_compat_crc32(u8 *data, size_t size) +{ + return ~crc32_le(~((u32)0), data, size); +} + +/** + * hl_fw_dynamic_validate_memory_bound - validate memory bounds for memory + * transfer (image or descriptor) between + * host and FW + * + * @hdev: pointer to the habanalabs device structure + * @addr: device address of memory transfer + * @size: memory transfer size + * @region: PCI memory region + * + * @return 0 on success, otherwise non-zero error code + */ +static int hl_fw_dynamic_validate_memory_bound(struct hl_device *hdev, + u64 addr, size_t size, + struct pci_mem_region *region) +{ + u64 end_addr; + + /* now make sure that the memory transfer is within region's bounds */ + end_addr = addr + size; + if (end_addr >= region->region_base + region->region_size) { + dev_err(hdev->dev, + "dynamic FW load: memory transfer end address out of memory region bounds. addr: %llx\n", + end_addr); + return -EIO; + } + + /* + * now make sure memory transfer is within predefined BAR bounds. + * this is to make sure we do not need to set the bar (e.g. for DRAM + * memory transfers) + */ + if (end_addr >= region->region_base - region->offset_in_bar + + region->bar_size) { + dev_err(hdev->dev, + "FW image beyond PCI BAR bounds\n"); + return -EIO; + } + + return 0; +} + +/** + * hl_fw_dynamic_validate_descriptor - validate FW descriptor + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * @fw_desc: the descriptor form FW + * + * @return 0 on success, otherwise non-zero error code + */ +static int hl_fw_dynamic_validate_descriptor(struct hl_device *hdev, + struct fw_load_mgr *fw_loader, + struct lkd_fw_comms_desc *fw_desc) +{ + struct pci_mem_region *region; + enum pci_region region_id; + size_t data_size; + u32 data_crc32; + u8 *data_ptr; + u64 addr; + int rc; + + if (le32_to_cpu(fw_desc->header.magic) != HL_COMMS_DESC_MAGIC) + dev_warn(hdev->dev, "Invalid magic for dynamic FW descriptor (%x)\n", + fw_desc->header.magic); + + if (fw_desc->header.version != HL_COMMS_DESC_VER) + dev_warn(hdev->dev, "Invalid version for dynamic FW descriptor (%x)\n", + fw_desc->header.version); + + /* + * Calc CRC32 of data without header. use the size of the descriptor + * reported by firmware, without calculating it ourself, to allow adding + * more fields to the lkd_fw_comms_desc structure. + * note that no alignment/stride address issues here as all structures + * are 64 bit padded. 
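+ *
+ * The layout being checked is therefore (sketch):
+ *
+ *	+--------------------+---------------------------------------+
+ *	| comms_desc_header  | header.size bytes of descriptor data  |
+ *	+--------------------+---------------------------------------+
+ *	                       `-- CRC32 is computed over this part only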
+ */ + data_ptr = (u8 *)fw_desc + sizeof(struct comms_desc_header); + data_size = le16_to_cpu(fw_desc->header.size); + + data_crc32 = hl_fw_compat_crc32(data_ptr, data_size); + if (data_crc32 != le32_to_cpu(fw_desc->header.crc32)) { + dev_err(hdev->dev, "CRC32 mismatch for dynamic FW descriptor (%x:%x)\n", + data_crc32, fw_desc->header.crc32); + return -EIO; + } + + /* find memory region to which to copy the image */ + addr = le64_to_cpu(fw_desc->img_addr); + region_id = hl_get_pci_memory_region(hdev, addr); + if ((region_id != PCI_REGION_SRAM) && ((region_id != PCI_REGION_DRAM))) { + dev_err(hdev->dev, "Invalid region to copy FW image address=%llx\n", addr); + return -EIO; + } + + region = &hdev->pci_mem_region[region_id]; + + /* store the region for the copy stage */ + fw_loader->dynamic_loader.image_region = region; + + /* + * here we know that the start address is valid, now make sure that the + * image is within region's bounds + */ + rc = hl_fw_dynamic_validate_memory_bound(hdev, addr, + fw_loader->dynamic_loader.fw_image_size, + region); + if (rc) { + dev_err(hdev->dev, "invalid mem transfer request for FW image\n"); + return rc; + } + + /* here we can mark the descriptor as valid as the content has been validated */ + fw_loader->dynamic_loader.fw_desc_valid = true; + + return 0; +} + +static int hl_fw_dynamic_validate_response(struct hl_device *hdev, + struct fw_response *response, + struct pci_mem_region *region) +{ + u64 device_addr; + int rc; + + device_addr = region->region_base + response->ram_offset; + + /* + * validate that the descriptor is within region's bounds + * Note that as the start address was supplied according to the RAM + * type- testing only the end address is enough + */ + rc = hl_fw_dynamic_validate_memory_bound(hdev, device_addr, + sizeof(struct lkd_fw_comms_desc), + region); + return rc; +} + +/** + * hl_fw_dynamic_read_and_validate_descriptor - read and validate FW descriptor + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * + * @return 0 on success, otherwise non-zero error code + */ +static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev, + struct fw_load_mgr *fw_loader) +{ + struct lkd_fw_comms_desc *fw_desc; + struct pci_mem_region *region; + struct fw_response *response; + enum pci_region region_id; + void __iomem *src; + int rc; + + fw_desc = &fw_loader->dynamic_loader.comm_desc; + response = &fw_loader->dynamic_loader.response; + + region_id = (response->ram_type == COMMS_SRAM) ? + PCI_REGION_SRAM : PCI_REGION_DRAM; + + region = &hdev->pci_mem_region[region_id]; + + rc = hl_fw_dynamic_validate_response(hdev, response, region); + if (rc) { + dev_err(hdev->dev, + "invalid mem transfer request for FW descriptor\n"); + return rc; + } + + /* + * extract address to copy the descriptor from + * in addition, as the descriptor value is going to be over-ridden by new data- we mark it + * as invalid. 
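+ * error paths check fw_desc_valid before using register addresses taken
+ * from the descriptor;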
+ * it will be marked again as valid once validated + */ + fw_loader->dynamic_loader.fw_desc_valid = false; + src = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + + response->ram_offset; + memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc)); + + return hl_fw_dynamic_validate_descriptor(hdev, fw_loader, fw_desc); +} + +/** + * hl_fw_dynamic_request_descriptor - handshake with CPU to get FW descriptor + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * @next_image_size: size to allocate for next FW component + * + * @return 0 on success, otherwise non-zero error code + */ +static int hl_fw_dynamic_request_descriptor(struct hl_device *hdev, + struct fw_load_mgr *fw_loader, + size_t next_image_size) +{ + int rc; + + rc = hl_fw_dynamic_send_protocol_cmd(hdev, fw_loader, COMMS_PREP_DESC, + next_image_size, true, + fw_loader->cpu_timeout); + if (rc) + return rc; + + return hl_fw_dynamic_read_and_validate_descriptor(hdev, fw_loader); +} + +/** + * hl_fw_dynamic_read_device_fw_version - read FW version to exposed properties + * + * @hdev: pointer to the habanalabs device structure + * @fwc: the firmware component + * @fw_version: fw component's version string + */ +static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev, + enum hl_fw_component fwc, + const char *fw_version) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + char *preboot_ver, *boot_ver; + char btl_ver[32]; + + switch (fwc) { + case FW_COMP_BOOT_FIT: + strscpy(prop->uboot_ver, fw_version, VERSION_MAX_LEN); + boot_ver = extract_fw_ver_from_str(prop->uboot_ver); + if (boot_ver) { + dev_info(hdev->dev, "boot-fit version %s\n", boot_ver); + kfree(boot_ver); + } + + break; + case FW_COMP_PREBOOT: + strscpy(prop->preboot_ver, fw_version, VERSION_MAX_LEN); + preboot_ver = strnstr(prop->preboot_ver, "Preboot", + VERSION_MAX_LEN); + if (preboot_ver && preboot_ver != prop->preboot_ver) { + strscpy(btl_ver, prop->preboot_ver, + min((int) (preboot_ver - prop->preboot_ver), 31)); + dev_info(hdev->dev, "%s\n", btl_ver); + } + + preboot_ver = extract_fw_ver_from_str(prop->preboot_ver); + if (preboot_ver) { + int rc; + + dev_info(hdev->dev, "preboot version %s\n", preboot_ver); + + /* This function takes care of freeing preboot_ver */ + rc = extract_fw_sub_versions(hdev, preboot_ver); + if (rc) + return rc; + } + + break; + default: + dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc); + return -EINVAL; + } + + return 0; +} + +/** + * hl_fw_dynamic_copy_image - copy image to memory allocated by the FW + * + * @hdev: pointer to the habanalabs device structure + * @fw: fw descriptor + * @fw_loader: managing structure for loading device's FW + */ +static int hl_fw_dynamic_copy_image(struct hl_device *hdev, + const struct firmware *fw, + struct fw_load_mgr *fw_loader) +{ + struct lkd_fw_comms_desc *fw_desc; + struct pci_mem_region *region; + void __iomem *dest; + u64 addr; + int rc; + + fw_desc = &fw_loader->dynamic_loader.comm_desc; + addr = le64_to_cpu(fw_desc->img_addr); + + /* find memory region to which to copy the image */ + region = fw_loader->dynamic_loader.image_region; + + dest = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + + (addr - region->region_base); + + rc = hl_fw_copy_fw_to_device(hdev, fw, dest, + fw_loader->boot_fit_img.src_off, + fw_loader->boot_fit_img.copy_size); + + return rc; +} + +/** + * hl_fw_dynamic_copy_msg - copy msg to memory allocated by the FW + * + * @hdev: pointer to the habanalabs 
device structure + * @msg: message + * @fw_loader: managing structure for loading device's FW + */ +static int hl_fw_dynamic_copy_msg(struct hl_device *hdev, + struct lkd_msg_comms *msg, struct fw_load_mgr *fw_loader) +{ + struct lkd_fw_comms_desc *fw_desc; + struct pci_mem_region *region; + void __iomem *dest; + u64 addr; + int rc; + + fw_desc = &fw_loader->dynamic_loader.comm_desc; + addr = le64_to_cpu(fw_desc->img_addr); + + /* find memory region to which to copy the image */ + region = fw_loader->dynamic_loader.image_region; + + dest = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + + (addr - region->region_base); + + rc = hl_fw_copy_msg_to_device(hdev, msg, dest, 0, 0); + + return rc; +} + +/** + * hl_fw_boot_fit_update_state - update internal data structures after boot-fit + * is loaded + * + * @hdev: pointer to the habanalabs device structure + * @cpu_boot_dev_sts0_reg: register holding CPU boot dev status 0 + * @cpu_boot_dev_sts1_reg: register holding CPU boot dev status 1 + * + * @return 0 on success, otherwise non-zero error code + */ +static void hl_fw_boot_fit_update_state(struct hl_device *hdev, + u32 cpu_boot_dev_sts0_reg, + u32 cpu_boot_dev_sts1_reg) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + + hdev->fw_loader.fw_comp_loaded |= FW_TYPE_BOOT_CPU; + + /* Read boot_cpu status bits */ + if (prop->fw_preboot_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_ENABLED) { + prop->fw_bootfit_cpu_boot_dev_sts0 = + RREG32(cpu_boot_dev_sts0_reg); + + prop->hard_reset_done_by_fw = !!(prop->fw_bootfit_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_FW_HARD_RST_EN); + + dev_dbg(hdev->dev, "Firmware boot CPU status0 %#x\n", + prop->fw_bootfit_cpu_boot_dev_sts0); + } + + if (prop->fw_cpu_boot_dev_sts1_valid) { + prop->fw_bootfit_cpu_boot_dev_sts1 = + RREG32(cpu_boot_dev_sts1_reg); + + dev_dbg(hdev->dev, "Firmware boot CPU status1 %#x\n", + prop->fw_bootfit_cpu_boot_dev_sts1); + } + + dev_dbg(hdev->dev, "Firmware boot CPU hard-reset is %s\n", + prop->hard_reset_done_by_fw ? "enabled" : "disabled"); +} + +static void hl_fw_dynamic_update_linux_interrupt_if(struct hl_device *hdev) +{ + struct cpu_dyn_regs *dyn_regs = + &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; + + /* Check whether all 3 interrupt interfaces are set, if not use a + * single interface + */ + if (!hdev->asic_prop.gic_interrupts_enable && + !(hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_MULTI_IRQ_POLL_EN)) { + dyn_regs->gic_host_halt_irq = dyn_regs->gic_host_pi_upd_irq; + dyn_regs->gic_host_ints_irq = dyn_regs->gic_host_pi_upd_irq; + + dev_warn(hdev->dev, + "Using a single interrupt interface towards cpucp"); + } +} +/** + * hl_fw_dynamic_load_image - load FW image using dynamic protocol + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * @load_fwc: the FW component to be loaded + * @img_ld_timeout: image load timeout + * + * @return 0 on success, otherwise non-zero error code + */ +static int hl_fw_dynamic_load_image(struct hl_device *hdev, + struct fw_load_mgr *fw_loader, + enum hl_fw_component load_fwc, + u32 img_ld_timeout) +{ + enum hl_fw_component cur_fwc; + const struct firmware *fw; + char *fw_name; + int rc = 0; + + /* + * when loading image we have one of 2 scenarios: + * 1. current FW component is preboot and we want to load boot-fit + * 2. 
current FW component is boot-fit and we want to load linux + */ + if (load_fwc == FW_COMP_BOOT_FIT) { + cur_fwc = FW_COMP_PREBOOT; + fw_name = fw_loader->boot_fit_img.image_name; + } else { + cur_fwc = FW_COMP_BOOT_FIT; + fw_name = fw_loader->linux_img.image_name; + } + + /* request FW in order to communicate to FW the size to be allocated */ + rc = hl_request_fw(hdev, &fw, fw_name); + if (rc) + return rc; + + /* store the image size for future validation */ + fw_loader->dynamic_loader.fw_image_size = fw->size; + + rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, fw->size); + if (rc) + goto release_fw; + + /* read preboot version */ + rc = hl_fw_dynamic_read_device_fw_version(hdev, cur_fwc, + fw_loader->dynamic_loader.comm_desc.cur_fw_ver); + if (rc) + goto release_fw; + + /* update state according to boot stage */ + if (cur_fwc == FW_COMP_BOOT_FIT) { + struct cpu_dyn_regs *dyn_regs; + + dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs; + hl_fw_boot_fit_update_state(hdev, + le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), + le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); + } + + /* copy boot fit to space allocated by FW */ + rc = hl_fw_dynamic_copy_image(hdev, fw, fw_loader); + if (rc) + goto release_fw; + + rc = hl_fw_dynamic_send_protocol_cmd(hdev, fw_loader, COMMS_DATA_RDY, + 0, true, + fw_loader->cpu_timeout); + if (rc) + goto release_fw; + + rc = hl_fw_dynamic_send_protocol_cmd(hdev, fw_loader, COMMS_EXEC, + 0, false, + img_ld_timeout); + +release_fw: + hl_release_firmware(fw); + return rc; +} + +static int hl_fw_dynamic_wait_for_boot_fit_active(struct hl_device *hdev, + struct fw_load_mgr *fw_loader) +{ + struct dynamic_fw_load_mgr *dyn_loader; + u32 status; + int rc; + + dyn_loader = &fw_loader->dynamic_loader; + + /* + * Make sure CPU boot-loader is running + * Note that the CPU_BOOT_STATUS_SRAM_AVAIL is generally set by Linux + * yet there is a debug scenario in which we loading uboot (without Linux) + * which at later stage is relocated to DRAM. In this case we expect + * uboot to set the CPU_BOOT_STATUS_SRAM_AVAIL and so we add it to the + * poll flags + */ + rc = hl_poll_timeout( + hdev, + le32_to_cpu(dyn_loader->comm_desc.cpu_dyn_regs.cpu_boot_status), + status, + (status == CPU_BOOT_STATUS_READY_TO_BOOT) || + (status == CPU_BOOT_STATUS_SRAM_AVAIL), + hdev->fw_poll_interval_usec, + dyn_loader->wait_for_bl_timeout); + if (rc) { + dev_err(hdev->dev, "failed to wait for boot\n"); + return rc; + } + + dev_dbg(hdev->dev, "uboot status = %d\n", status); + return 0; +} + +static int hl_fw_dynamic_wait_for_linux_active(struct hl_device *hdev, + struct fw_load_mgr *fw_loader) +{ + struct dynamic_fw_load_mgr *dyn_loader; + u32 status; + int rc; + + dyn_loader = &fw_loader->dynamic_loader; + + /* Make sure CPU linux is running */ + + rc = hl_poll_timeout( + hdev, + le32_to_cpu(dyn_loader->comm_desc.cpu_dyn_regs.cpu_boot_status), + status, + (status == CPU_BOOT_STATUS_SRAM_AVAIL), + hdev->fw_poll_interval_usec, + fw_loader->cpu_timeout); + if (rc) { + dev_err(hdev->dev, "failed to wait for Linux\n"); + return rc; + } + + dev_dbg(hdev->dev, "Boot status = %d\n", status); + return 0; +} + +/** + * hl_fw_linux_update_state - update internal data structures after Linux + * is loaded. + * Note: Linux initialization is comprised mainly + * of two stages - loading kernel (SRAM_AVAIL) + * & loading ARMCP. + * Therefore reading boot device status in any of + * these stages might result in different values. 
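+ * In both the static and dynamic flows this is called only after
+ * CPU_BOOT_STATUS_SRAM_AVAIL has been observed.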
+ * + * @hdev: pointer to the habanalabs device structure + * @cpu_boot_dev_sts0_reg: register holding CPU boot dev status 0 + * @cpu_boot_dev_sts1_reg: register holding CPU boot dev status 1 + * + * @return 0 on success, otherwise non-zero error code + */ +static void hl_fw_linux_update_state(struct hl_device *hdev, + u32 cpu_boot_dev_sts0_reg, + u32 cpu_boot_dev_sts1_reg) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + + hdev->fw_loader.fw_comp_loaded |= FW_TYPE_LINUX; + + /* Read FW application security bits */ + if (prop->fw_cpu_boot_dev_sts0_valid) { + prop->fw_app_cpu_boot_dev_sts0 = RREG32(cpu_boot_dev_sts0_reg); + + prop->hard_reset_done_by_fw = !!(prop->fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_FW_HARD_RST_EN); + + if (prop->fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN) + prop->gic_interrupts_enable = false; + + dev_dbg(hdev->dev, + "Firmware application CPU status0 %#x\n", + prop->fw_app_cpu_boot_dev_sts0); + + dev_dbg(hdev->dev, "GIC controller is %s\n", + prop->gic_interrupts_enable ? + "enabled" : "disabled"); + } + + if (prop->fw_cpu_boot_dev_sts1_valid) { + prop->fw_app_cpu_boot_dev_sts1 = RREG32(cpu_boot_dev_sts1_reg); + + dev_dbg(hdev->dev, + "Firmware application CPU status1 %#x\n", + prop->fw_app_cpu_boot_dev_sts1); + } + + dev_dbg(hdev->dev, "Firmware application CPU hard-reset is %s\n", + prop->hard_reset_done_by_fw ? "enabled" : "disabled"); + + dev_info(hdev->dev, "Successfully loaded firmware to device\n"); +} + +/** + * hl_fw_dynamic_send_msg - send a COMMS message with attached data + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * @msg_type: message type + * @data: data to be sent + * + * @return 0 on success, otherwise non-zero error code + */ +static int hl_fw_dynamic_send_msg(struct hl_device *hdev, + struct fw_load_mgr *fw_loader, u8 msg_type, void *data) +{ + struct lkd_msg_comms msg; + int rc; + + memset(&msg, 0, sizeof(msg)); + + /* create message to be sent */ + msg.header.type = msg_type; + msg.header.size = cpu_to_le16(sizeof(struct comms_msg_header)); + msg.header.magic = cpu_to_le32(HL_COMMS_MSG_MAGIC); + + switch (msg_type) { + case HL_COMMS_RESET_CAUSE_TYPE: + msg.reset_cause = *(__u8 *) data; + break; + + default: + dev_err(hdev->dev, + "Send COMMS message - invalid message type %u\n", + msg_type); + return -EINVAL; + } + + rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, + sizeof(struct lkd_msg_comms)); + if (rc) + return rc; + + /* copy message to space allocated by FW */ + rc = hl_fw_dynamic_copy_msg(hdev, &msg, fw_loader); + if (rc) + return rc; + + rc = hl_fw_dynamic_send_protocol_cmd(hdev, fw_loader, COMMS_DATA_RDY, + 0, true, + fw_loader->cpu_timeout); + if (rc) + return rc; + + rc = hl_fw_dynamic_send_protocol_cmd(hdev, fw_loader, COMMS_EXEC, + 0, true, + fw_loader->cpu_timeout); + if (rc) + return rc; + + return 0; +} + +/** + * hl_fw_dynamic_init_cpu - initialize the device CPU using dynamic protocol + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * + * @return 0 on success, otherwise non-zero error code + * + * brief: the dynamic protocol is master (LKD) slave (FW CPU) protocol. + * the communication is done using registers: + * - LKD command register + * - FW status register + * the protocol is race free. this goal is achieved by splitting the requests + * and response to known synchronization points between the LKD and the FW. 
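+ * (commands are issued through hl_fw_dynamic_send_protocol_cmd(), which
+ * brackets each command with clear/NOOP exchanges).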
+ * each response to LKD request is known and bound to a predefined timeout. + * in case of timeout expiration without the desired status from FW- the + * protocol (and hence the boot) will fail. + */ +static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, + struct fw_load_mgr *fw_loader) +{ + struct cpu_dyn_regs *dyn_regs; + int rc; + + dev_info(hdev->dev, + "Loading %sfirmware to device, may take some time...\n", + hdev->asic_prop.fw_security_enabled ? "secured " : ""); + + /* initialize FW descriptor as invalid */ + fw_loader->dynamic_loader.fw_desc_valid = false; + + /* + * In this stage, "cpu_dyn_regs" contains only LKD's hard coded values! + * It will be updated from FW after hl_fw_dynamic_request_descriptor(). + */ + dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs; + + rc = hl_fw_dynamic_send_protocol_cmd(hdev, fw_loader, COMMS_RST_STATE, + 0, true, + fw_loader->cpu_timeout); + if (rc) + goto protocol_err; + + if (hdev->reset_info.curr_reset_cause) { + rc = hl_fw_dynamic_send_msg(hdev, fw_loader, + HL_COMMS_RESET_CAUSE_TYPE, &hdev->reset_info.curr_reset_cause); + if (rc) + goto protocol_err; + + /* Clear current reset cause */ + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + } + + if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) { + rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, 0); + if (rc) + goto protocol_err; + + /* read preboot version */ + return hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT, + fw_loader->dynamic_loader.comm_desc.cur_fw_ver); + } + + /* load boot fit to FW */ + rc = hl_fw_dynamic_load_image(hdev, fw_loader, FW_COMP_BOOT_FIT, + fw_loader->boot_fit_timeout); + if (rc) { + dev_err(hdev->dev, "failed to load boot fit\n"); + goto protocol_err; + } + + /* + * when testing FW load (without Linux) on PLDM we don't want to + * wait until boot fit is active as it may take several hours. + * instead, we load the bootfit and let it do all initialization in + * the background. 
+ */ + if (hdev->pldm && !(hdev->fw_components & FW_TYPE_LINUX)) + return 0; + + rc = hl_fw_dynamic_wait_for_boot_fit_active(hdev, fw_loader); + if (rc) + goto protocol_err; + + /* Enable DRAM scrambling before Linux boot and after successful + * UBoot + */ + hdev->asic_funcs->init_cpu_scrambler_dram(hdev); + + if (!(hdev->fw_components & FW_TYPE_LINUX)) { + dev_info(hdev->dev, "Skip loading Linux F/W\n"); + return 0; + } + + if (fw_loader->skip_bmc) { + rc = hl_fw_dynamic_send_protocol_cmd(hdev, fw_loader, + COMMS_SKIP_BMC, 0, + true, + fw_loader->cpu_timeout); + if (rc) { + dev_err(hdev->dev, "failed to load boot fit\n"); + goto protocol_err; + } + } + + /* load Linux image to FW */ + rc = hl_fw_dynamic_load_image(hdev, fw_loader, FW_COMP_LINUX, + fw_loader->cpu_timeout); + if (rc) { + dev_err(hdev->dev, "failed to load Linux\n"); + goto protocol_err; + } + + rc = hl_fw_dynamic_wait_for_linux_active(hdev, fw_loader); + if (rc) + goto protocol_err; + + hl_fw_linux_update_state(hdev, le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), + le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); + + hl_fw_dynamic_update_linux_interrupt_if(hdev); + + return 0; + +protocol_err: + if (fw_loader->dynamic_loader.fw_desc_valid) + fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0), + le32_to_cpu(dyn_regs->cpu_boot_err1), + le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), + le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); + return rc; +} + +/** + * hl_fw_static_init_cpu - initialize the device CPU using static protocol + * + * @hdev: pointer to the habanalabs device structure + * @fw_loader: managing structure for loading device's FW + * + * @return 0 on success, otherwise non-zero error code + */ +static int hl_fw_static_init_cpu(struct hl_device *hdev, + struct fw_load_mgr *fw_loader) +{ + u32 cpu_msg_status_reg, cpu_timeout, msg_to_cpu_reg, status; + u32 cpu_boot_dev_status0_reg, cpu_boot_dev_status1_reg; + struct static_fw_load_mgr *static_loader; + u32 cpu_boot_status_reg; + int rc; + + if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) + return 0; + + /* init common loader parameters */ + cpu_timeout = fw_loader->cpu_timeout; + + /* init static loader parameters */ + static_loader = &fw_loader->static_loader; + cpu_msg_status_reg = static_loader->cpu_cmd_status_to_host_reg; + msg_to_cpu_reg = static_loader->kmd_msg_to_cpu_reg; + cpu_boot_dev_status0_reg = static_loader->cpu_boot_dev_status0_reg; + cpu_boot_dev_status1_reg = static_loader->cpu_boot_dev_status1_reg; + cpu_boot_status_reg = static_loader->cpu_boot_status_reg; + + dev_info(hdev->dev, "Going to wait for device boot (up to %lds)\n", + cpu_timeout / USEC_PER_SEC); + + /* Wait for boot FIT request */ + rc = hl_poll_timeout( + hdev, + cpu_boot_status_reg, + status, + status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT, + hdev->fw_poll_interval_usec, + fw_loader->boot_fit_timeout); + + if (rc) { + dev_dbg(hdev->dev, + "No boot fit request received, resuming boot\n"); + } else { + rc = hdev->asic_funcs->load_boot_fit_to_device(hdev); + if (rc) + goto out; + + /* Clear device CPU message status */ + WREG32(cpu_msg_status_reg, CPU_MSG_CLR); + + /* Signal device CPU that boot loader is ready */ + WREG32(msg_to_cpu_reg, KMD_MSG_FIT_RDY); + + /* Poll for CPU device ack */ + rc = hl_poll_timeout( + hdev, + cpu_msg_status_reg, + status, + status == CPU_MSG_OK, + hdev->fw_poll_interval_usec, + fw_loader->boot_fit_timeout); + + if (rc) { + dev_err(hdev->dev, + "Timeout waiting for boot fit load ack\n"); + goto out; + } + + /* Clear message */ + WREG32(msg_to_cpu_reg, KMD_MSG_NA); + } + 
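+ /*
+ * The boot fit handshake (if any) is done. Next, wait for the CPU
+ * boot-loader, record its version and status, and optionally proceed
+ * to load the Linux image.
+ */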
+ /* + * Make sure CPU boot-loader is running + * Note that the CPU_BOOT_STATUS_SRAM_AVAIL is generally set by Linux + * yet there is a debug scenario in which we loading uboot (without Linux) + * which at later stage is relocated to DRAM. In this case we expect + * uboot to set the CPU_BOOT_STATUS_SRAM_AVAIL and so we add it to the + * poll flags + */ + rc = hl_poll_timeout( + hdev, + cpu_boot_status_reg, + status, + (status == CPU_BOOT_STATUS_DRAM_RDY) || + (status == CPU_BOOT_STATUS_NIC_FW_RDY) || + (status == CPU_BOOT_STATUS_READY_TO_BOOT) || + (status == CPU_BOOT_STATUS_SRAM_AVAIL), + hdev->fw_poll_interval_usec, + cpu_timeout); + + dev_dbg(hdev->dev, "uboot status = %d\n", status); + + /* Read U-Boot version now in case we will later fail */ + hl_fw_static_read_device_fw_version(hdev, FW_COMP_BOOT_FIT); + + /* update state according to boot stage */ + hl_fw_boot_fit_update_state(hdev, cpu_boot_dev_status0_reg, + cpu_boot_dev_status1_reg); + + if (rc) { + detect_cpu_boot_status(hdev, status); + rc = -EIO; + goto out; + } + + /* Enable DRAM scrambling before Linux boot and after successful + * UBoot + */ + hdev->asic_funcs->init_cpu_scrambler_dram(hdev); + + if (!(hdev->fw_components & FW_TYPE_LINUX)) { + dev_info(hdev->dev, "Skip loading Linux F/W\n"); + rc = 0; + goto out; + } + + if (status == CPU_BOOT_STATUS_SRAM_AVAIL) { + rc = 0; + goto out; + } + + dev_info(hdev->dev, + "Loading firmware to device, may take some time...\n"); + + rc = hdev->asic_funcs->load_firmware_to_device(hdev); + if (rc) + goto out; + + if (fw_loader->skip_bmc) { + WREG32(msg_to_cpu_reg, KMD_MSG_SKIP_BMC); + + rc = hl_poll_timeout( + hdev, + cpu_boot_status_reg, + status, + (status == CPU_BOOT_STATUS_BMC_WAITING_SKIPPED), + hdev->fw_poll_interval_usec, + cpu_timeout); + + if (rc) { + dev_err(hdev->dev, + "Failed to get ACK on skipping BMC, %d\n", + status); + WREG32(msg_to_cpu_reg, KMD_MSG_NA); + rc = -EIO; + goto out; + } + } + + WREG32(msg_to_cpu_reg, KMD_MSG_FIT_RDY); + + rc = hl_poll_timeout( + hdev, + cpu_boot_status_reg, + status, + (status == CPU_BOOT_STATUS_SRAM_AVAIL), + hdev->fw_poll_interval_usec, + cpu_timeout); + + /* Clear message */ + WREG32(msg_to_cpu_reg, KMD_MSG_NA); + + if (rc) { + if (status == CPU_BOOT_STATUS_FIT_CORRUPTED) + dev_err(hdev->dev, + "Device reports FIT image is corrupted\n"); + else + dev_err(hdev->dev, + "Failed to load firmware to device, %d\n", + status); + + rc = -EIO; + goto out; + } + + rc = fw_read_errors(hdev, fw_loader->static_loader.boot_err0_reg, + fw_loader->static_loader.boot_err1_reg, + cpu_boot_dev_status0_reg, + cpu_boot_dev_status1_reg); + if (rc) + return rc; + + hl_fw_linux_update_state(hdev, cpu_boot_dev_status0_reg, + cpu_boot_dev_status1_reg); + + return 0; + +out: + fw_read_errors(hdev, fw_loader->static_loader.boot_err0_reg, + fw_loader->static_loader.boot_err1_reg, + cpu_boot_dev_status0_reg, + cpu_boot_dev_status1_reg); + + return rc; +} + +/** + * hl_fw_init_cpu - initialize the device CPU + * + * @hdev: pointer to the habanalabs device structure + * + * @return 0 on success, otherwise non-zero error code + * + * perform necessary initializations for device's CPU. takes into account if + * init protocol is static or dynamic. + */ +int hl_fw_init_cpu(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct fw_load_mgr *fw_loader = &hdev->fw_loader; + + return prop->dynamic_fw_load ? 
+ hl_fw_dynamic_init_cpu(hdev, fw_loader) : + hl_fw_static_init_cpu(hdev, fw_loader); +} + +void hl_fw_set_pll_profile(struct hl_device *hdev) +{ + hl_fw_set_frequency(hdev, hdev->asic_prop.clk_pll_index, + hdev->asic_prop.max_freq_value); +} + +int hl_fw_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk) +{ + long value; + + if (!hl_device_operational(hdev, NULL)) + return -ENODEV; + + if (!hdev->pdev) { + *cur_clk = 0; + *max_clk = 0; + return 0; + } + + value = hl_fw_get_frequency(hdev, hdev->asic_prop.clk_pll_index, false); + + if (value < 0) { + dev_err(hdev->dev, "Failed to retrieve device max clock %ld\n", value); + return value; + } + + *max_clk = (value / 1000 / 1000); + + value = hl_fw_get_frequency(hdev, hdev->asic_prop.clk_pll_index, true); + + if (value < 0) { + dev_err(hdev->dev, "Failed to retrieve device current clock %ld\n", value); + return value; + } + + *cur_clk = (value / 1000 / 1000); + + return 0; +} + +long hl_fw_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr) +{ + struct cpucp_packet pkt; + u32 used_pll_idx; + u64 result; + int rc; + + rc = get_used_pll_index(hdev, pll_index, &used_pll_idx); + if (rc) + return rc; + + memset(&pkt, 0, sizeof(pkt)); + + if (curr) + pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_CURR_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + else + pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_GET << CPUCP_PKT_CTL_OPCODE_SHIFT); + + pkt.pll_index = cpu_to_le32((u32)used_pll_idx); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); + + if (rc) { + dev_err(hdev->dev, "Failed to get frequency of PLL %d, error %d\n", + used_pll_idx, rc); + return rc; + } + + return (long) result; +} + +void hl_fw_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq) +{ + struct cpucp_packet pkt; + u32 used_pll_idx; + int rc; + + rc = get_used_pll_index(hdev, pll_index, &used_pll_idx); + if (rc) + return; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_SET << CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.pll_index = cpu_to_le32((u32)used_pll_idx); + pkt.value = cpu_to_le64(freq); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + + if (rc) + dev_err(hdev->dev, "Failed to set frequency to PLL %d, error %d\n", + used_pll_idx, rc); +} + +long hl_fw_get_max_power(struct hl_device *hdev) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_GET << CPUCP_PKT_CTL_OPCODE_SHIFT); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); + + if (rc) { + dev_err(hdev->dev, "Failed to get max power, error %d\n", rc); + return rc; + } + + return result; +} + +void hl_fw_set_max_power(struct hl_device *hdev) +{ + struct cpucp_packet pkt; + int rc; + + /* TODO: remove this after simulator supports this packet */ + if (!hdev->pdev) + return; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_SET << CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.value = cpu_to_le64(hdev->max_power); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + + if (rc) + dev_err(hdev->dev, "Failed to set max power, error %d\n", rc); +} + +static int hl_fw_get_sec_attest_data(struct hl_device *hdev, u32 packet_id, void *data, u32 size, + u32 nonce, u32 timeout) +{ + struct cpucp_packet pkt = {}; + dma_addr_t req_dma_addr; + void *req_cpu_addr; + int rc; + + req_cpu_addr = hl_cpu_accessible_dma_pool_alloc(hdev, size, 
&req_dma_addr); + if (!req_cpu_addr) { + dev_err(hdev->dev, + "Failed to allocate DMA memory for CPU-CP packet %u\n", packet_id); + return -ENOMEM; + } + + memset(data, 0, size); + + pkt.ctl = cpu_to_le32(packet_id << CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.addr = cpu_to_le64(req_dma_addr); + pkt.data_max_size = cpu_to_le32(size); + pkt.nonce = cpu_to_le32(nonce); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + timeout, NULL); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc); + goto out; + } + + memcpy(data, req_cpu_addr, size); + +out: + hl_cpu_accessible_dma_pool_free(hdev, size, req_cpu_addr); + + return rc; +} + +int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_info *sec_attest_info, + u32 nonce) +{ + return hl_fw_get_sec_attest_data(hdev, CPUCP_PACKET_SEC_ATTEST_GET, sec_attest_info, + sizeof(struct cpucp_sec_attest_info), nonce, + HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC); +} diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h new file mode 100644 index 000000000..257b94cec --- /dev/null +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -0,0 +1,3962 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2016-2022 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +#ifndef HABANALABSP_H_ +#define HABANALABSP_H_ + +#include "../include/common/cpucp_if.h" +#include "../include/common/qman_if.h" +#include "../include/hw_ip/mmu/mmu_general.h" +#include <uapi/misc/habanalabs.h> + +#include <linux/cdev.h> +#include <linux/iopoll.h> +#include <linux/irqreturn.h> +#include <linux/dma-direction.h> +#include <linux/scatterlist.h> +#include <linux/hashtable.h> +#include <linux/debugfs.h> +#include <linux/rwsem.h> +#include <linux/eventfd.h> +#include <linux/bitfield.h> +#include <linux/genalloc.h> +#include <linux/sched/signal.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/coresight.h> +#include <linux/dma-buf.h> + +#define HL_NAME "habanalabs" + +struct hl_device; +struct hl_fpriv; + +#define PCI_VENDOR_ID_HABANALABS 0x1da3 + +/* Use upper bits of mmap offset to store habana driver specific information. + * bits[63:59] - Encode mmap type + * bits[45:0] - mmap offset value + * + * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. 
Hence, these + * defines are w.r.t to PAGE_SIZE + */ +#define HL_MMAP_TYPE_SHIFT (59 - PAGE_SHIFT) +#define HL_MMAP_TYPE_MASK (0x1full << HL_MMAP_TYPE_SHIFT) +#define HL_MMAP_TYPE_TS_BUFF (0x10ull << HL_MMAP_TYPE_SHIFT) +#define HL_MMAP_TYPE_BLOCK (0x4ull << HL_MMAP_TYPE_SHIFT) +#define HL_MMAP_TYPE_CB (0x2ull << HL_MMAP_TYPE_SHIFT) + +#define HL_MMAP_OFFSET_VALUE_MASK (0x1FFFFFFFFFFFull >> PAGE_SHIFT) +#define HL_MMAP_OFFSET_VALUE_GET(off) (off & HL_MMAP_OFFSET_VALUE_MASK) + +#define HL_PENDING_RESET_PER_SEC 10 +#define HL_PENDING_RESET_MAX_TRIALS 60 /* 10 minutes */ +#define HL_PENDING_RESET_LONG_SEC 60 + +#define HL_HARD_RESET_MAX_TIMEOUT 120 +#define HL_PLDM_HARD_RESET_MAX_TIMEOUT (HL_HARD_RESET_MAX_TIMEOUT * 3) + +#define HL_DEVICE_TIMEOUT_USEC 1000000 /* 1 s */ + +#define HL_HEARTBEAT_PER_USEC 5000000 /* 5 s */ + +#define HL_PLL_LOW_JOB_FREQ_USEC 5000000 /* 5 s */ + +#define HL_CPUCP_INFO_TIMEOUT_USEC 10000000 /* 10s */ +#define HL_CPUCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */ +#define HL_CPUCP_MON_DUMP_TIMEOUT_USEC 10000000 /* 10s */ +#define HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC 10000000 /* 10s */ + +#define HL_FW_STATUS_POLL_INTERVAL_USEC 10000 /* 10ms */ +#define HL_FW_COMMS_STATUS_PLDM_POLL_INTERVAL_USEC 1000000 /* 1s */ + +#define HL_PCI_ELBI_TIMEOUT_MSEC 10 /* 10ms */ + +#define HL_SIM_MAX_TIMEOUT_US 100000000 /* 100s */ + +#define HL_INVALID_QUEUE UINT_MAX + +#define HL_COMMON_USER_CQ_INTERRUPT_ID 0xFFF +#define HL_COMMON_DEC_INTERRUPT_ID 0xFFE + +#define HL_STATE_DUMP_HIST_LEN 5 + +/* Default value for device reset trigger , an invalid value */ +#define HL_RESET_TRIGGER_DEFAULT 0xFF + +#define OBJ_NAMES_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ +#define SYNC_TO_ENGINE_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ + +/* Memory */ +#define MEM_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ + +/* MMU */ +#define MMU_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ + +/** + * enum hl_mmu_page_table_location - mmu page table location + * @MMU_DR_PGT: page-table is located on device DRAM. + * @MMU_HR_PGT: page-table is located on host memory. + * @MMU_NUM_PGT_LOCATIONS: number of page-table locations currently supported. + */ +enum hl_mmu_page_table_location { + MMU_DR_PGT = 0, /* device-dram-resident MMU PGT */ + MMU_HR_PGT, /* host resident MMU PGT */ + MMU_NUM_PGT_LOCATIONS /* num of PGT locations */ +}; + +/** + * enum hl_mmu_enablement - what mmu modules to enable + * @MMU_EN_NONE: mmu disabled. + * @MMU_EN_ALL: enable all. + * @MMU_EN_PMMU_ONLY: Enable only the PMMU leaving the DMMU disabled. 
+ */ +enum hl_mmu_enablement { + MMU_EN_NONE = 0, + MMU_EN_ALL = 1, + MMU_EN_PMMU_ONLY = 3, /* N/A for Goya/Gaudi */ +}; + +/* + * HL_RSVD_SOBS 'sync stream' reserved sync objects per QMAN stream + * HL_RSVD_MONS 'sync stream' reserved monitors per QMAN stream + */ +#define HL_RSVD_SOBS 2 +#define HL_RSVD_MONS 1 + +/* + * HL_COLLECTIVE_RSVD_MSTR_MONS 'collective' reserved monitors per QMAN stream + */ +#define HL_COLLECTIVE_RSVD_MSTR_MONS 2 + +#define HL_MAX_SOB_VAL (1 << 15) + +#define IS_POWER_OF_2(n) (n != 0 && ((n & (n - 1)) == 0)) +#define IS_MAX_PENDING_CS_VALID(n) (IS_POWER_OF_2(n) && (n > 1)) + +#define HL_PCI_NUM_BARS 6 + +/* Completion queue entry relates to completed job */ +#define HL_COMPLETION_MODE_JOB 0 +/* Completion queue entry relates to completed command submission */ +#define HL_COMPLETION_MODE_CS 1 + +#define HL_MAX_DCORES 8 + +/* DMA alloc/free wrappers */ +#define hl_asic_dma_alloc_coherent(hdev, size, dma_handle, flags) \ + hl_asic_dma_alloc_coherent_caller(hdev, size, dma_handle, flags, __func__) + +#define hl_cpu_accessible_dma_pool_alloc(hdev, size, dma_handle) \ + hl_cpu_accessible_dma_pool_alloc_caller(hdev, size, dma_handle, __func__) + +#define hl_asic_dma_pool_zalloc(hdev, size, mem_flags, dma_handle) \ + hl_asic_dma_pool_zalloc_caller(hdev, size, mem_flags, dma_handle, __func__) + +#define hl_asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle) \ + hl_asic_dma_free_coherent_caller(hdev, size, cpu_addr, dma_handle, __func__) + +#define hl_cpu_accessible_dma_pool_free(hdev, size, vaddr) \ + hl_cpu_accessible_dma_pool_free_caller(hdev, size, vaddr, __func__) + +#define hl_asic_dma_pool_free(hdev, vaddr, dma_addr) \ + hl_asic_dma_pool_free_caller(hdev, vaddr, dma_addr, __func__) + +/* + * Reset Flags + * + * - HL_DRV_RESET_HARD + * If set do hard reset to all engines. If not set reset just + * compute/DMA engines. + * + * - HL_DRV_RESET_FROM_RESET_THR + * Set if the caller is the hard-reset thread + * + * - HL_DRV_RESET_HEARTBEAT + * Set if reset is due to heartbeat + * + * - HL_DRV_RESET_TDR + * Set if reset is due to TDR + * + * - HL_DRV_RESET_DEV_RELEASE + * Set if reset is due to device release + * + * - HL_DRV_RESET_BYPASS_REQ_TO_FW + * F/W will perform the reset. No need to ask it to reset the device. 
This is relevant + * only when running with secured f/w + * + * - HL_DRV_RESET_FW_FATAL_ERR + * Set if reset is due to a fatal error from FW + * + * - HL_DRV_RESET_DELAY + * Set if a delay should be added before the reset + */ + +#define HL_DRV_RESET_HARD (1 << 0) +#define HL_DRV_RESET_FROM_RESET_THR (1 << 1) +#define HL_DRV_RESET_HEARTBEAT (1 << 2) +#define HL_DRV_RESET_TDR (1 << 3) +#define HL_DRV_RESET_DEV_RELEASE (1 << 4) +#define HL_DRV_RESET_BYPASS_REQ_TO_FW (1 << 5) +#define HL_DRV_RESET_FW_FATAL_ERR (1 << 6) +#define HL_DRV_RESET_DELAY (1 << 7) + +/* + * Security + */ + +#define HL_PB_SHARED 1 +#define HL_PB_NA 0 +#define HL_PB_SINGLE_INSTANCE 1 +#define HL_BLOCK_SIZE 0x1000 +#define HL_BLOCK_GLBL_ERR_MASK 0xF40 +#define HL_BLOCK_GLBL_ERR_ADDR 0xF44 +#define HL_BLOCK_GLBL_ERR_CAUSE 0xF48 +#define HL_BLOCK_GLBL_SEC_OFFS 0xF80 +#define HL_BLOCK_GLBL_SEC_SIZE (HL_BLOCK_SIZE - HL_BLOCK_GLBL_SEC_OFFS) +#define HL_BLOCK_GLBL_SEC_LEN (HL_BLOCK_GLBL_SEC_SIZE / sizeof(u32)) +#define UNSET_GLBL_SEC_BIT(array, b) ((array)[((b) / 32)] |= (1 << ((b) % 32))) + +enum hl_protection_levels { + SECURED_LVL, + PRIVILEGED_LVL, + NON_SECURED_LVL +}; + +/** + * struct iterate_module_ctx - HW module iterator + * @fn: function to apply to each HW module instance + * @data: optional internal data to the function iterator + * @rc: return code for optional use of iterator/iterator-caller + */ +struct iterate_module_ctx { + /* + * callback for the HW module iterator + * @hdev: pointer to the habanalabs device structure + * @block: block (ASIC specific definition can be dcore/hdcore) + * @inst: HW module instance within the block + * @offset: current HW module instance offset from the 1-st HW module instance + * in the 1-st block + * @ctx: the iterator context. + */ + void (*fn)(struct hl_device *hdev, int block, int inst, u32 offset, + struct iterate_module_ctx *ctx); + void *data; + int rc; +}; + +struct hl_block_glbl_sec { + u32 sec_array[HL_BLOCK_GLBL_SEC_LEN]; +}; + +#define HL_MAX_SOBS_PER_MONITOR 8 + +/** + * struct hl_gen_wait_properties - properties for generating a wait CB + * @data: command buffer + * @q_idx: queue id is used to extract fence register address + * @size: offset in command buffer + * @sob_base: SOB base to use in this wait CB + * @sob_val: SOB value to wait for + * @mon_id: monitor to use in this wait CB + * @sob_mask: each bit represents a SOB offset from sob_base to be used + */ +struct hl_gen_wait_properties { + void *data; + u32 q_idx; + u32 size; + u16 sob_base; + u16 sob_val; + u16 mon_id; + u8 sob_mask; +}; + +/** + * struct pgt_info - MMU hop page info. + * @node: hash linked-list node for the pgts on host (shadow pgts for device resident MMU and + * actual pgts for host resident MMU). + * @phys_addr: physical address of the pgt. + * @virt_addr: host virtual address of the pgt (see above device/host resident). + * @shadow_addr: shadow hop in the host for device resident MMU. + * @ctx: pointer to the owner ctx. + * @num_of_ptes: indicates how many ptes are used in the pgt. used only for dynamically + * allocated HOPs (all HOPs but HOP0) + * + * The MMU page tables hierarchy can be placed either on the device's DRAM (in which case shadow + * pgts will be stored on host memory) or on host memory (in which case no shadow is required). + * + * When a new level (hop) is needed during mapping this structure will be used to describe + * the newly allocated hop as well as to track number of PTEs in it. 
+ * During unmapping, if no valid PTEs remained in the page of a newly allocated hop, it is + * freed with its pgt_info structure. + */ +struct pgt_info { + struct hlist_node node; + u64 phys_addr; + u64 virt_addr; + u64 shadow_addr; + struct hl_ctx *ctx; + int num_of_ptes; +}; + +/** + * enum hl_pci_match_mode - pci match mode per region + * @PCI_ADDRESS_MATCH_MODE: address match mode + * @PCI_BAR_MATCH_MODE: bar match mode + */ +enum hl_pci_match_mode { + PCI_ADDRESS_MATCH_MODE, + PCI_BAR_MATCH_MODE +}; + +/** + * enum hl_fw_component - F/W components to read version through registers. + * @FW_COMP_BOOT_FIT: boot fit. + * @FW_COMP_PREBOOT: preboot. + * @FW_COMP_LINUX: linux. + */ +enum hl_fw_component { + FW_COMP_BOOT_FIT, + FW_COMP_PREBOOT, + FW_COMP_LINUX, +}; + +/** + * enum hl_fw_types - F/W types present in the system + * @FW_TYPE_NONE: no FW component indication + * @FW_TYPE_LINUX: Linux image for device CPU + * @FW_TYPE_BOOT_CPU: Boot image for device CPU + * @FW_TYPE_PREBOOT_CPU: Indicates pre-loaded CPUs are present in the system + * (preboot, ppboot etc...) + * @FW_TYPE_ALL_TYPES: Mask for all types + */ +enum hl_fw_types { + FW_TYPE_NONE = 0x0, + FW_TYPE_LINUX = 0x1, + FW_TYPE_BOOT_CPU = 0x2, + FW_TYPE_PREBOOT_CPU = 0x4, + FW_TYPE_ALL_TYPES = + (FW_TYPE_LINUX | FW_TYPE_BOOT_CPU | FW_TYPE_PREBOOT_CPU) +}; + +/** + * enum hl_queue_type - Supported QUEUE types. + * @QUEUE_TYPE_NA: queue is not available. + * @QUEUE_TYPE_EXT: external queue which is a DMA channel that may access the + * host. + * @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's + * memories and/or operates the compute engines. + * @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU. + * @QUEUE_TYPE_HW: queue of DMA and compute engines jobs, for which completion + * notifications are sent by H/W. + */ +enum hl_queue_type { + QUEUE_TYPE_NA, + QUEUE_TYPE_EXT, + QUEUE_TYPE_INT, + QUEUE_TYPE_CPU, + QUEUE_TYPE_HW +}; + +enum hl_cs_type { + CS_TYPE_DEFAULT, + CS_TYPE_SIGNAL, + CS_TYPE_WAIT, + CS_TYPE_COLLECTIVE_WAIT, + CS_RESERVE_SIGNALS, + CS_UNRESERVE_SIGNALS, + CS_TYPE_ENGINE_CORE +}; + +/* + * struct hl_inbound_pci_region - inbound region descriptor + * @mode: pci match mode for this region + * @addr: region target address + * @size: region size in bytes + * @offset_in_bar: offset within bar (address match mode) + * @bar: bar id + */ +struct hl_inbound_pci_region { + enum hl_pci_match_mode mode; + u64 addr; + u64 size; + u64 offset_in_bar; + u8 bar; +}; + +/* + * struct hl_outbound_pci_region - outbound region descriptor + * @addr: region target address + * @size: region size in bytes + */ +struct hl_outbound_pci_region { + u64 addr; + u64 size; +}; + +/* + * enum queue_cb_alloc_flags - Indicates queue support for CBs that + * allocated by Kernel or by User + * @CB_ALLOC_KERNEL: support only CBs that allocated by Kernel + * @CB_ALLOC_USER: support only CBs that allocated by User + */ +enum queue_cb_alloc_flags { + CB_ALLOC_KERNEL = 0x1, + CB_ALLOC_USER = 0x2 +}; + +/* + * struct hl_hw_sob - H/W SOB info. + * @hdev: habanalabs device structure. + * @kref: refcount of this SOB. The SOB will reset once the refcount is zero. + * @sob_id: id of this SOB. + * @sob_addr: the sob offset from the base address. + * @q_idx: the H/W queue that uses this SOB. + * @need_reset: reset indication set when switching to the other sob. 
+ */ +struct hl_hw_sob { + struct hl_device *hdev; + struct kref kref; + u32 sob_id; + u32 sob_addr; + u32 q_idx; + bool need_reset; +}; + +enum hl_collective_mode { + HL_COLLECTIVE_NOT_SUPPORTED = 0x0, + HL_COLLECTIVE_MASTER = 0x1, + HL_COLLECTIVE_SLAVE = 0x2 +}; + +/** + * struct hw_queue_properties - queue information. + * @type: queue type. + * @cb_alloc_flags: bitmap which indicates if the hw queue supports CB + * that allocated by the Kernel driver and therefore, + * a CB handle can be provided for jobs on this queue. + * Otherwise, a CB address must be provided. + * @collective_mode: collective mode of current queue + * @driver_only: true if only the driver is allowed to send a job to this queue, + * false otherwise. + * @binned: True if the queue is binned out and should not be used + * @supports_sync_stream: True if queue supports sync stream + */ +struct hw_queue_properties { + enum hl_queue_type type; + enum queue_cb_alloc_flags cb_alloc_flags; + enum hl_collective_mode collective_mode; + u8 driver_only; + u8 binned; + u8 supports_sync_stream; +}; + +/** + * enum vm_type - virtual memory mapping request information. + * @VM_TYPE_USERPTR: mapping of user memory to device virtual address. + * @VM_TYPE_PHYS_PACK: mapping of DRAM memory to device virtual address. + */ +enum vm_type { + VM_TYPE_USERPTR = 0x1, + VM_TYPE_PHYS_PACK = 0x2 +}; + +/** + * enum mmu_op_flags - mmu operation relevant information. + * @MMU_OP_USERPTR: operation on user memory (host resident). + * @MMU_OP_PHYS_PACK: operation on DRAM (device resident). + * @MMU_OP_CLEAR_MEMCACHE: operation has to clear memcache. + * @MMU_OP_SKIP_LOW_CACHE_INV: operation is allowed to skip parts of cache invalidation. + */ +enum mmu_op_flags { + MMU_OP_USERPTR = 0x1, + MMU_OP_PHYS_PACK = 0x2, + MMU_OP_CLEAR_MEMCACHE = 0x4, + MMU_OP_SKIP_LOW_CACHE_INV = 0x8, +}; + + +/** + * enum hl_device_hw_state - H/W device state. use this to understand whether + * to do reset before hw_init or not + * @HL_DEVICE_HW_STATE_CLEAN: H/W state is clean. i.e. after hard reset + * @HL_DEVICE_HW_STATE_DIRTY: H/W state is dirty. i.e. we started to execute + * hw_init + */ +enum hl_device_hw_state { + HL_DEVICE_HW_STATE_CLEAN = 0, + HL_DEVICE_HW_STATE_DIRTY +}; + +#define HL_MMU_VA_ALIGNMENT_NOT_NEEDED 0 + +/** + * struct hl_mmu_properties - ASIC specific MMU address translation properties. + * @start_addr: virtual start address of the memory region. + * @end_addr: virtual end address of the memory region. + * @hop_shifts: array holds HOPs shifts. + * @hop_masks: array holds HOPs masks. + * @last_mask: mask to get the bit indicating this is the last hop. + * @pgt_size: size for page tables. + * @supported_pages_mask: bitmask for supported page size (relevant only for MMUs + * supporting multiple page size). + * @page_size: default page size used to allocate memory. + * @num_hops: The amount of hops supported by the translation table. + * @hop_table_size: HOP table size. + * @hop0_tables_total_size: total size for all HOP0 tables. + * @host_resident: Should the MMU page table reside in host memory or in the + * device DRAM. + */ +struct hl_mmu_properties { + u64 start_addr; + u64 end_addr; + u64 hop_shifts[MMU_HOP_MAX]; + u64 hop_masks[MMU_HOP_MAX]; + u64 last_mask; + u64 pgt_size; + u64 supported_pages_mask; + u32 page_size; + u32 num_hops; + u32 hop_table_size; + u32 hop0_tables_total_size; + u8 host_resident; +}; + +/** + * struct hl_hints_range - hint addresses reserved va range. + * @start_addr: start address of the va range. 
+ * @end_addr: end address of the va range. + */ +struct hl_hints_range { + u64 start_addr; + u64 end_addr; +}; + +/** + * struct asic_fixed_properties - ASIC specific immutable properties. + * @hw_queues_props: H/W queues properties. + * @cpucp_info: received various information from CPU-CP regarding the H/W, e.g. + * available sensors. + * @uboot_ver: F/W U-boot version. + * @preboot_ver: F/W Preboot version. + * @dmmu: DRAM MMU address translation properties. + * @pmmu: PCI (host) MMU address translation properties. + * @pmmu_huge: PCI (host) MMU address translation properties for memory + * allocated with huge pages. + * @hints_dram_reserved_va_range: dram hint addresses reserved range. + * @hints_host_reserved_va_range: host hint addresses reserved range. + * @hints_host_hpage_reserved_va_range: host huge page hint addresses reserved + * range. + * @sram_base_address: SRAM physical start address. + * @sram_end_address: SRAM physical end address. + * @sram_user_base_address - SRAM physical start address for user access. + * @dram_base_address: DRAM physical start address. + * @dram_end_address: DRAM physical end address. + * @dram_user_base_address: DRAM physical start address for user access. + * @dram_size: DRAM total size. + * @dram_pci_bar_size: size of PCI bar towards DRAM. + * @max_power_default: max power of the device after reset. + * @dc_power_default: power consumed by the device in mode idle. + * @dram_size_for_default_page_mapping: DRAM size needed to map to avoid page + * fault. + * @pcie_dbi_base_address: Base address of the PCIE_DBI block. + * @pcie_aux_dbi_reg_addr: Address of the PCIE_AUX DBI register. + * @mmu_pgt_addr: base physical address in DRAM of MMU page tables. + * @mmu_dram_default_page_addr: DRAM default page physical address. + * @tpc_enabled_mask: which TPCs are enabled. + * @tpc_binning_mask: which TPCs are binned. 0 means usable and 1 means binned. + * @dram_enabled_mask: which DRAMs are enabled. + * @dram_binning_mask: which DRAMs are binned. 0 means usable, 1 means binned. + * @dram_hints_align_mask: dram va hint addresses alignment mask which is used + * for hints validity check. + * @cfg_base_address: config space base address. + * @mmu_cache_mng_addr: address of the MMU cache. + * @mmu_cache_mng_size: size of the MMU cache. + * @device_dma_offset_for_host_access: the offset to add to host DMA addresses + * to enable the device to access them. + * @host_base_address: host physical start address for host DMA from device + * @host_end_address: host physical end address for host DMA from device + * @max_freq_value: current max clk frequency. + * @clk_pll_index: clock PLL index that specify which PLL determines the clock + * we display to the user + * @mmu_pgt_size: MMU page tables total size. + * @mmu_pte_size: PTE size in MMU page tables. + * @mmu_hop_table_size: MMU hop table size. + * @mmu_hop0_tables_total_size: total size of MMU hop0 tables. + * @dram_page_size: page size for MMU DRAM allocation. + * @cfg_size: configuration space size on SRAM. + * @sram_size: total size of SRAM. + * @max_asid: maximum number of open contexts (ASIDs). + * @num_of_events: number of possible internal H/W IRQs. + * @psoc_pci_pll_nr: PCI PLL NR value. + * @psoc_pci_pll_nf: PCI PLL NF value. + * @psoc_pci_pll_od: PCI PLL OD value. + * @psoc_pci_pll_div_factor: PCI PLL DIV FACTOR 1 value. + * @psoc_timestamp_frequency: frequency of the psoc timestamp clock. + * @high_pll: high PLL frequency used by the device. + * @cb_pool_cb_cnt: number of CBs in the CB pool. 
+ * @cb_pool_cb_size: size of each CB in the CB pool. + * @decoder_enabled_mask: which decoders are enabled. + * @decoder_binning_mask: which decoders are binned, 0 means usable and 1 + * means binned (at most one binned decoder per dcore). + * @edma_enabled_mask: which EDMAs are enabled. + * @edma_binning_mask: which EDMAs are binned, 0 means usable and 1 means + * binned (at most one binned DMA). + * @max_pending_cs: maximum of concurrent pending command submissions + * @max_queues: maximum amount of queues in the system + * @fw_preboot_cpu_boot_dev_sts0: bitmap representation of preboot cpu + * capabilities reported by FW, bit description + * can be found in CPU_BOOT_DEV_STS0 + * @fw_preboot_cpu_boot_dev_sts1: bitmap representation of preboot cpu + * capabilities reported by FW, bit description + * can be found in CPU_BOOT_DEV_STS1 + * @fw_bootfit_cpu_boot_dev_sts0: bitmap representation of boot cpu security + * status reported by FW, bit description can be + * found in CPU_BOOT_DEV_STS0 + * @fw_bootfit_cpu_boot_dev_sts1: bitmap representation of boot cpu security + * status reported by FW, bit description can be + * found in CPU_BOOT_DEV_STS1 + * @fw_app_cpu_boot_dev_sts0: bitmap representation of application security + * status reported by FW, bit description can be + * found in CPU_BOOT_DEV_STS0 + * @fw_app_cpu_boot_dev_sts1: bitmap representation of application security + * status reported by FW, bit description can be + * found in CPU_BOOT_DEV_STS1 + * @max_dec: maximum number of decoders + * @hmmu_hif_enabled_mask: mask of HMMUs/HIFs that are not isolated (enabled) + * 1- enabled, 0- isolated. + * @faulty_dram_cluster_map: mask of faulty DRAM cluster. + * 1- faulty cluster, 0- good cluster. + * @xbar_edge_enabled_mask: mask of XBAR_EDGEs that are not isolated (enabled) + * 1- enabled, 0- isolated. + * @device_mem_alloc_default_page_size: may be different than dram_page_size only for ASICs for + * which the property supports_user_set_page_size is true + * (i.e. the DRAM supports multiple page sizes), otherwise + * it will shall be equal to dram_page_size. + * @num_engine_cores: number of engine cpu cores + * @collective_first_sob: first sync object available for collective use + * @collective_first_mon: first monitor available for collective use + * @sync_stream_first_sob: first sync object available for sync stream use + * @sync_stream_first_mon: first monitor available for sync stream use + * @first_available_user_sob: first sob available for the user + * @first_available_user_mon: first monitor available for the user + * @first_available_user_interrupt: first available interrupt reserved for the user + * @first_available_cq: first available CQ for the user. + * @user_interrupt_count: number of user interrupts. + * @user_dec_intr_count: number of decoder interrupts exposed to user. + * @cache_line_size: device cache line size. + * @server_type: Server type that the ASIC is currently installed in. + * The value is according to enum hl_server_type in uapi file. + * @completion_queues_count: number of completion queues. 
+ * @completion_mode: 0 - job based completion, 1 - cs based completion + * @mme_master_slave_mode: 0 - Each MME works independently, 1 - MME works + * in Master/Slave mode + * @fw_security_enabled: true if security measures are enabled in firmware, + * false otherwise + * @fw_cpu_boot_dev_sts0_valid: status bits are valid and can be fetched from + * BOOT_DEV_STS0 + * @fw_cpu_boot_dev_sts1_valid: status bits are valid and can be fetched from + * BOOT_DEV_STS1 + * @dram_supports_virtual_memory: is there an MMU towards the DRAM + * @hard_reset_done_by_fw: true if firmware is handling hard reset flow + * @num_functional_hbms: number of functional HBMs in each DCORE. + * @hints_range_reservation: device support hint addresses range reservation. + * @iatu_done_by_fw: true if iATU configuration is being done by FW. + * @dynamic_fw_load: is dynamic FW load is supported. + * @gic_interrupts_enable: true if FW is not blocking GIC controller, + * false otherwise. + * @use_get_power_for_reset_history: To support backward compatibility for Goya + * and Gaudi + * @supports_compute_reset: is a reset which is not a hard-reset supported by this asic. + * @allow_inference_soft_reset: true if the ASIC supports soft reset that is + * initiated by user or TDR. This is only true + * in inference ASICs, as there is no real-world + * use-case of doing soft-reset in training (due + * to the fact that training runs on multiple + * devices) + * @configurable_stop_on_err: is stop-on-error option configurable via debugfs. + * @set_max_power_on_device_init: true if need to set max power in F/W on device init. + * @supports_user_set_page_size: true if user can set the allocation page size. + * @dma_mask: the dma mask to be set for this device + * @supports_advanced_cpucp_rc: true if new cpucp opcodes are supported. 
+ */ +struct asic_fixed_properties { + struct hw_queue_properties *hw_queues_props; + struct cpucp_info cpucp_info; + char uboot_ver[VERSION_MAX_LEN]; + char preboot_ver[VERSION_MAX_LEN]; + struct hl_mmu_properties dmmu; + struct hl_mmu_properties pmmu; + struct hl_mmu_properties pmmu_huge; + struct hl_hints_range hints_dram_reserved_va_range; + struct hl_hints_range hints_host_reserved_va_range; + struct hl_hints_range hints_host_hpage_reserved_va_range; + u64 sram_base_address; + u64 sram_end_address; + u64 sram_user_base_address; + u64 dram_base_address; + u64 dram_end_address; + u64 dram_user_base_address; + u64 dram_size; + u64 dram_pci_bar_size; + u64 max_power_default; + u64 dc_power_default; + u64 dram_size_for_default_page_mapping; + u64 pcie_dbi_base_address; + u64 pcie_aux_dbi_reg_addr; + u64 mmu_pgt_addr; + u64 mmu_dram_default_page_addr; + u64 tpc_enabled_mask; + u64 tpc_binning_mask; + u64 dram_enabled_mask; + u64 dram_binning_mask; + u64 dram_hints_align_mask; + u64 cfg_base_address; + u64 mmu_cache_mng_addr; + u64 mmu_cache_mng_size; + u64 device_dma_offset_for_host_access; + u64 host_base_address; + u64 host_end_address; + u64 max_freq_value; + u32 clk_pll_index; + u32 mmu_pgt_size; + u32 mmu_pte_size; + u32 mmu_hop_table_size; + u32 mmu_hop0_tables_total_size; + u32 dram_page_size; + u32 cfg_size; + u32 sram_size; + u32 max_asid; + u32 num_of_events; + u32 psoc_pci_pll_nr; + u32 psoc_pci_pll_nf; + u32 psoc_pci_pll_od; + u32 psoc_pci_pll_div_factor; + u32 psoc_timestamp_frequency; + u32 high_pll; + u32 cb_pool_cb_cnt; + u32 cb_pool_cb_size; + u32 decoder_enabled_mask; + u32 decoder_binning_mask; + u32 edma_enabled_mask; + u32 edma_binning_mask; + u32 max_pending_cs; + u32 max_queues; + u32 fw_preboot_cpu_boot_dev_sts0; + u32 fw_preboot_cpu_boot_dev_sts1; + u32 fw_bootfit_cpu_boot_dev_sts0; + u32 fw_bootfit_cpu_boot_dev_sts1; + u32 fw_app_cpu_boot_dev_sts0; + u32 fw_app_cpu_boot_dev_sts1; + u32 max_dec; + u32 hmmu_hif_enabled_mask; + u32 faulty_dram_cluster_map; + u32 xbar_edge_enabled_mask; + u32 device_mem_alloc_default_page_size; + u32 num_engine_cores; + u16 collective_first_sob; + u16 collective_first_mon; + u16 sync_stream_first_sob; + u16 sync_stream_first_mon; + u16 first_available_user_sob[HL_MAX_DCORES]; + u16 first_available_user_mon[HL_MAX_DCORES]; + u16 first_available_user_interrupt; + u16 first_available_cq[HL_MAX_DCORES]; + u16 user_interrupt_count; + u16 user_dec_intr_count; + u16 cache_line_size; + u16 server_type; + u8 completion_queues_count; + u8 completion_mode; + u8 mme_master_slave_mode; + u8 fw_security_enabled; + u8 fw_cpu_boot_dev_sts0_valid; + u8 fw_cpu_boot_dev_sts1_valid; + u8 dram_supports_virtual_memory; + u8 hard_reset_done_by_fw; + u8 num_functional_hbms; + u8 hints_range_reservation; + u8 iatu_done_by_fw; + u8 dynamic_fw_load; + u8 gic_interrupts_enable; + u8 use_get_power_for_reset_history; + u8 supports_compute_reset; + u8 allow_inference_soft_reset; + u8 configurable_stop_on_err; + u8 set_max_power_on_device_init; + u8 supports_user_set_page_size; + u8 dma_mask; + u8 supports_advanced_cpucp_rc; +}; + +/** + * struct hl_fence - software synchronization primitive + * @completion: fence is implemented using completion + * @refcount: refcount for this fence + * @cs_sequence: sequence of the corresponding command submission + * @stream_master_qid_map: streams masters QID bitmap to represent all streams + * masters QIDs that multi cs is waiting on + * @error: mark this fence with error + * @timestamp: timestamp upon completion + * 
@mcs_handling_done: indicates that corresponding command submission has + * finished msc handling, this does not mean it was part + * of the mcs + */ +struct hl_fence { + struct completion completion; + struct kref refcount; + u64 cs_sequence; + u32 stream_master_qid_map; + int error; + ktime_t timestamp; + u8 mcs_handling_done; +}; + +/** + * struct hl_cs_compl - command submission completion object. + * @base_fence: hl fence object. + * @lock: spinlock to protect fence. + * @hdev: habanalabs device structure. + * @hw_sob: the H/W SOB used in this signal/wait CS. + * @encaps_sig_hdl: encaps signals handler. + * @cs_seq: command submission sequence number. + * @type: type of the CS - signal/wait. + * @sob_val: the SOB value that is used in this signal/wait CS. + * @sob_group: the SOB group that is used in this collective wait CS. + * @encaps_signals: indication whether it's a completion object of cs with + * encaps signals or not. + */ +struct hl_cs_compl { + struct hl_fence base_fence; + spinlock_t lock; + struct hl_device *hdev; + struct hl_hw_sob *hw_sob; + struct hl_cs_encaps_sig_handle *encaps_sig_hdl; + u64 cs_seq; + enum hl_cs_type type; + u16 sob_val; + u16 sob_group; + bool encaps_signals; +}; + +/* + * Command Buffers + */ + +/** + * struct hl_ts_buff - describes a timestamp buffer. + * @kernel_buff_address: Holds the internal buffer's kernel virtual address. + * @user_buff_address: Holds the user buffer's kernel virtual address. + * @kernel_buff_size: Holds the internal kernel buffer size. + */ +struct hl_ts_buff { + void *kernel_buff_address; + void *user_buff_address; + u32 kernel_buff_size; +}; + +struct hl_mmap_mem_buf; + +/** + * struct hl_mem_mgr - describes unified memory manager for mappable memory chunks. + * @dev: back pointer to the owning device + * @lock: protects handles + * @handles: an idr holding all active handles to the memory buffers in the system. + */ +struct hl_mem_mgr { + struct device *dev; + spinlock_t lock; + struct idr handles; +}; + +/** + * struct hl_mmap_mem_buf_behavior - describes unified memory manager buffer behavior + * @topic: string identifier used for logging + * @mem_id: memory type identifier, embedded in the handle and used to identify + * the memory type by handle. + * @alloc: callback executed on buffer allocation, shall allocate the memory, + * set it under buffer private, and set mappable size. + * @mmap: callback executed on mmap, must map the buffer to vma + * @release: callback executed on release, must free the resources used by the buffer + */ +struct hl_mmap_mem_buf_behavior { + const char *topic; + u64 mem_id; + + int (*alloc)(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args); + int (*mmap)(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, void *args); + void (*release)(struct hl_mmap_mem_buf *buf); +}; + +/** + * struct hl_mmap_mem_buf - describes a single unified memory buffer + * @behavior: buffer behavior + * @mmg: back pointer to the unified memory manager + * @refcount: reference counter for buffer users + * @private: pointer to buffer behavior private data + * @mmap: atomic boolean indicating whether or not the buffer is mapped right now + * @real_mapped_size: the actual size of buffer mapped, after part of it may be released, + * may change at runtime. + * @mappable_size: the original mappable size of the buffer, does not change after + * the allocation. 
+ * @handle: the buffer id in mmg handles store + */ +struct hl_mmap_mem_buf { + struct hl_mmap_mem_buf_behavior *behavior; + struct hl_mem_mgr *mmg; + struct kref refcount; + void *private; + atomic_t mmap; + u64 real_mapped_size; + u64 mappable_size; + u64 handle; +}; + +/** + * struct hl_cb - describes a Command Buffer. + * @hdev: pointer to device this CB belongs to. + * @ctx: pointer to the CB owner's context. + * @buf: back pointer to the parent mappable memory buffer + * @debugfs_list: node in debugfs list of command buffers. + * @pool_list: node in pool list of command buffers. + * @kernel_address: Holds the CB's kernel virtual address. + * @virtual_addr: Holds the CB's virtual address. + * @bus_address: Holds the CB's DMA address. + * @size: holds the CB's size. + * @roundup_size: holds the cb size after roundup to page size. + * @cs_cnt: holds number of CS that this CB participates in. + * @is_pool: true if CB was acquired from the pool, false otherwise. + * @is_internal: internally allocated + * @is_mmu_mapped: true if the CB is mapped to the device's MMU. + */ +struct hl_cb { + struct hl_device *hdev; + struct hl_ctx *ctx; + struct hl_mmap_mem_buf *buf; + struct list_head debugfs_list; + struct list_head pool_list; + void *kernel_address; + u64 virtual_addr; + dma_addr_t bus_address; + u32 size; + u32 roundup_size; + atomic_t cs_cnt; + u8 is_pool; + u8 is_internal; + u8 is_mmu_mapped; +}; + + +/* + * QUEUES + */ + +struct hl_cs_job; + +/* Queue length of external and HW queues */ +#define HL_QUEUE_LENGTH 4096 +#define HL_QUEUE_SIZE_IN_BYTES (HL_QUEUE_LENGTH * HL_BD_SIZE) + +#if (HL_MAX_JOBS_PER_CS > HL_QUEUE_LENGTH) +#error "HL_QUEUE_LENGTH must be greater than HL_MAX_JOBS_PER_CS" +#endif + +/* HL_CQ_LENGTH is in units of struct hl_cq_entry */ +#define HL_CQ_LENGTH HL_QUEUE_LENGTH +#define HL_CQ_SIZE_IN_BYTES (HL_CQ_LENGTH * HL_CQ_ENTRY_SIZE) + +/* Must be power of 2 */ +#define HL_EQ_LENGTH 64 +#define HL_EQ_SIZE_IN_BYTES (HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE) + +/* Host <-> CPU-CP shared memory size */ +#define HL_CPU_ACCESSIBLE_MEM_SIZE SZ_2M + +/** + * struct hl_sync_stream_properties - + * describes a H/W queue sync stream properties + * @hw_sob: array of the used H/W SOBs by this H/W queue. + * @next_sob_val: the next value to use for the currently used SOB. + * @base_sob_id: the base SOB id of the SOBs used by this queue. + * @base_mon_id: the base MON id of the MONs used by this queue. + * @collective_mstr_mon_id: the MON ids of the MONs used by this master queue + * in order to sync with all slave queues. + * @collective_slave_mon_id: the MON id used by this slave queue in order to + * sync with its master queue. + * @collective_sob_id: current SOB id used by this collective slave queue + * to signal its collective master queue upon completion. + * @curr_sob_offset: the id offset to the currently used SOB from the + * HL_RSVD_SOBS that are being used by this queue. + */ +struct hl_sync_stream_properties { + struct hl_hw_sob hw_sob[HL_RSVD_SOBS]; + u16 next_sob_val; + u16 base_sob_id; + u16 base_mon_id; + u16 collective_mstr_mon_id[HL_COLLECTIVE_RSVD_MSTR_MONS]; + u16 collective_slave_mon_id; + u16 collective_sob_id; + u8 curr_sob_offset; +}; + +/** + * struct hl_encaps_signals_mgr - describes sync stream encapsulated signals + * handlers manager + * @lock: protects handles. + * @handles: an idr to hold all encapsulated signals handles. 
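+ *
+ * A minimal initialization sketch (illustrative; the driver may well use a
+ * dedicated helper for this):
+ *
+ *	spin_lock_init(&mgr->lock);
+ *	idr_init(&mgr->handles);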
+ */ +struct hl_encaps_signals_mgr { + spinlock_t lock; + struct idr handles; +}; + +/** + * struct hl_hw_queue - describes a H/W transport queue. + * @shadow_queue: pointer to a shadow queue that holds pointers to jobs. + * @sync_stream_prop: sync stream queue properties + * @queue_type: type of queue. + * @collective_mode: collective mode of current queue + * @kernel_address: holds the queue's kernel virtual address. + * @bus_address: holds the queue's DMA address. + * @pi: holds the queue's pi value. + * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real ci). + * @hw_queue_id: the id of the H/W queue. + * @cq_id: the id for the corresponding CQ for this H/W queue. + * @msi_vec: the IRQ number of the H/W queue. + * @int_queue_len: length of internal queue (number of entries). + * @valid: is the queue valid (we have array of 32 queues, not all of them + * exist). + * @supports_sync_stream: True if queue supports sync stream + */ +struct hl_hw_queue { + struct hl_cs_job **shadow_queue; + struct hl_sync_stream_properties sync_stream_prop; + enum hl_queue_type queue_type; + enum hl_collective_mode collective_mode; + void *kernel_address; + dma_addr_t bus_address; + u32 pi; + atomic_t ci; + u32 hw_queue_id; + u32 cq_id; + u32 msi_vec; + u16 int_queue_len; + u8 valid; + u8 supports_sync_stream; +}; + +/** + * struct hl_cq - describes a completion queue + * @hdev: pointer to the device structure + * @kernel_address: holds the queue's kernel virtual address + * @bus_address: holds the queue's DMA address + * @cq_idx: completion queue index in array + * @hw_queue_id: the id of the matching H/W queue + * @ci: ci inside the queue + * @pi: pi inside the queue + * @free_slots_cnt: counter of free slots in queue + */ +struct hl_cq { + struct hl_device *hdev; + void *kernel_address; + dma_addr_t bus_address; + u32 cq_idx; + u32 hw_queue_id; + u32 ci; + u32 pi; + atomic_t free_slots_cnt; +}; + +/** + * struct hl_user_interrupt - holds user interrupt information + * @hdev: pointer to the device structure + * @wait_list_head: head to the list of user threads pending on this interrupt + * @wait_list_lock: protects wait_list_head + * @interrupt_id: msix interrupt id + * @is_decoder: whether this entry represents a decoder interrupt + */ +struct hl_user_interrupt { + struct hl_device *hdev; + struct list_head wait_list_head; + spinlock_t wait_list_lock; + u32 interrupt_id; + bool is_decoder; +}; + +/** + * struct timestamp_reg_free_node - holds the timestamp registration free objects node + * @free_objects_node: node in the list free_obj_jobs + * @cq_cb: pointer to cq command buffer to be freed + * @buf: pointer to timestamp buffer to be freed + */ +struct timestamp_reg_free_node { + struct list_head free_objects_node; + struct hl_cb *cq_cb; + struct hl_mmap_mem_buf *buf; +}; + +/* struct timestamp_reg_work_obj - holds the timestamp registration free objects job + * the job will be to pass over the free_obj_jobs list and put refcount to objects + * in each node of the list + * @free_obj: workqueue object to free timestamp registration node objects + * @hdev: pointer to the device structure + * @free_obj_head: list of free jobs nodes (node type timestamp_reg_free_node) + */ +struct timestamp_reg_work_obj { + struct work_struct free_obj; + struct hl_device *hdev; + struct list_head *free_obj_head; +}; + +/* struct timestamp_reg_info - holds the timestamp registration related data. + * @buf: pointer to the timestamp buffer which include both user/kernel buffers. 
+ * relevant only when doing timestamps records registration. + * @cq_cb: pointer to CQ counter CB. + * @timestamp_kernel_addr: timestamp handle address, where to set timestamp + * relevant only when doing timestamps records + * registration. + * @in_use: indicates if the node already in use. relevant only when doing + * timestamps records registration, since in this case the driver + * will have it's own buffer which serve as a records pool instead of + * allocating records dynamically. + */ +struct timestamp_reg_info { + struct hl_mmap_mem_buf *buf; + struct hl_cb *cq_cb; + u64 *timestamp_kernel_addr; + u8 in_use; +}; + +/** + * struct hl_user_pending_interrupt - holds a context to a user thread + * pending on an interrupt + * @ts_reg_info: holds the timestamps registration nodes info + * @wait_list_node: node in the list of user threads pending on an interrupt + * @fence: hl fence object for interrupt completion + * @cq_target_value: CQ target value + * @cq_kernel_addr: CQ kernel address, to be used in the cq interrupt + * handler for target value comparison + */ +struct hl_user_pending_interrupt { + struct timestamp_reg_info ts_reg_info; + struct list_head wait_list_node; + struct hl_fence fence; + u64 cq_target_value; + u64 *cq_kernel_addr; +}; + +/** + * struct hl_eq - describes the event queue (single one per device) + * @hdev: pointer to the device structure + * @kernel_address: holds the queue's kernel virtual address + * @bus_address: holds the queue's DMA address + * @ci: ci inside the queue + * @prev_eqe_index: the index of the previous event queue entry. The index of + * the current entry's index must be +1 of the previous one. + * @check_eqe_index: do we need to check the index of the current entry vs. the + * previous one. This is for backward compatibility with older + * firmwares + */ +struct hl_eq { + struct hl_device *hdev; + void *kernel_address; + dma_addr_t bus_address; + u32 ci; + u32 prev_eqe_index; + bool check_eqe_index; +}; + +/** + * struct hl_dec - describes a decoder sw instance. + * @hdev: pointer to the device structure. + * @completion_abnrm_work: workqueue object to run when decoder generates an error interrupt + * @core_id: ID of the decoder. + * @base_addr: base address of the decoder. + */ +struct hl_dec { + struct hl_device *hdev; + struct work_struct completion_abnrm_work; + u32 core_id; + u32 base_addr; +}; + +/** + * enum hl_asic_type - supported ASIC types. + * @ASIC_INVALID: Invalid ASIC type. + * @ASIC_GOYA: Goya device (HL-1000). + * @ASIC_GAUDI: Gaudi device (HL-2000). + * @ASIC_GAUDI_SEC: Gaudi secured device (HL-2000). + * @ASIC_GAUDI2: Gaudi2 device. + * @ASIC_GAUDI2_SEC: Gaudi2 secured device. + */ +enum hl_asic_type { + ASIC_INVALID, + ASIC_GOYA, + ASIC_GAUDI, + ASIC_GAUDI_SEC, + ASIC_GAUDI2, + ASIC_GAUDI2_SEC, +}; + +struct hl_cs_parser; + +/** + * enum hl_pm_mng_profile - power management profile. + * @PM_AUTO: internal clock is set by the Linux driver. + * @PM_MANUAL: internal clock is set by the user. + * @PM_LAST: last power management type. + */ +enum hl_pm_mng_profile { + PM_AUTO = 1, + PM_MANUAL, + PM_LAST +}; + +/** + * enum hl_pll_frequency - PLL frequency. + * @PLL_HIGH: high frequency. + * @PLL_LOW: low frequency. + * @PLL_LAST: last frequency values that were configured by the user. 
+ */ +enum hl_pll_frequency { + PLL_HIGH = 1, + PLL_LOW, + PLL_LAST +}; + +#define PLL_REF_CLK 50 + +enum div_select_defs { + DIV_SEL_REF_CLK = 0, + DIV_SEL_PLL_CLK = 1, + DIV_SEL_DIVIDED_REF = 2, + DIV_SEL_DIVIDED_PLL = 3, +}; + +enum debugfs_access_type { + DEBUGFS_READ8, + DEBUGFS_WRITE8, + DEBUGFS_READ32, + DEBUGFS_WRITE32, + DEBUGFS_READ64, + DEBUGFS_WRITE64, +}; + +enum pci_region { + PCI_REGION_CFG, + PCI_REGION_SRAM, + PCI_REGION_DRAM, + PCI_REGION_SP_SRAM, + PCI_REGION_NUMBER, +}; + +/** + * struct pci_mem_region - describe memory region in a PCI bar + * @region_base: region base address + * @region_size: region size + * @bar_size: size of the BAR + * @offset_in_bar: region offset into the bar + * @bar_id: bar ID of the region + * @used: if used 1, otherwise 0 + */ +struct pci_mem_region { + u64 region_base; + u64 region_size; + u64 bar_size; + u64 offset_in_bar; + u8 bar_id; + u8 used; +}; + +/** + * struct static_fw_load_mgr - static FW load manager + * @preboot_version_max_off: max offset to preboot version + * @boot_fit_version_max_off: max offset to boot fit version + * @kmd_msg_to_cpu_reg: register address for KDM->CPU messages + * @cpu_cmd_status_to_host_reg: register address for CPU command status response + * @cpu_boot_status_reg: boot status register + * @cpu_boot_dev_status0_reg: boot device status register 0 + * @cpu_boot_dev_status1_reg: boot device status register 1 + * @boot_err0_reg: boot error register 0 + * @boot_err1_reg: boot error register 1 + * @preboot_version_offset_reg: SRAM offset to preboot version register + * @boot_fit_version_offset_reg: SRAM offset to boot fit version register + * @sram_offset_mask: mask for getting offset into the SRAM + * @cpu_reset_wait_msec: used when setting WFE via kmd_msg_to_cpu_reg + */ +struct static_fw_load_mgr { + u64 preboot_version_max_off; + u64 boot_fit_version_max_off; + u32 kmd_msg_to_cpu_reg; + u32 cpu_cmd_status_to_host_reg; + u32 cpu_boot_status_reg; + u32 cpu_boot_dev_status0_reg; + u32 cpu_boot_dev_status1_reg; + u32 boot_err0_reg; + u32 boot_err1_reg; + u32 preboot_version_offset_reg; + u32 boot_fit_version_offset_reg; + u32 sram_offset_mask; + u32 cpu_reset_wait_msec; +}; + +/** + * struct fw_response - FW response to LKD command + * @ram_offset: descriptor offset into the RAM + * @ram_type: RAM type containing the descriptor (SRAM/DRAM) + * @status: command status + */ +struct fw_response { + u32 ram_offset; + u8 ram_type; + u8 status; +}; + +/** + * struct dynamic_fw_load_mgr - dynamic FW load manager + * @response: FW to LKD response + * @comm_desc: the communication descriptor with FW + * @image_region: region to copy the FW image to + * @fw_image_size: size of FW image to load + * @wait_for_bl_timeout: timeout for waiting for boot loader to respond + * @fw_desc_valid: true if FW descriptor has been validated and hence the data can be used + */ +struct dynamic_fw_load_mgr { + struct fw_response response; + struct lkd_fw_comms_desc comm_desc; + struct pci_mem_region *image_region; + size_t fw_image_size; + u32 wait_for_bl_timeout; + bool fw_desc_valid; +}; + +/** + * struct pre_fw_load_props - needed properties for pre-FW load + * @cpu_boot_status_reg: cpu_boot_status register address + * @sts_boot_dev_sts0_reg: sts_boot_dev_sts0 register address + * @sts_boot_dev_sts1_reg: sts_boot_dev_sts1 register address + * @boot_err0_reg: boot_err0 register address + * @boot_err1_reg: boot_err1 register address + * @wait_for_preboot_timeout: timeout to poll for preboot ready + */ +struct pre_fw_load_props { + u32 
cpu_boot_status_reg; + u32 sts_boot_dev_sts0_reg; + u32 sts_boot_dev_sts1_reg; + u32 boot_err0_reg; + u32 boot_err1_reg; + u32 wait_for_preboot_timeout; +}; + +/** + * struct fw_image_props - properties of FW image + * @image_name: name of the image + * @src_off: offset in src FW to copy from + * @copy_size: amount of bytes to copy (0 to copy the whole binary) + */ +struct fw_image_props { + char *image_name; + u32 src_off; + u32 copy_size; +}; + +/** + * struct fw_load_mgr - manager FW loading process + * @dynamic_loader: specific structure for dynamic load + * @static_loader: specific structure for static load + * @pre_fw_load_props: parameter for pre FW load + * @boot_fit_img: boot fit image properties + * @linux_img: linux image properties + * @cpu_timeout: CPU response timeout in usec + * @boot_fit_timeout: Boot fit load timeout in usec + * @skip_bmc: should BMC be skipped + * @sram_bar_id: SRAM bar ID + * @dram_bar_id: DRAM bar ID + * @fw_comp_loaded: bitmask of loaded FW components. set bit meaning loaded + * component. values are set according to enum hl_fw_types. + */ +struct fw_load_mgr { + union { + struct dynamic_fw_load_mgr dynamic_loader; + struct static_fw_load_mgr static_loader; + }; + struct pre_fw_load_props pre_fw_load; + struct fw_image_props boot_fit_img; + struct fw_image_props linux_img; + u32 cpu_timeout; + u32 boot_fit_timeout; + u8 skip_bmc; + u8 sram_bar_id; + u8 dram_bar_id; + u8 fw_comp_loaded; +}; + +struct hl_cs; + +/** + * struct engines_data - asic engines data + * @buf: buffer for engines data in ascii + * @actual_size: actual size of data that was written by the driver to the allocated buffer + * @allocated_buf_size: total size of allocated buffer + */ +struct engines_data { + char *buf; + int actual_size; + u32 allocated_buf_size; +}; + +/** + * struct hl_asic_funcs - ASIC specific functions that are can be called from + * common code. + * @early_init: sets up early driver state (pre sw_init), doesn't configure H/W. + * @early_fini: tears down what was done in early_init. + * @late_init: sets up late driver/hw state (post hw_init) - Optional. + * @late_fini: tears down what was done in late_init (pre hw_fini) - Optional. + * @sw_init: sets up driver state, does not configure H/W. + * @sw_fini: tears down driver state, does not configure H/W. + * @hw_init: sets up the H/W state. + * @hw_fini: tears down the H/W state. + * @halt_engines: halt engines, needed for reset sequence. This also disables + * interrupts from the device. Should be called before + * hw_fini and before CS rollback. + * @suspend: handles IP specific H/W or SW changes for suspend. + * @resume: handles IP specific H/W or SW changes for resume. + * @mmap: maps a memory. + * @ring_doorbell: increment PI on a given QMAN. + * @pqe_write: Write the PQ entry to the PQ. This is ASIC-specific + * function because the PQs are located in different memory areas + * per ASIC (SRAM, DRAM, Host memory) and therefore, the method of + * writing the PQE must match the destination memory area + * properties. + * @asic_dma_alloc_coherent: Allocate coherent DMA memory by calling + * dma_alloc_coherent(). This is ASIC function because + * its implementation is not trivial when the driver + * is loaded in simulation mode (not upstreamed). + * @asic_dma_free_coherent: Free coherent DMA memory by calling + * dma_free_coherent(). This is ASIC function because + * its implementation is not trivial when the driver + * is loaded in simulation mode (not upstreamed). 
+ * @scrub_device_mem: Scrub the entire SRAM and DRAM.
+ * @scrub_device_dram: Scrub the dram memory of the device.
+ * @get_int_queue_base: get the internal queue base address.
+ * @test_queues: run simple test on all queues for sanity check.
+ * @asic_dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
+ *                        size of allocation is HL_DMA_POOL_BLK_SIZE.
+ * @asic_dma_pool_free: free small DMA allocation from pool.
+ * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool.
+ * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool.
+ * @asic_dma_unmap_single: unmap a single DMA buffer
+ * @asic_dma_map_single: map a single buffer to a DMA
+ * @hl_dma_unmap_sgtable: DMA unmap scatter-gather table.
+ * @cs_parser: parse Command Submission.
+ * @asic_dma_map_sgtable: DMA map scatter-gather table.
+ * @add_end_of_cb_packets: Add packets to the end of CB, if device requires it.
+ * @update_eq_ci: update event queue CI.
+ * @context_switch: called upon ASID context switch.
+ * @restore_phase_topology: clear all SOBs and MONs.
+ * @debugfs_read_dma: debug interface for reading up to 2MB from the device's
+ *                    internal memory via DMA engine.
+ * @add_device_attr: add ASIC specific device attributes.
+ * @handle_eqe: handle event queue entry (IRQ) from CPU-CP.
+ * @get_events_stat: retrieve event queue entries histogram.
+ * @read_pte: read MMU page table entry from DRAM.
+ * @write_pte: write MMU page table entry to DRAM.
+ * @mmu_invalidate_cache: flush MMU STLB host/DRAM cache, either with soft
+ *                        (L1 only) or hard (L0 & L1) flush.
+ * @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with ASID-VA-size mask.
+ * @mmu_prefetch_cache_range: pre-fetch specific MMU STLB cache lines with ASID-VA-size mask.
+ * @send_heartbeat: send is-alive packet to CPU-CP and verify response.
+ * @debug_coresight: perform certain actions on Coresight for debugging.
+ * @is_device_idle: return true if device is idle, false otherwise.
+ * @compute_reset_late_init: perform certain actions needed after a compute reset
+ * @hw_queues_lock: acquire H/W queues lock.
+ * @hw_queues_unlock: release H/W queues lock.
+ * @get_pci_id: retrieve PCI ID.
+ * @get_eeprom_data: retrieve EEPROM data from F/W.
+ * @get_monitor_dump: retrieve monitor registers dump from F/W.
+ * @send_cpu_message: send message to F/W. If the message times out, the
+ *                    driver will eventually reset the device. The timeout can
+ *                    be determined by the calling function or it can be 0 and
+ *                    then the timeout is the default timeout for the specific
+ *                    ASIC
+ * @get_hw_state: retrieve the H/W state
+ * @pci_bars_map: Map PCI BARs.
+ * @init_iatu: Initialize the iATU unit inside the PCI controller.
+ * @rreg: Read a register. Needed for simulator support.
+ * @wreg: Write a register. Needed for simulator support.
+ * @halt_coresight: stop the ETF and ETR traces.
+ * @ctx_init: context dependent initialization.
+ * @ctx_fini: context dependent cleanup.
+ * @pre_schedule_cs: Perform pre-CS-scheduling operations.
+ * @get_queue_id_for_cq: Get the H/W queue id related to the given CQ index.
+ * @load_firmware_to_device: load the firmware to the device's memory
+ * @load_boot_fit_to_device: load boot fit to device's memory
+ * @get_signal_cb_size: Get signal CB size.
+ * @get_wait_cb_size: Get wait CB size.
+ * @gen_signal_cb: Generate a signal CB.
+ * @gen_wait_cb: Generate a wait CB.
+ * @reset_sob: Reset a SOB.
+ * @reset_sob_group: Reset SOB group
+ * @get_device_time: Get the device time.
+ * @pb_print_security_errors: print security errors according to block and cause
+ * @collective_wait_init_cs: Generate collective master/slave packets
+ *                           and place them in the relevant cs jobs
+ * @collective_wait_create_jobs: allocate collective wait cs jobs
+ * @get_dec_base_addr: get the base address of a given decoder.
+ * @scramble_addr: Routine to scramble the address prior to mapping it
+ *                 in the MMU.
+ * @descramble_addr: Routine to de-scramble the address prior to
+ *                   showing it to users.
+ * @ack_protection_bits_errors: ack and dump all security violations
+ * @get_hw_block_id: retrieve a HW block id to be used by the user to mmap it.
+ *                   also returns the size of the block if caller supplies
+ *                   a valid pointer for it
+ * @hw_block_mmap: mmap a HW block with a given id.
+ * @enable_events_from_fw: send interrupt to firmware to notify it that the
+ *                         driver is ready to receive asynchronous events. This
+ *                         function should be called during the first init and
+ *                         after every hard-reset of the device
+ * @ack_mmu_errors: check and ack mmu errors, page fault, access violation.
+ * @get_msi_info: Retrieve asic-specific MSI ID of the f/w async event
+ * @map_pll_idx_to_fw_idx: convert driver specific per asic PLL index to
+ *                         generic f/w compatible PLL Indexes
+ * @init_firmware_preload_params: initialize pre FW-load parameters.
+ * @init_firmware_loader: initialize data for FW loader.
+ * @init_cpu_scrambler_dram: Enable CPU specific DRAM scrambling
+ * @state_dump_init: initialize constants required for state dump
+ * @get_sob_addr: get SOB base address offset.
+ * @set_pci_memory_regions: setting properties of PCI memory regions
+ * @get_stream_master_qid_arr: get pointer to stream masters QID array
+ * @check_if_razwi_happened: check if there was a razwi due to RR violation.
+ * @access_dev_mem: access device memory + * @set_dram_bar_base: set the base of the DRAM BAR + * @set_engine_cores: set a config command to enigne cores + * @send_device_activity: indication to FW about device availability + */ +struct hl_asic_funcs { + int (*early_init)(struct hl_device *hdev); + int (*early_fini)(struct hl_device *hdev); + int (*late_init)(struct hl_device *hdev); + void (*late_fini)(struct hl_device *hdev); + int (*sw_init)(struct hl_device *hdev); + int (*sw_fini)(struct hl_device *hdev); + int (*hw_init)(struct hl_device *hdev); + void (*hw_fini)(struct hl_device *hdev, bool hard_reset, bool fw_reset); + void (*halt_engines)(struct hl_device *hdev, bool hard_reset, bool fw_reset); + int (*suspend)(struct hl_device *hdev); + int (*resume)(struct hl_device *hdev); + int (*mmap)(struct hl_device *hdev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size); + void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi); + void (*pqe_write)(struct hl_device *hdev, __le64 *pqe, + struct hl_bd *bd); + void* (*asic_dma_alloc_coherent)(struct hl_device *hdev, size_t size, + dma_addr_t *dma_handle, gfp_t flag); + void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size, + void *cpu_addr, dma_addr_t dma_handle); + int (*scrub_device_mem)(struct hl_device *hdev); + int (*scrub_device_dram)(struct hl_device *hdev, u64 val); + void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id, + dma_addr_t *dma_handle, u16 *queue_len); + int (*test_queues)(struct hl_device *hdev); + void* (*asic_dma_pool_zalloc)(struct hl_device *hdev, size_t size, + gfp_t mem_flags, dma_addr_t *dma_handle); + void (*asic_dma_pool_free)(struct hl_device *hdev, void *vaddr, + dma_addr_t dma_addr); + void* (*cpu_accessible_dma_pool_alloc)(struct hl_device *hdev, + size_t size, dma_addr_t *dma_handle); + void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev, + size_t size, void *vaddr); + void (*asic_dma_unmap_single)(struct hl_device *hdev, + dma_addr_t dma_addr, int len, + enum dma_data_direction dir); + dma_addr_t (*asic_dma_map_single)(struct hl_device *hdev, + void *addr, int len, + enum dma_data_direction dir); + void (*hl_dma_unmap_sgtable)(struct hl_device *hdev, + struct sg_table *sgt, + enum dma_data_direction dir); + int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser); + int (*asic_dma_map_sgtable)(struct hl_device *hdev, struct sg_table *sgt, + enum dma_data_direction dir); + void (*add_end_of_cb_packets)(struct hl_device *hdev, + void *kernel_address, u32 len, + u32 original_len, + u64 cq_addr, u32 cq_val, u32 msix_num, + bool eb); + void (*update_eq_ci)(struct hl_device *hdev, u32 val); + int (*context_switch)(struct hl_device *hdev, u32 asid); + void (*restore_phase_topology)(struct hl_device *hdev); + int (*debugfs_read_dma)(struct hl_device *hdev, u64 addr, u32 size, + void *blob_addr); + void (*add_device_attr)(struct hl_device *hdev, struct attribute_group *dev_clk_attr_grp, + struct attribute_group *dev_vrm_attr_grp); + void (*handle_eqe)(struct hl_device *hdev, + struct hl_eq_entry *eq_entry); + void* (*get_events_stat)(struct hl_device *hdev, bool aggregate, + u32 *size); + u64 (*read_pte)(struct hl_device *hdev, u64 addr); + void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val); + int (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard, + u32 flags); + int (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard, + u32 flags, u32 asid, u64 va, u64 size); + int 
(*mmu_prefetch_cache_range)(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size); + int (*send_heartbeat)(struct hl_device *hdev); + int (*debug_coresight)(struct hl_device *hdev, struct hl_ctx *ctx, void *data); + bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, + struct engines_data *e); + int (*compute_reset_late_init)(struct hl_device *hdev); + void (*hw_queues_lock)(struct hl_device *hdev); + void (*hw_queues_unlock)(struct hl_device *hdev); + u32 (*get_pci_id)(struct hl_device *hdev); + int (*get_eeprom_data)(struct hl_device *hdev, void *data, size_t max_size); + int (*get_monitor_dump)(struct hl_device *hdev, void *data); + int (*send_cpu_message)(struct hl_device *hdev, u32 *msg, + u16 len, u32 timeout, u64 *result); + int (*pci_bars_map)(struct hl_device *hdev); + int (*init_iatu)(struct hl_device *hdev); + u32 (*rreg)(struct hl_device *hdev, u32 reg); + void (*wreg)(struct hl_device *hdev, u32 reg, u32 val); + void (*halt_coresight)(struct hl_device *hdev, struct hl_ctx *ctx); + int (*ctx_init)(struct hl_ctx *ctx); + void (*ctx_fini)(struct hl_ctx *ctx); + int (*pre_schedule_cs)(struct hl_cs *cs); + u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx); + int (*load_firmware_to_device)(struct hl_device *hdev); + int (*load_boot_fit_to_device)(struct hl_device *hdev); + u32 (*get_signal_cb_size)(struct hl_device *hdev); + u32 (*get_wait_cb_size)(struct hl_device *hdev); + u32 (*gen_signal_cb)(struct hl_device *hdev, void *data, u16 sob_id, + u32 size, bool eb); + u32 (*gen_wait_cb)(struct hl_device *hdev, + struct hl_gen_wait_properties *prop); + void (*reset_sob)(struct hl_device *hdev, void *data); + void (*reset_sob_group)(struct hl_device *hdev, u16 sob_group); + u64 (*get_device_time)(struct hl_device *hdev); + void (*pb_print_security_errors)(struct hl_device *hdev, + u32 block_addr, u32 cause, u32 offended_addr); + int (*collective_wait_init_cs)(struct hl_cs *cs); + int (*collective_wait_create_jobs)(struct hl_device *hdev, + struct hl_ctx *ctx, struct hl_cs *cs, + u32 wait_queue_id, u32 collective_engine_id, + u32 encaps_signal_offset); + u32 (*get_dec_base_addr)(struct hl_device *hdev, u32 core_id); + u64 (*scramble_addr)(struct hl_device *hdev, u64 addr); + u64 (*descramble_addr)(struct hl_device *hdev, u64 addr); + void (*ack_protection_bits_errors)(struct hl_device *hdev); + int (*get_hw_block_id)(struct hl_device *hdev, u64 block_addr, + u32 *block_size, u32 *block_id); + int (*hw_block_mmap)(struct hl_device *hdev, struct vm_area_struct *vma, + u32 block_id, u32 block_size); + void (*enable_events_from_fw)(struct hl_device *hdev); + int (*ack_mmu_errors)(struct hl_device *hdev, u64 mmu_cap_mask); + void (*get_msi_info)(__le32 *table); + int (*map_pll_idx_to_fw_idx)(u32 pll_idx); + void (*init_firmware_preload_params)(struct hl_device *hdev); + void (*init_firmware_loader)(struct hl_device *hdev); + void (*init_cpu_scrambler_dram)(struct hl_device *hdev); + void (*state_dump_init)(struct hl_device *hdev); + u32 (*get_sob_addr)(struct hl_device *hdev, u32 sob_id); + void (*set_pci_memory_regions)(struct hl_device *hdev); + u32* (*get_stream_master_qid_arr)(void); + void (*check_if_razwi_happened)(struct hl_device *hdev); + int (*mmu_get_real_page_size)(struct hl_device *hdev, struct hl_mmu_properties *mmu_prop, + u32 page_size, u32 *real_page_size, bool is_dram_addr); + int (*access_dev_mem)(struct hl_device *hdev, enum pci_region region_type, + u64 addr, u64 *val, enum debugfs_access_type acc_type); + u64 
(*set_dram_bar_base)(struct hl_device *hdev, u64 addr); + int (*set_engine_cores)(struct hl_device *hdev, u32 *core_ids, + u32 num_cores, u32 core_command); + int (*send_device_activity)(struct hl_device *hdev, bool open); +}; + + +/* + * CONTEXTS + */ + +#define HL_KERNEL_ASID_ID 0 + +/** + * enum hl_va_range_type - virtual address range type. + * @HL_VA_RANGE_TYPE_HOST: range type of host pages + * @HL_VA_RANGE_TYPE_HOST_HUGE: range type of host huge pages + * @HL_VA_RANGE_TYPE_DRAM: range type of dram pages + */ +enum hl_va_range_type { + HL_VA_RANGE_TYPE_HOST, + HL_VA_RANGE_TYPE_HOST_HUGE, + HL_VA_RANGE_TYPE_DRAM, + HL_VA_RANGE_TYPE_MAX +}; + +/** + * struct hl_va_range - virtual addresses range. + * @lock: protects the virtual addresses list. + * @list: list of virtual addresses blocks available for mappings. + * @start_addr: range start address. + * @end_addr: range end address. + * @page_size: page size of this va range. + */ +struct hl_va_range { + struct mutex lock; + struct list_head list; + u64 start_addr; + u64 end_addr; + u32 page_size; +}; + +/** + * struct hl_cs_counters_atomic - command submission counters + * @out_of_mem_drop_cnt: dropped due to memory allocation issue + * @parsing_drop_cnt: dropped due to error in packet parsing + * @queue_full_drop_cnt: dropped due to queue full + * @device_in_reset_drop_cnt: dropped due to device in reset + * @max_cs_in_flight_drop_cnt: dropped due to maximum CS in-flight + * @validation_drop_cnt: dropped due to error in validation + */ +struct hl_cs_counters_atomic { + atomic64_t out_of_mem_drop_cnt; + atomic64_t parsing_drop_cnt; + atomic64_t queue_full_drop_cnt; + atomic64_t device_in_reset_drop_cnt; + atomic64_t max_cs_in_flight_drop_cnt; + atomic64_t validation_drop_cnt; +}; + +/** + * struct hl_dmabuf_priv - a dma-buf private object. + * @dmabuf: pointer to dma-buf object. + * @ctx: pointer to the dma-buf owner's context. + * @phys_pg_pack: pointer to physical page pack if the dma-buf was exported for + * memory allocation handle. + * @device_address: physical address of the device's memory. Relevant only + * if phys_pg_pack is NULL (dma-buf was exported from address). + * The total size can be taken from the dmabuf object. + */ +struct hl_dmabuf_priv { + struct dma_buf *dmabuf; + struct hl_ctx *ctx; + struct hl_vm_phys_pg_pack *phys_pg_pack; + uint64_t device_address; +}; + +#define HL_CS_OUTCOME_HISTORY_LEN 256 + +/** + * struct hl_cs_outcome - represents a single completed CS outcome + * @list_link: link to either container's used list or free list + * @map_link: list to the container hash map + * @ts: completion ts + * @seq: the original cs sequence + * @error: error code cs completed with, if any + */ +struct hl_cs_outcome { + struct list_head list_link; + struct hlist_node map_link; + ktime_t ts; + u64 seq; + int error; +}; + +/** + * struct hl_cs_outcome_store - represents a limited store of completed CS outcomes + * @outcome_map: index of completed CS searchable by sequence number + * @used_list: list of outcome objects currently in use + * @free_list: list of outcome objects currently not in use + * @nodes_pool: a static pool of pre-allocated outcome objects + * @db_lock: any operation on the store must take this lock + */ +struct hl_cs_outcome_store { + DECLARE_HASHTABLE(outcome_map, 8); + struct list_head used_list; + struct list_head free_list; + struct hl_cs_outcome nodes_pool[HL_CS_OUTCOME_HISTORY_LEN]; + spinlock_t db_lock; +}; + +/** + * struct hl_ctx - user/kernel context. 
+ * @mem_hash: holds mapping from virtual address to virtual memory area + * descriptor (hl_vm_phys_pg_list or hl_userptr). + * @mmu_shadow_hash: holds a mapping from shadow address to pgt_info structure. + * @hr_mmu_phys_hash: if host-resident MMU is used, holds a mapping from + * MMU-hop-page physical address to its host-resident + * pgt_info structure. + * @hpriv: pointer to the private (Kernel Driver) data of the process (fd). + * @hdev: pointer to the device structure. + * @refcount: reference counter for the context. Context is released only when + * this hits 0l. It is incremented on CS and CS_WAIT. + * @cs_pending: array of hl fence objects representing pending CS. + * @outcome_store: storage data structure used to remember outcomes of completed + * command submissions for a long time after CS id wraparound. + * @va_range: holds available virtual addresses for host and dram mappings. + * @mem_hash_lock: protects the mem_hash. + * @hw_block_list_lock: protects the HW block memory list. + * @debugfs_list: node in debugfs list of contexts. + * @hw_block_mem_list: list of HW block virtual mapped addresses. + * @cs_counters: context command submission counters. + * @cb_va_pool: device VA pool for command buffers which are mapped to the + * device's MMU. + * @sig_mgr: encaps signals handle manager. + * @cb_va_pool_base: the base address for the device VA pool + * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed + * to user so user could inquire about CS. It is used as + * index to cs_pending array. + * @dram_default_hops: array that holds all hops addresses needed for default + * DRAM mapping. + * @cs_lock: spinlock to protect cs_sequence. + * @dram_phys_mem: amount of used physical DRAM memory by this context. + * @thread_ctx_switch_token: token to prevent multiple threads of the same + * context from running the context switch phase. + * Only a single thread should run it. + * @thread_ctx_switch_wait_token: token to prevent the threads that didn't run + * the context switch phase from moving to their + * execution phase before the context switch phase + * has finished. + * @asid: context's unique address space ID in the device's MMU. + * @handle: context's opaque handle for user + */ +struct hl_ctx { + DECLARE_HASHTABLE(mem_hash, MEM_HASH_TABLE_BITS); + DECLARE_HASHTABLE(mmu_shadow_hash, MMU_HASH_TABLE_BITS); + DECLARE_HASHTABLE(hr_mmu_phys_hash, MMU_HASH_TABLE_BITS); + struct hl_fpriv *hpriv; + struct hl_device *hdev; + struct kref refcount; + struct hl_fence **cs_pending; + struct hl_cs_outcome_store outcome_store; + struct hl_va_range *va_range[HL_VA_RANGE_TYPE_MAX]; + struct mutex mem_hash_lock; + struct mutex hw_block_list_lock; + struct list_head debugfs_list; + struct list_head hw_block_mem_list; + struct hl_cs_counters_atomic cs_counters; + struct gen_pool *cb_va_pool; + struct hl_encaps_signals_mgr sig_mgr; + u64 cb_va_pool_base; + u64 cs_sequence; + u64 *dram_default_hops; + spinlock_t cs_lock; + atomic64_t dram_phys_mem; + atomic_t thread_ctx_switch_token; + u32 thread_ctx_switch_wait_token; + u32 asid; + u32 handle; +}; + +/** + * struct hl_ctx_mgr - for handling multiple contexts. + * @lock: protects ctx_handles. + * @handles: idr to hold all ctx handles. + */ +struct hl_ctx_mgr { + struct mutex lock; + struct idr handles; +}; + + +/* + * COMMAND SUBMISSIONS + */ + +/** + * struct hl_userptr - memory mapping chunk information + * @vm_type: type of the VM. + * @job_node: linked-list node for hanging the object on the Job's list. 
+ * @pages: pointer to struct page array + * @npages: size of @pages array + * @sgt: pointer to the scatter-gather table that holds the pages. + * @dir: for DMA unmapping, the direction must be supplied, so save it. + * @debugfs_list: node in debugfs list of command submissions. + * @pid: the pid of the user process owning the memory + * @addr: user-space virtual address of the start of the memory area. + * @size: size of the memory area to pin & map. + * @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise. + */ +struct hl_userptr { + enum vm_type vm_type; /* must be first */ + struct list_head job_node; + struct page **pages; + unsigned int npages; + struct sg_table *sgt; + enum dma_data_direction dir; + struct list_head debugfs_list; + pid_t pid; + u64 addr; + u64 size; + u8 dma_mapped; +}; + +/** + * struct hl_cs - command submission. + * @jobs_in_queue_cnt: per each queue, maintain counter of submitted jobs. + * @ctx: the context this CS belongs to. + * @job_list: list of the CS's jobs in the various queues. + * @job_lock: spinlock for the CS's jobs list. Needed for free_job. + * @refcount: reference counter for usage of the CS. + * @fence: pointer to the fence object of this CS. + * @signal_fence: pointer to the fence object of the signal CS (used by wait + * CS only). + * @finish_work: workqueue object to run when CS is completed by H/W. + * @work_tdr: delayed work node for TDR. + * @mirror_node : node in device mirror list of command submissions. + * @staged_cs_node: node in the staged cs list. + * @debugfs_list: node in debugfs list of command submissions. + * @encaps_sig_hdl: holds the encaps signals handle. + * @sequence: the sequence number of this CS. + * @staged_sequence: the sequence of the staged submission this CS is part of, + * relevant only if staged_cs is set. + * @timeout_jiffies: cs timeout in jiffies. + * @submission_time_jiffies: submission time of the cs + * @type: CS_TYPE_*. + * @jobs_cnt: counter of submitted jobs on all queues. + * @encaps_sig_hdl_id: encaps signals handle id, set for the first staged cs. + * @sob_addr_offset: sob offset from the configuration base address. + * @initial_sob_count: count of completed signals in SOB before current submission of signal or + * cs with encaps signals. + * @submitted: true if CS was submitted to H/W. + * @completed: true if CS was completed by device. + * @timedout : true if CS was timedout. + * @tdr_active: true if TDR was activated for this CS (to prevent + * double TDR activation). + * @aborted: true if CS was aborted due to some device error. + * @timestamp: true if a timestamp must be captured upon completion. + * @staged_last: true if this is the last staged CS and needs completion. + * @staged_first: true if this is the first staged CS and we need to receive + * timeout for this CS. + * @staged_cs: true if this CS is part of a staged submission. + * @skip_reset_on_timeout: true if we shall not reset the device in case + * timeout occurs (debug scenario). + * @encaps_signals: true if this CS has encaps reserved signals. 
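+ *
+ * A hedged lifetime sketch (the release function name is illustrative, not
+ * taken from this file): the CS object is reference counted via @refcount,
+ * so users typically pair
+ *
+ *	kref_get(&cs->refcount);
+ *
+ * with a matching
+ *
+ *	kref_put(&cs->refcount, cs_release_fn);
+ *
+ * where cs_release_fn frees the CS once the last reference is dropped.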
+ */ +struct hl_cs { + u16 *jobs_in_queue_cnt; + struct hl_ctx *ctx; + struct list_head job_list; + spinlock_t job_lock; + struct kref refcount; + struct hl_fence *fence; + struct hl_fence *signal_fence; + struct work_struct finish_work; + struct delayed_work work_tdr; + struct list_head mirror_node; + struct list_head staged_cs_node; + struct list_head debugfs_list; + struct hl_cs_encaps_sig_handle *encaps_sig_hdl; + u64 sequence; + u64 staged_sequence; + u64 timeout_jiffies; + u64 submission_time_jiffies; + enum hl_cs_type type; + u32 jobs_cnt; + u32 encaps_sig_hdl_id; + u32 sob_addr_offset; + u16 initial_sob_count; + u8 submitted; + u8 completed; + u8 timedout; + u8 tdr_active; + u8 aborted; + u8 timestamp; + u8 staged_last; + u8 staged_first; + u8 staged_cs; + u8 skip_reset_on_timeout; + u8 encaps_signals; +}; + +/** + * struct hl_cs_job - command submission job. + * @cs_node: the node to hang on the CS jobs list. + * @cs: the CS this job belongs to. + * @user_cb: the CB we got from the user. + * @patched_cb: in case of patching, this is internal CB which is submitted on + * the queue instead of the CB we got from the IOCTL. + * @finish_work: workqueue object to run when job is completed. + * @userptr_list: linked-list of userptr mappings that belong to this job and + * wait for completion. + * @debugfs_list: node in debugfs list of command submission jobs. + * @refcount: reference counter for usage of the CS job. + * @queue_type: the type of the H/W queue this job is submitted to. + * @id: the id of this job inside a CS. + * @hw_queue_id: the id of the H/W queue this job is submitted to. + * @user_cb_size: the actual size of the CB we got from the user. + * @job_cb_size: the actual size of the CB that we put on the queue. + * @encaps_sig_wait_offset: encapsulated signals offset, which allow user + * to wait on part of the reserved signals. + * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a + * handle to a kernel-allocated CB object, false + * otherwise (SRAM/DRAM/host address). + * @contains_dma_pkt: whether the JOB contains at least one DMA packet. This + * info is needed later, when adding the 2xMSG_PROT at the + * end of the JOB, to know which barriers to put in the + * MSG_PROT packets. Relevant only for GAUDI as GOYA doesn't + * have streams so the engine can't be busy by another + * stream. + */ +struct hl_cs_job { + struct list_head cs_node; + struct hl_cs *cs; + struct hl_cb *user_cb; + struct hl_cb *patched_cb; + struct work_struct finish_work; + struct list_head userptr_list; + struct list_head debugfs_list; + struct kref refcount; + enum hl_queue_type queue_type; + u32 id; + u32 hw_queue_id; + u32 user_cb_size; + u32 job_cb_size; + u32 encaps_sig_wait_offset; + u8 is_kernel_allocated_cb; + u8 contains_dma_pkt; +}; + +/** + * struct hl_cs_parser - command submission parser properties. + * @user_cb: the CB we got from the user. + * @patched_cb: in case of patching, this is internal CB which is submitted on + * the queue instead of the CB we got from the IOCTL. + * @job_userptr_list: linked-list of userptr mappings that belong to the related + * job and wait for completion. + * @cs_sequence: the sequence number of the related CS. + * @queue_type: the type of the H/W queue this job is submitted to. + * @ctx_id: the ID of the context the related CS belongs to. + * @hw_queue_id: the id of the H/W queue this job is submitted to. + * @user_cb_size: the actual size of the CB we got from the user. + * @patched_cb_size: the size of the CB after parsing. 
+ * @job_id: the id of the related job inside the related CS. + * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a + * handle to a kernel-allocated CB object, false + * otherwise (SRAM/DRAM/host address). + * @contains_dma_pkt: whether the JOB contains at least one DMA packet. This + * info is needed later, when adding the 2xMSG_PROT at the + * end of the JOB, to know which barriers to put in the + * MSG_PROT packets. Relevant only for GAUDI as GOYA doesn't + * have streams so the engine can't be busy by another + * stream. + * @completion: true if we need completion for this CS. + */ +struct hl_cs_parser { + struct hl_cb *user_cb; + struct hl_cb *patched_cb; + struct list_head *job_userptr_list; + u64 cs_sequence; + enum hl_queue_type queue_type; + u32 ctx_id; + u32 hw_queue_id; + u32 user_cb_size; + u32 patched_cb_size; + u8 job_id; + u8 is_kernel_allocated_cb; + u8 contains_dma_pkt; + u8 completion; +}; + +/* + * MEMORY STRUCTURE + */ + +/** + * struct hl_vm_hash_node - hash element from virtual address to virtual + * memory area descriptor (hl_vm_phys_pg_list or + * hl_userptr). + * @node: node to hang on the hash table in context object. + * @vaddr: key virtual address. + * @ptr: value pointer (hl_vm_phys_pg_list or hl_userptr). + */ +struct hl_vm_hash_node { + struct hlist_node node; + u64 vaddr; + void *ptr; +}; + +/** + * struct hl_vm_hw_block_list_node - list element from user virtual address to + * HW block id. + * @node: node to hang on the list in context object. + * @ctx: the context this node belongs to. + * @vaddr: virtual address of the HW block. + * @block_size: size of the block. + * @mapped_size: size of the block which is mapped. May change if partial un-mappings are done. + * @id: HW block id (handle). + */ +struct hl_vm_hw_block_list_node { + struct list_head node; + struct hl_ctx *ctx; + unsigned long vaddr; + u32 block_size; + u32 mapped_size; + u32 id; +}; + +/** + * struct hl_vm_phys_pg_pack - physical page pack. + * @vm_type: describes the type of the virtual area descriptor. + * @pages: the physical page array. + * @npages: num physical pages in the pack. + * @total_size: total size of all the pages in this list. + * @node: used to attach to deletion list that is used when all the allocations are cleared + * at the teardown of the context. + * @mapping_cnt: number of shared mappings. + * @exporting_cnt: number of dma-buf exporting. + * @asid: the context related to this list. + * @page_size: size of each page in the pack. + * @flags: HL_MEM_* flags related to this list. + * @handle: the provided handle related to this list. + * @offset: offset from the first page. + * @contiguous: is contiguous physical memory. + * @created_from_userptr: is product of host virtual address. + */ +struct hl_vm_phys_pg_pack { + enum vm_type vm_type; /* must be first */ + u64 *pages; + u64 npages; + u64 total_size; + struct list_head node; + atomic_t mapping_cnt; + u32 exporting_cnt; + u32 asid; + u32 page_size; + u32 flags; + u32 handle; + u32 offset; + u8 contiguous; + u8 created_from_userptr; +}; + +/** + * struct hl_vm_va_block - virtual range block information. + * @node: node to hang on the virtual range list in context object. + * @start: virtual range start address. + * @end: virtual range end address. + * @size: virtual range size. + */ +struct hl_vm_va_block { + struct list_head node; + u64 start; + u64 end; + u64 size; +}; + +/** + * struct hl_vm - virtual memory manager for MMU. + * @dram_pg_pool: pool for DRAM physical pages of 2MB. 
+ * @dram_pg_pool_refcount: reference counter for the pool usage. + * @idr_lock: protects the phys_pg_list_handles. + * @phys_pg_pack_handles: idr to hold all device allocations handles. + * @init_done: whether initialization was done. We need this because VM + * initialization might be skipped during device initialization. + */ +struct hl_vm { + struct gen_pool *dram_pg_pool; + struct kref dram_pg_pool_refcount; + spinlock_t idr_lock; + struct idr phys_pg_pack_handles; + u8 init_done; +}; + + +/* + * DEBUG, PROFILING STRUCTURE + */ + +/** + * struct hl_debug_params - Coresight debug parameters. + * @input: pointer to component specific input parameters. + * @output: pointer to component specific output parameters. + * @output_size: size of output buffer. + * @reg_idx: relevant register ID. + * @op: component operation to execute. + * @enable: true if to enable component debugging, false otherwise. + */ +struct hl_debug_params { + void *input; + void *output; + u32 output_size; + u32 reg_idx; + u32 op; + bool enable; +}; + +/** + * struct hl_notifier_event - holds the notifier data structure + * @eventfd: the event file descriptor to raise the notifications + * @lock: mutex lock to protect the notifier data flows + * @events_mask: indicates the bitmap events + */ +struct hl_notifier_event { + struct eventfd_ctx *eventfd; + struct mutex lock; + u64 events_mask; +}; + +/* + * FILE PRIVATE STRUCTURE + */ + +/** + * struct hl_fpriv - process information stored in FD private data. + * @hdev: habanalabs device structure. + * @filp: pointer to the given file structure. + * @taskpid: current process ID. + * @ctx: current executing context. TODO: remove for multiple ctx per process + * @ctx_mgr: context manager to handle multiple context for this FD. + * @mem_mgr: manager descriptor for memory exportable via mmap + * @notifier_event: notifier eventfd towards user process + * @debugfs_list: list of relevant ASIC debugfs. + * @dev_node: node in the device list of file private data + * @refcount: number of related contexts. + * @restore_phase_mutex: lock for context switch and restore phase. + * @ctx_lock: protects the pointer to current executing context pointer. TODO: remove for multiple + * ctx per process. + */ +struct hl_fpriv { + struct hl_device *hdev; + struct file *filp; + struct pid *taskpid; + struct hl_ctx *ctx; + struct hl_ctx_mgr ctx_mgr; + struct hl_mem_mgr mem_mgr; + struct hl_notifier_event notifier_event; + struct list_head debugfs_list; + struct list_head dev_node; + struct kref refcount; + struct mutex restore_phase_mutex; + struct mutex ctx_lock; +}; + + +/* + * DebugFS + */ + +/** + * struct hl_info_list - debugfs file ops. + * @name: file name. + * @show: function to output information. + * @write: function to write to the file. + */ +struct hl_info_list { + const char *name; + int (*show)(struct seq_file *s, void *data); + ssize_t (*write)(struct file *file, const char __user *buf, + size_t count, loff_t *f_pos); +}; + +/** + * struct hl_debugfs_entry - debugfs dentry wrapper. + * @info_ent: dentry related ops. + * @dev_entry: ASIC specific debugfs manager. + */ +struct hl_debugfs_entry { + const struct hl_info_list *info_ent; + struct hl_dbg_device_entry *dev_entry; +}; + +/** + * struct hl_dbg_device_entry - ASIC specific debugfs manager. + * @root: root dentry. + * @hdev: habanalabs device structure. + * @entry_arr: array of available hl_debugfs_entry. + * @file_list: list of available debugfs files. + * @file_mutex: protects file_list. + * @cb_list: list of available CBs. 
+ * @cb_spinlock: protects cb_list. + * @cs_list: list of available CSs. + * @cs_spinlock: protects cs_list. + * @cs_job_list: list of available CB jobs. + * @cs_job_spinlock: protects cs_job_list. + * @userptr_list: list of available userptrs (virtual memory chunk descriptor). + * @userptr_spinlock: protects userptr_list. + * @ctx_mem_hash_list: list of available contexts with MMU mappings. + * @ctx_mem_hash_spinlock: protects cb_list. + * @data_dma_blob_desc: data DMA descriptor of blob. + * @mon_dump_blob_desc: monitor dump descriptor of blob. + * @state_dump: data of the system states in case of a bad cs. + * @state_dump_sem: protects state_dump. + * @addr: next address to read/write from/to in read/write32. + * @mmu_addr: next virtual address to translate to physical address in mmu_show. + * @mmu_cap_mask: mmu hw capability mask, to be used in mmu_ack_error. + * @userptr_lookup: the target user ptr to look up for on demand. + * @mmu_asid: ASID to use while translating in mmu_show. + * @state_dump_head: index of the latest state dump + * @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read. + * @i2c_addr: generic u8 debugfs file for address value to use in i2c_data_read. + * @i2c_reg: generic u8 debugfs file for register value to use in i2c_data_read. + * @i2c_len: generic u8 debugfs file for length value to use in i2c_data_read. + */ +struct hl_dbg_device_entry { + struct dentry *root; + struct hl_device *hdev; + struct hl_debugfs_entry *entry_arr; + struct list_head file_list; + struct mutex file_mutex; + struct list_head cb_list; + spinlock_t cb_spinlock; + struct list_head cs_list; + spinlock_t cs_spinlock; + struct list_head cs_job_list; + spinlock_t cs_job_spinlock; + struct list_head userptr_list; + spinlock_t userptr_spinlock; + struct list_head ctx_mem_hash_list; + spinlock_t ctx_mem_hash_spinlock; + struct debugfs_blob_wrapper data_dma_blob_desc; + struct debugfs_blob_wrapper mon_dump_blob_desc; + char *state_dump[HL_STATE_DUMP_HIST_LEN]; + struct rw_semaphore state_dump_sem; + u64 addr; + u64 mmu_addr; + u64 mmu_cap_mask; + u64 userptr_lookup; + u32 mmu_asid; + u32 state_dump_head; + u8 i2c_bus; + u8 i2c_addr; + u8 i2c_reg; + u8 i2c_len; +}; + +/** + * struct hl_hw_obj_name_entry - single hw object name, member of + * hl_state_dump_specs + * @node: link to the containing hash table + * @name: hw object name + * @id: object identifier + */ +struct hl_hw_obj_name_entry { + struct hlist_node node; + const char *name; + u32 id; +}; + +enum hl_state_dump_specs_props { + SP_SYNC_OBJ_BASE_ADDR, + SP_NEXT_SYNC_OBJ_ADDR, + SP_SYNC_OBJ_AMOUNT, + SP_MON_OBJ_WR_ADDR_LOW, + SP_MON_OBJ_WR_ADDR_HIGH, + SP_MON_OBJ_WR_DATA, + SP_MON_OBJ_ARM_DATA, + SP_MON_OBJ_STATUS, + SP_MONITORS_AMOUNT, + SP_TPC0_CMDQ, + SP_TPC0_CFG_SO, + SP_NEXT_TPC, + SP_MME_CMDQ, + SP_MME_CFG_SO, + SP_NEXT_MME, + SP_DMA_CMDQ, + SP_DMA_CFG_SO, + SP_DMA_QUEUES_OFFSET, + SP_NUM_OF_MME_ENGINES, + SP_SUB_MME_ENG_NUM, + SP_NUM_OF_DMA_ENGINES, + SP_NUM_OF_TPC_ENGINES, + SP_ENGINE_NUM_OF_QUEUES, + SP_ENGINE_NUM_OF_STREAMS, + SP_ENGINE_NUM_OF_FENCES, + SP_FENCE0_CNT_OFFSET, + SP_FENCE0_RDATA_OFFSET, + SP_CP_STS_OFFSET, + SP_NUM_CORES, + + SP_MAX +}; + +enum hl_sync_engine_type { + ENGINE_TPC, + ENGINE_DMA, + ENGINE_MME, +}; + +/** + * struct hl_mon_state_dump - represents a state dump of a single monitor + * @id: monitor id + * @wr_addr_low: address monitor will write to, low bits + * @wr_addr_high: address monitor will write to, high bits + * @wr_data: data monitor will write + * @arm_data: register value 
containing monitor configuration + * @status: monitor status + */ +struct hl_mon_state_dump { + u32 id; + u32 wr_addr_low; + u32 wr_addr_high; + u32 wr_data; + u32 arm_data; + u32 status; +}; + +/** + * struct hl_sync_to_engine_map_entry - sync object id to engine mapping entry + * @engine_type: type of the engine + * @engine_id: id of the engine + * @sync_id: id of the sync object + */ +struct hl_sync_to_engine_map_entry { + struct hlist_node node; + enum hl_sync_engine_type engine_type; + u32 engine_id; + u32 sync_id; +}; + +/** + * struct hl_sync_to_engine_map - maps sync object id to associated engine id + * @tb: hash table containing the mapping, each element is of type + * struct hl_sync_to_engine_map_entry + */ +struct hl_sync_to_engine_map { + DECLARE_HASHTABLE(tb, SYNC_TO_ENGINE_HASH_TABLE_BITS); +}; + +/** + * struct hl_state_dump_specs_funcs - virtual functions used by the state dump + * @gen_sync_to_engine_map: generate a hash map from sync obj id to its engine + * @print_single_monitor: format monitor data as string + * @monitor_valid: return true if given monitor dump is valid + * @print_fences_single_engine: format fences data as string + */ +struct hl_state_dump_specs_funcs { + int (*gen_sync_to_engine_map)(struct hl_device *hdev, + struct hl_sync_to_engine_map *map); + int (*print_single_monitor)(char **buf, size_t *size, size_t *offset, + struct hl_device *hdev, + struct hl_mon_state_dump *mon); + int (*monitor_valid)(struct hl_mon_state_dump *mon); + int (*print_fences_single_engine)(struct hl_device *hdev, + u64 base_offset, + u64 status_base_offset, + enum hl_sync_engine_type engine_type, + u32 engine_id, char **buf, + size_t *size, size_t *offset); +}; + +/** + * struct hl_state_dump_specs - defines ASIC known hw objects names + * @so_id_to_str_tb: sync objects names index table + * @monitor_id_to_str_tb: monitors names index table + * @funcs: virtual functions used for state dump + * @sync_namager_names: readable names for sync manager if available (ex: N_E) + * @props: pointer to a per asic const props array required for state dump + */ +struct hl_state_dump_specs { + DECLARE_HASHTABLE(so_id_to_str_tb, OBJ_NAMES_HASH_TABLE_BITS); + DECLARE_HASHTABLE(monitor_id_to_str_tb, OBJ_NAMES_HASH_TABLE_BITS); + struct hl_state_dump_specs_funcs funcs; + const char * const *sync_namager_names; + s64 *props; +}; + + +/* + * DEVICES + */ + +#define HL_STR_MAX 32 + +#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_LAST + 1) + +/* Theoretical limit only. A single host can only contain up to 4 or 8 PCIe + * x16 cards. In extreme cases, there are hosts that can accommodate 16 cards. + */ +#define HL_MAX_MINORS 256 + +/* + * Registers read & write functions. 
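+ *
+ * The RREG32/WREG32 family below dispatches through the per-ASIC rreg/wreg
+ * callbacks, so it assumes a local 'struct hl_device *hdev' variable is in
+ * scope. A minimal usage sketch, with hypothetical register/field names:
+ *
+ *	u32 sts = RREG32(mmFOO_STATUS);               read a register
+ *	WREG32(mmFOO_CTRL, 0x1);                      write a register
+ *	WREG32_OR(mmFOO_CTRL, BIT(3));                set bit 3, keep other bits
+ *	RMWREG32(mmFOO_CTRL, 2, FOO_CTRL_MODE_MASK);  put 2 in the MODE field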
+ */ + +u32 hl_rreg(struct hl_device *hdev, u32 reg); +void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); + +#define RREG32(reg) hdev->asic_funcs->rreg(hdev, (reg)) +#define WREG32(reg, v) hdev->asic_funcs->wreg(hdev, (reg), (v)) +#define DREG32(reg) pr_info("REGISTER: " #reg " : 0x%08X\n", \ + hdev->asic_funcs->rreg(hdev, (reg))) + +#define WREG32_P(reg, val, mask) \ + do { \ + u32 tmp_ = RREG32(reg); \ + tmp_ &= (mask); \ + tmp_ |= ((val) & ~(mask)); \ + WREG32(reg, tmp_); \ + } while (0) +#define WREG32_AND(reg, and) WREG32_P(reg, 0, and) +#define WREG32_OR(reg, or) WREG32_P(reg, or, ~(or)) + +#define RMWREG32(reg, val, mask) \ + do { \ + u32 tmp_ = RREG32(reg); \ + tmp_ &= ~(mask); \ + tmp_ |= ((val) << __ffs(mask)); \ + WREG32(reg, tmp_); \ + } while (0) + +#define RREG32_MASK(reg, mask) ((RREG32(reg) & mask) >> __ffs(mask)) + +#define REG_FIELD_SHIFT(reg, field) reg##_##field##_SHIFT +#define REG_FIELD_MASK(reg, field) reg##_##field##_MASK +#define WREG32_FIELD(reg, offset, field, val) \ + WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & \ + ~REG_FIELD_MASK(reg, field)) | \ + (val) << REG_FIELD_SHIFT(reg, field)) + +/* Timeout should be longer when working with simulator but cap the + * increased timeout to some maximum + */ +#define hl_poll_timeout_common(hdev, addr, val, cond, sleep_us, timeout_us, elbi) \ +({ \ + ktime_t __timeout; \ + u32 __elbi_read; \ + int __rc = 0; \ + if (hdev->pdev) \ + __timeout = ktime_add_us(ktime_get(), timeout_us); \ + else \ + __timeout = ktime_add_us(ktime_get(),\ + min((u64)(timeout_us * 10), \ + (u64) HL_SIM_MAX_TIMEOUT_US)); \ + might_sleep_if(sleep_us); \ + for (;;) { \ + if (elbi) { \ + __rc = hl_pci_elbi_read(hdev, addr, &__elbi_read); \ + if (__rc) \ + break; \ + (val) = __elbi_read; \ + } else {\ + (val) = RREG32((u32)(addr)); \ + } \ + if (cond) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \ + if (elbi) { \ + __rc = hl_pci_elbi_read(hdev, addr, &__elbi_read); \ + if (__rc) \ + break; \ + (val) = __elbi_read; \ + } else {\ + (val) = RREG32((u32)(addr)); \ + } \ + break; \ + } \ + if (sleep_us) \ + usleep_range((sleep_us >> 2) + 1, sleep_us); \ + } \ + __rc ? __rc : ((cond) ? 0 : -ETIMEDOUT); \ +}) + +#define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \ + hl_poll_timeout_common(hdev, addr, val, cond, sleep_us, timeout_us, false) + +#define hl_poll_timeout_elbi(hdev, addr, val, cond, sleep_us, timeout_us) \ + hl_poll_timeout_common(hdev, addr, val, cond, sleep_us, timeout_us, true) + +/* + * poll array of register addresses. + * condition is satisfied if all registers values match the expected value. + * once some register in the array satisfies the condition it will not be polled again, + * this is done both for efficiency and due to some registers are "clear on read". 
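+ *
+ * A minimal usage sketch, assuming a hypothetical array of queue-ready
+ * registers that should each read back 0x1 within one second:
+ *
+ *	static const u32 q_rdy_regs[] = { mmQ0_RDY, mmQ1_RDY };
+ *	int rc = hl_poll_reg_array_timeout(hdev, q_rdy_regs,
+ *				ARRAY_SIZE(q_rdy_regs), 0x1, 1000, 1000000);
+ *
+ * rc becomes 0 once every register has matched, -EINVAL if the array has 64
+ * entries or more, and -ETIMEDOUT if the timeout expires first.
+ *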
+ * TODO: use read from PCI bar in other places in the code (SW-91406) + */ +#define hl_poll_reg_array_timeout_common(hdev, addr_arr, arr_size, expected_val, sleep_us, \ + timeout_us, elbi) \ +({ \ + ktime_t __timeout; \ + u64 __elem_bitmask; \ + u32 __read_val; \ + u8 __arr_idx; \ + int __rc = 0; \ + \ + if (hdev->pdev) \ + __timeout = ktime_add_us(ktime_get(), timeout_us); \ + else \ + __timeout = ktime_add_us(ktime_get(),\ + min(((u64)timeout_us * 10), \ + (u64) HL_SIM_MAX_TIMEOUT_US)); \ + \ + might_sleep_if(sleep_us); \ + if (arr_size >= 64) \ + __rc = -EINVAL; \ + else \ + __elem_bitmask = BIT_ULL(arr_size) - 1; \ + for (;;) { \ + if (__rc) \ + break; \ + for (__arr_idx = 0; __arr_idx < (arr_size); __arr_idx++) { \ + if (!(__elem_bitmask & BIT_ULL(__arr_idx))) \ + continue; \ + if (elbi) { \ + __rc = hl_pci_elbi_read(hdev, (addr_arr)[__arr_idx], &__read_val); \ + if (__rc) \ + break; \ + } else { \ + __read_val = RREG32((u32)(addr_arr)[__arr_idx]); \ + } \ + if (__read_val == (expected_val)) \ + __elem_bitmask &= ~BIT_ULL(__arr_idx); \ + } \ + if (__rc || (__elem_bitmask == 0)) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) \ + break; \ + if (sleep_us) \ + usleep_range((sleep_us >> 2) + 1, sleep_us); \ + } \ + __rc ? __rc : ((__elem_bitmask == 0) ? 0 : -ETIMEDOUT); \ +}) + +#define hl_poll_reg_array_timeout(hdev, addr_arr, arr_size, expected_val, sleep_us, \ + timeout_us) \ + hl_poll_reg_array_timeout_common(hdev, addr_arr, arr_size, expected_val, sleep_us, \ + timeout_us, false) + +#define hl_poll_reg_array_timeout_elbi(hdev, addr_arr, arr_size, expected_val, sleep_us, \ + timeout_us) \ + hl_poll_reg_array_timeout_common(hdev, addr_arr, arr_size, expected_val, sleep_us, \ + timeout_us, true) + +/* + * address in this macro points always to a memory location in the + * host's (server's) memory. That location is updated asynchronously + * either by the direct access of the device or by another core. + * + * To work both in LE and BE architectures, we need to distinguish between the + * two states (device or another core updates the memory location). Therefore, + * if mem_written_by_device is true, the host memory being polled will be + * updated directly by the device. If false, the host memory being polled will + * be updated by host CPU. Required so host knows whether or not the memory + * might need to be byte-swapped before returning value to caller. + */ +#define hl_poll_timeout_memory(hdev, addr, val, cond, sleep_us, timeout_us, \ + mem_written_by_device) \ +({ \ + ktime_t __timeout; \ + if (hdev->pdev) \ + __timeout = ktime_add_us(ktime_get(), timeout_us); \ + else \ + __timeout = ktime_add_us(ktime_get(),\ + min((u64)(timeout_us * 100), \ + (u64) HL_SIM_MAX_TIMEOUT_US)); \ + might_sleep_if(sleep_us); \ + for (;;) { \ + /* Verify we read updates done by other cores or by device */ \ + mb(); \ + (val) = *((u32 *)(addr)); \ + if (mem_written_by_device) \ + (val) = le32_to_cpu(*(__le32 *) &(val)); \ + if (cond) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \ + (val) = *((u32 *)(addr)); \ + if (mem_written_by_device) \ + (val) = le32_to_cpu(*(__le32 *) &(val)); \ + break; \ + } \ + if (sleep_us) \ + usleep_range((sleep_us >> 2) + 1, sleep_us); \ + } \ + (cond) ? 
0 : -ETIMEDOUT; \ +}) + +#define HL_USR_MAPPED_BLK_INIT(blk, base, sz) \ +({ \ + struct user_mapped_block *p = blk; \ +\ + p->address = base; \ + p->size = sz; \ +}) + +#define HL_USR_INTR_STRUCT_INIT(usr_intr, hdev, intr_id, decoder) \ +({ \ + usr_intr.hdev = hdev; \ + usr_intr.interrupt_id = intr_id; \ + usr_intr.is_decoder = decoder; \ + INIT_LIST_HEAD(&usr_intr.wait_list_head); \ + spin_lock_init(&usr_intr.wait_list_lock); \ +}) + +struct hwmon_chip_info; + +/** + * struct hl_device_reset_work - reset workqueue task wrapper. + * @wq: work queue for device reset procedure. + * @reset_work: reset work to be done. + * @hdev: habanalabs device structure. + * @flags: reset flags. + */ +struct hl_device_reset_work { + struct workqueue_struct *wq; + struct delayed_work reset_work; + struct hl_device *hdev; + u32 flags; +}; + +/** + * struct hl_mmu_hr_pgt_priv - used for holding per-device mmu host-resident + * page-table internal information. + * @mmu_pgt_pool: pool of page tables used by a host-resident MMU for + * allocating hops. + * @mmu_asid_hop0: per-ASID array of host-resident hop0 tables. + */ +struct hl_mmu_hr_priv { + struct gen_pool *mmu_pgt_pool; + struct pgt_info *mmu_asid_hop0; +}; + +/** + * struct hl_mmu_dr_pgt_priv - used for holding per-device mmu device-resident + * page-table internal information. + * @mmu_pgt_pool: pool of page tables used by MMU for allocating hops. + * @mmu_shadow_hop0: shadow array of hop0 tables. + */ +struct hl_mmu_dr_priv { + struct gen_pool *mmu_pgt_pool; + void *mmu_shadow_hop0; +}; + +/** + * struct hl_mmu_priv - used for holding per-device mmu internal information. + * @dr: information on the device-resident MMU, when exists. + * @hr: information on the host-resident MMU, when exists. + */ +struct hl_mmu_priv { + struct hl_mmu_dr_priv dr; + struct hl_mmu_hr_priv hr; +}; + +/** + * struct hl_mmu_per_hop_info - A structure describing one TLB HOP and its entry + * that was created in order to translate a virtual address to a + * physical one. + * @hop_addr: The address of the hop. + * @hop_pte_addr: The address of the hop entry. + * @hop_pte_val: The value in the hop entry. + */ +struct hl_mmu_per_hop_info { + u64 hop_addr; + u64 hop_pte_addr; + u64 hop_pte_val; +}; + +/** + * struct hl_mmu_hop_info - A structure describing the TLB hops and their + * hop-entries that were created in order to translate a virtual address to a + * physical one. + * @scrambled_vaddr: The value of the virtual address after scrambling. This + * address replaces the original virtual-address when mapped + * in the MMU tables. + * @unscrambled_paddr: The un-scrambled physical address. + * @hop_info: Array holding the per-hop information used for the translation. + * @used_hops: The number of hops used for the translation. + * @range_type: virtual address range type. + */ +struct hl_mmu_hop_info { + u64 scrambled_vaddr; + u64 unscrambled_paddr; + struct hl_mmu_per_hop_info hop_info[MMU_ARCH_6_HOPS]; + u32 used_hops; + enum hl_va_range_type range_type; +}; + +/** + * struct hl_hr_mmu_funcs - Device related host resident MMU functions. + * @get_hop0_pgt_info: get page table info structure for HOP0. + * @get_pgt_info: get page table info structure for HOP other than HOP0. + * @add_pgt_info: add page table info structure to hash. + * @get_tlb_mapping_params: get mapping parameters needed for getting TLB info for specific mapping. 
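+ *
+ * For a host-resident MMU these callbacks are used when servicing
+ * hl_mmu_get_tlb_info(), which fills a struct hl_mmu_hop_info. A minimal
+ * sketch of how a caller might walk the result (assumed usage, error
+ * handling elided):
+ *
+ *	struct hl_mmu_hop_info hops;
+ *	int i;
+ *
+ *	if (!hl_mmu_get_tlb_info(ctx, virt_addr, &hops))
+ *		for (i = 0 ; i < hops.used_hops ; i++)
+ *			pr_debug("hop %d: pte 0x%llx -> 0x%llx\n", i,
+ *				 hops.hop_info[i].hop_pte_addr,
+ *				 hops.hop_info[i].hop_pte_val);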
+ */ +struct hl_hr_mmu_funcs { + struct pgt_info *(*get_hop0_pgt_info)(struct hl_ctx *ctx); + struct pgt_info *(*get_pgt_info)(struct hl_ctx *ctx, u64 phys_hop_addr); + void (*add_pgt_info)(struct hl_ctx *ctx, struct pgt_info *pgt_info, dma_addr_t phys_addr); + int (*get_tlb_mapping_params)(struct hl_device *hdev, struct hl_mmu_properties **mmu_prop, + struct hl_mmu_hop_info *hops, + u64 virt_addr, bool *is_huge); +}; + +/** + * struct hl_mmu_funcs - Device related MMU functions. + * @init: initialize the MMU module. + * @fini: release the MMU module. + * @ctx_init: Initialize a context for using the MMU module. + * @ctx_fini: disable a ctx from using the mmu module. + * @map: maps a virtual address to physical address for a context. + * @unmap: unmap a virtual address of a context. + * @flush: flush all writes from all cores to reach device MMU. + * @swap_out: marks all mapping of the given context as swapped out. + * @swap_in: marks all mapping of the given context as swapped in. + * @get_tlb_info: returns the list of hops and hop-entries used that were + * created in order to translate the giver virtual address to a + * physical one. + * @hr_funcs: functions specific to host resident MMU. + */ +struct hl_mmu_funcs { + int (*init)(struct hl_device *hdev); + void (*fini)(struct hl_device *hdev); + int (*ctx_init)(struct hl_ctx *ctx); + void (*ctx_fini)(struct hl_ctx *ctx); + int (*map)(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size, + bool is_dram_addr); + int (*unmap)(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr); + void (*flush)(struct hl_ctx *ctx); + void (*swap_out)(struct hl_ctx *ctx); + void (*swap_in)(struct hl_ctx *ctx); + int (*get_tlb_info)(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops); + struct hl_hr_mmu_funcs hr_funcs; +}; + +/** + * struct hl_prefetch_work - prefetch work structure handler + * @pf_work: actual work struct. + * @ctx: compute context. + * @va: virtual address to pre-fetch. + * @size: pre-fetch size. + * @flags: operation flags. + * @asid: ASID for maintenance operation. + */ +struct hl_prefetch_work { + struct work_struct pf_work; + struct hl_ctx *ctx; + u64 va; + u64 size; + u32 flags; + u32 asid; +}; + +/* + * number of user contexts allowed to call wait_for_multi_cs ioctl in + * parallel + */ +#define MULTI_CS_MAX_USER_CTX 2 + +/** + * struct multi_cs_completion - multi CS wait completion. + * @completion: completion of any of the CS in the list + * @lock: spinlock for the completion structure + * @timestamp: timestamp for the multi-CS completion + * @stream_master_qid_map: bitmap of all stream masters on which the multi-CS + * is waiting + * @used: 1 if in use, otherwise 0 + */ +struct multi_cs_completion { + struct completion completion; + spinlock_t lock; + s64 timestamp; + u32 stream_master_qid_map; + u8 used; +}; + +/** + * struct multi_cs_data - internal data for multi CS call + * @ctx: pointer to the context structure + * @fence_arr: array of fences of all CSs + * @seq_arr: array of CS sequence numbers + * @timeout_jiffies: timeout in jiffies for waiting for CS to complete + * @timestamp: timestamp of first completed CS + * @wait_status: wait for CS status + * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0) + * @arr_len: fence_arr and seq_arr array length + * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0) + * @update_ts: update timestamp. 1- update the timestamp, otherwise 0. 
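+ *
+ * For example (illustrative values only): when waiting on arr_len == 3 CSs
+ * and the CSs at indices 0 and 2 have completed, completion_bitmap holds 0x5
+ * (bits 0 and 2 set).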
+ */ +struct multi_cs_data { + struct hl_ctx *ctx; + struct hl_fence **fence_arr; + u64 *seq_arr; + s64 timeout_jiffies; + s64 timestamp; + long wait_status; + u32 completion_bitmap; + u8 arr_len; + u8 gone_cs; + u8 update_ts; +}; + +/** + * struct hl_clk_throttle_timestamp - current/last clock throttling timestamp + * @start: timestamp taken when 'start' event is received in driver + * @end: timestamp taken when 'end' event is received in driver + */ +struct hl_clk_throttle_timestamp { + ktime_t start; + ktime_t end; +}; + +/** + * struct hl_clk_throttle - keeps current/last clock throttling timestamps + * @timestamp: timestamp taken by driver and firmware, index 0 refers to POWER + * index 1 refers to THERMAL + * @lock: protects this structure as it can be accessed from both event queue + * context and info_ioctl context + * @current_reason: bitmask represents the current clk throttling reasons + * @aggregated_reason: bitmask represents aggregated clk throttling reasons since driver load + */ +struct hl_clk_throttle { + struct hl_clk_throttle_timestamp timestamp[HL_CLK_THROTTLE_TYPE_MAX]; + struct mutex lock; + u32 current_reason; + u32 aggregated_reason; +}; + +/** + * struct user_mapped_block - describes a hw block allowed to be mmapped by user + * @address: physical HW block address + * @size: allowed size for mmap + */ +struct user_mapped_block { + u32 address; + u32 size; +}; + +/** + * struct cs_timeout_info - info of last CS timeout occurred. + * @timestamp: CS timeout timestamp. + * @write_enable: if set writing to CS parameters in the structure is enabled. otherwise - disabled, + * so the first (root cause) CS timeout will not be overwritten. + * @seq: CS timeout sequence number. + */ +struct cs_timeout_info { + ktime_t timestamp; + atomic_t write_enable; + u64 seq; +}; + +/** + * struct razwi_info - info about last razwi error occurred. + * @timestamp: razwi timestamp. + * @write_enable: if set writing to razwi parameters in the structure is enabled. + * otherwise - disabled, so the first (root cause) razwi will not be overwritten. + * @addr: address that caused razwi. + * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does + * not have engine id it will be set to U16_MAX. + * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible + * engines which one them caused the razwi. In that case, it will contain the + * second possible engine id, otherwise it will be set to U16_MAX. + * @non_engine_initiator: in case the initiator of the razwi does not have engine id. + * @type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX. + */ +struct razwi_info { + ktime_t timestamp; + atomic_t write_enable; + u64 addr; + u16 engine_id_1; + u16 engine_id_2; + u8 non_engine_initiator; + u8 type; +}; + +#define MAX_QMAN_STREAMS_INFO 4 +#define OPCODE_INFO_MAX_ADDR_SIZE 8 +/** + * struct undefined_opcode_info - info about last undefined opcode error + * @timestamp: timestamp of the undefined opcode error + * @cb_addr_streams: CB addresses (per stream) that are currently exists in the PQ + * entries. In case all streams array entries are + * filled with values, it means the execution was in Lower-CP. + * @cq_addr: the address of the current handled command buffer + * @cq_size: the size of the current handled command buffer + * @cb_addr_streams_len: num of streams - actual len of cb_addr_streams array. 
+ * should be equal to 1 incase of undefined opcode + * in Upper-CP (specific stream) and equal to 4 incase + * of undefined opcode in Lower-CP. + * @engine_id: engine-id that the error occurred on + * @stream_id: the stream id the error occurred on. In case the stream equals to + * MAX_QMAN_STREAMS_INFO it means the error occurred on a Lower-CP. + * @write_enable: if set, writing to undefined opcode parameters in the structure + * is enable so the first (root cause) undefined opcode will not be + * overwritten. + */ +struct undefined_opcode_info { + ktime_t timestamp; + u64 cb_addr_streams[MAX_QMAN_STREAMS_INFO][OPCODE_INFO_MAX_ADDR_SIZE]; + u64 cq_addr; + u32 cq_size; + u32 cb_addr_streams_len; + u32 engine_id; + u32 stream_id; + bool write_enable; +}; + +/** + * struct hl_error_info - holds information collected during an error. + * @cs_timeout: CS timeout error information. + * @razwi: razwi information. + * @undef_opcode: undefined opcode information + */ +struct hl_error_info { + struct cs_timeout_info cs_timeout; + struct razwi_info razwi; + struct undefined_opcode_info undef_opcode; +}; + +/** + * struct hl_reset_info - holds current device reset information. + * @lock: lock to protect critical reset flows. + * @compute_reset_cnt: number of compute resets since the driver was loaded. + * @hard_reset_cnt: number of hard resets since the driver was loaded. + * @hard_reset_schedule_flags: hard reset is scheduled to after current compute reset, + * here we hold the hard reset flags. + * @in_reset: is device in reset flow. + * @in_compute_reset: Device is currently in reset but not in hard-reset. + * @needs_reset: true if reset_on_lockup is false and device should be reset + * due to lockup. + * @hard_reset_pending: is there a hard reset work pending. + * @curr_reset_cause: saves an enumerated reset cause when a hard reset is + * triggered, and cleared after it is shared with preboot. + * @prev_reset_trigger: saves the previous trigger which caused a reset, overridden + * with a new value on next reset + * @reset_trigger_repeated: set if device reset is triggered more than once with + * same cause. + * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to + * complete instead. + */ +struct hl_reset_info { + spinlock_t lock; + u32 compute_reset_cnt; + u32 hard_reset_cnt; + u32 hard_reset_schedule_flags; + u8 in_reset; + u8 in_compute_reset; + u8 needs_reset; + u8 hard_reset_pending; + + u8 curr_reset_cause; + u8 prev_reset_trigger; + u8 reset_trigger_repeated; + + u8 skip_reset_on_timeout; +}; + +/** + * struct hl_device - habanalabs device structure. + * @pdev: pointer to PCI device, can be NULL in case of simulator device. + * @pcie_bar_phys: array of available PCIe bars physical addresses. + * (required only for PCI address match mode) + * @pcie_bar: array of available PCIe bars virtual addresses. + * @rmmio: configuration area address on SRAM. + * @cdev: related char device. + * @cdev_ctrl: char device for control operations only (INFO IOCTL) + * @dev: related kernel basic device structure. + * @dev_ctrl: related kernel device structure for the control device + * @work_heartbeat: delayed work for CPU-CP is-alive check. + * @device_reset_work: delayed work which performs hard reset + * @asic_name: ASIC specific name. + * @asic_type: ASIC specific type. + * @completion_queue: array of hl_cq. + * @user_interrupt: array of hl_user_interrupt. upon the corresponding user + * interrupt, driver will monitor the list of fences + * registered to this interrupt. 
+ * @common_user_cq_interrupt: common user CQ interrupt for all user CQ interrupts. + * upon any user CQ interrupt, driver will monitor the + * list of fences registered to this common structure. + * @common_decoder_interrupt: common decoder interrupt for all user decoder interrupts. + * @shadow_cs_queue: pointer to a shadow queue that holds pointers to + * outstanding command submissions. + * @cq_wq: work queues of completion queues for executing work in process + * context. + * @eq_wq: work queue of event queue for executing work in process context. + * @cs_cmplt_wq: work queue of CS completions for executing work in process + * context. + * @ts_free_obj_wq: work queue for timestamp registration objects release. + * @pf_wq: work queue for MMU pre-fetch operations. + * @kernel_ctx: Kernel driver context structure. + * @kernel_queues: array of hl_hw_queue. + * @cs_mirror_list: CS mirror list for TDR. + * @cs_mirror_lock: protects cs_mirror_list. + * @kernel_mem_mgr: memory manager for memory buffers with lifespan of driver. + * @event_queue: event queue for IRQ from CPU-CP. + * @dma_pool: DMA pool for small allocations. + * @cpu_accessible_dma_mem: Host <-> CPU-CP shared memory CPU address. + * @cpu_accessible_dma_address: Host <-> CPU-CP shared memory DMA address. + * @cpu_accessible_dma_pool: Host <-> CPU-CP shared memory pool. + * @asid_bitmap: holds used/available ASIDs. + * @asid_mutex: protects asid_bitmap. + * @send_cpu_message_lock: enforces only one message in Host <-> CPU-CP queue. + * @debug_lock: protects critical section of setting debug mode for device + * @mmu_lock: protects the MMU page tables and invalidation h/w. Although the + * page tables are per context, the invalidation h/w is per MMU. + * Therefore, we can't allow multiple contexts (we only have two, + * user and kernel) to access the invalidation h/w at the same time. + * In addition, any change to the PGT, modifying the MMU hash or + * walking the PGT requires talking this lock. + * @asic_prop: ASIC specific immutable properties. + * @asic_funcs: ASIC specific functions. + * @asic_specific: ASIC specific information to use only from ASIC files. + * @vm: virtual memory manager for MMU. + * @hwmon_dev: H/W monitor device. + * @hl_chip_info: ASIC's sensors information. + * @device_status_description: device status description. + * @hl_debugfs: device's debugfs manager. + * @cb_pool: list of pre allocated CBs. + * @cb_pool_lock: protects the CB pool. + * @internal_cb_pool_virt_addr: internal command buffer pool virtual address. + * @internal_cb_pool_dma_addr: internal command buffer pool dma address. + * @internal_cb_pool: internal command buffer memory pool. + * @internal_cb_va_base: internal cb pool mmu virtual address base + * @fpriv_list: list of file private data structures. Each structure is created + * when a user opens the device + * @fpriv_ctrl_list: list of file private data structures. Each structure is created + * when a user opens the control device + * @fpriv_list_lock: protects the fpriv_list + * @fpriv_ctrl_list_lock: protects the fpriv_ctrl_list + * @aggregated_cs_counters: aggregated cs counters among all contexts + * @mmu_priv: device-specific MMU data. + * @mmu_func: device-related MMU functions. + * @dec: list of decoder sw instance + * @fw_loader: FW loader manager. + * @pci_mem_region: array of memory regions in the PCI + * @state_dump_specs: constants and dictionaries needed to dump system state. + * @multi_cs_completion: array of multi-CS completion. 
+ * @clk_throttling: holds information about current/previous clock throttling events + * @captured_err_info: holds information about errors. + * @reset_info: holds current device reset information. + * @stream_master_qid_arr: pointer to array with QIDs of master streams. + * @fw_major_version: major version of current loaded preboot. + * @fw_minor_version: minor version of current loaded preboot. + * @dram_used_mem: current DRAM memory consumption. + * @memory_scrub_val: the value to which the dram will be scrubbed to using cb scrub_device_dram + * @timeout_jiffies: device CS timeout value. + * @max_power: the max power of the device, as configured by the sysadmin. This + * value is saved so in case of hard-reset, the driver will restore + * this value and update the F/W after the re-initialization + * @boot_error_status_mask: contains a mask of the device boot error status. + * Each bit represents a different error, according to + * the defines in hl_boot_if.h. If the bit is cleared, + * the error will be ignored by the driver during + * device initialization. Mainly used to debug and + * workaround firmware bugs + * @dram_pci_bar_start: start bus address of PCIe bar towards DRAM. + * @last_successful_open_ktime: timestamp (ktime) of the last successful device open. + * @last_successful_open_jif: timestamp (jiffies) of the last successful + * device open. + * @last_open_session_duration_jif: duration (jiffies) of the last device open + * session. + * @open_counter: number of successful device open operations. + * @fw_poll_interval_usec: FW status poll interval in usec. + * used for CPU boot status + * @fw_comms_poll_interval_usec: FW comms/protocol poll interval in usec. + * used for COMMs protocols cmds(COMMS_STS_*) + * @dram_binning: contains mask of drams that is received from the f/w which indicates which + * drams are binned-out + * @tpc_binning: contains mask of tpc engines that is received from the f/w which indicates which + * tpc engines are binned-out + * @card_type: Various ASICs have several card types. This indicates the card + * type of the current device. + * @major: habanalabs kernel driver major. + * @high_pll: high PLL profile frequency. + * @decoder_binning: contains mask of decoder engines that is received from the f/w which + * indicates which decoder engines are binned-out + * @edma_binning: contains mask of edma engines that is received from the f/w which + * indicates which edma engines are binned-out + * @id: device minor. + * @id_control: minor of the control device. + * @cdev_idx: char device index. Used for setting its name. + * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit + * addresses. + * @is_in_dram_scrub: true if dram scrub operation is on going. + * @disabled: is device disabled. + * @late_init_done: is late init stage was done during initialization. + * @hwmon_initialized: is H/W monitor sensors was initialized. + * @reset_on_lockup: true if a reset should be done in case of stuck CS, false + * otherwise. + * @dram_default_page_mapping: is DRAM default page mapping enabled. + * @memory_scrub: true to perform device memory scrub in various locations, + * such as context-switch, context close, page free, etc. + * @pmmu_huge_range: is a different virtual addresses range used for PMMU with + * huge pages. + * @init_done: is the initialization of the device done. 
+ * @device_cpu_disabled: is the device CPU disabled (due to timeouts) + * @in_debug: whether the device is in a state where the profiling/tracing infrastructure + * can be used. This indication is needed because in some ASICs we need to do + * specific operations to enable that infrastructure. + * @cdev_sysfs_created: were char devices and sysfs nodes created. + * @stop_on_err: true if engines should stop on error. + * @supports_sync_stream: is sync stream supported. + * @sync_stream_queue_idx: helper index for sync stream queues initialization. + * @collective_mon_idx: helper index for collective initialization + * @supports_coresight: is CoreSight supported. + * @supports_cb_mapping: is mapping a CB to the device's MMU supported. + * @process_kill_trial_cnt: number of trials reset thread tried killing + * user processes + * @device_fini_pending: true if device_fini was called and might be + * waiting for the reset thread to finish + * @supports_staged_submission: true if staged submissions are supported + * @device_cpu_is_halted: Flag to indicate whether the device CPU was already + * halted. We can't halt it again because the COMMS + * protocol will throw an error. Relevant only for + * cases where Linux was not loaded to device CPU + * @supports_wait_for_multi_cs: true if wait for multi CS is supported + * @is_compute_ctx_active: Whether there is an active compute context executing. + * @compute_ctx_in_release: true if the current compute context is being released. + * @supports_mmu_prefetch: true if prefetch is supported, otherwise false. + * @reset_upon_device_release: reset the device when the user closes the file descriptor of the + * device. + * @nic_ports_mask: Controls which NIC ports are enabled. Used only for testing. + * @fw_components: Controls which f/w components to load to the device. There are multiple f/w + * stages and sometimes we want to stop at a certain stage. Used only for testing. + * @mmu_enable: Whether to enable or disable the device MMU(s). Used only for testing. + * @cpu_queues_enable: Whether to enable queues communication vs. the f/w. Used only for testing. + * @pldm: Whether we are running in Palladium environment. Used only for testing. + * @hard_reset_on_fw_events: Whether to do device hard-reset when a fatal event is received from + * the f/w. Used only for testing. + * @bmc_enable: Whether we are running in a box with BMC. Used only for testing. + * @reset_on_preboot_fail: Whether to reset the device if preboot f/w fails to load. + * Used only for testing. + * @heartbeat: Controls if we want to enable the heartbeat mechanism vs. the f/w, which verifies + * that the f/w is always alive. Used only for testing. + * @supports_ctx_switch: true if a ctx switch is required upon first submission. 
+ */ +struct hl_device { + struct pci_dev *pdev; + u64 pcie_bar_phys[HL_PCI_NUM_BARS]; + void __iomem *pcie_bar[HL_PCI_NUM_BARS]; + void __iomem *rmmio; + struct cdev cdev; + struct cdev cdev_ctrl; + struct device *dev; + struct device *dev_ctrl; + struct delayed_work work_heartbeat; + struct hl_device_reset_work device_reset_work; + char asic_name[HL_STR_MAX]; + char status[HL_DEV_STS_MAX][HL_STR_MAX]; + enum hl_asic_type asic_type; + struct hl_cq *completion_queue; + struct hl_user_interrupt *user_interrupt; + struct hl_user_interrupt common_user_cq_interrupt; + struct hl_user_interrupt common_decoder_interrupt; + struct hl_cs **shadow_cs_queue; + struct workqueue_struct **cq_wq; + struct workqueue_struct *eq_wq; + struct workqueue_struct *cs_cmplt_wq; + struct workqueue_struct *ts_free_obj_wq; + struct workqueue_struct *pf_wq; + struct hl_ctx *kernel_ctx; + struct hl_hw_queue *kernel_queues; + struct list_head cs_mirror_list; + spinlock_t cs_mirror_lock; + struct hl_mem_mgr kernel_mem_mgr; + struct hl_eq event_queue; + struct dma_pool *dma_pool; + void *cpu_accessible_dma_mem; + dma_addr_t cpu_accessible_dma_address; + struct gen_pool *cpu_accessible_dma_pool; + unsigned long *asid_bitmap; + struct mutex asid_mutex; + struct mutex send_cpu_message_lock; + struct mutex debug_lock; + struct mutex mmu_lock; + struct asic_fixed_properties asic_prop; + const struct hl_asic_funcs *asic_funcs; + void *asic_specific; + struct hl_vm vm; + struct device *hwmon_dev; + struct hwmon_chip_info *hl_chip_info; + + struct hl_dbg_device_entry hl_debugfs; + + struct list_head cb_pool; + spinlock_t cb_pool_lock; + + void *internal_cb_pool_virt_addr; + dma_addr_t internal_cb_pool_dma_addr; + struct gen_pool *internal_cb_pool; + u64 internal_cb_va_base; + + struct list_head fpriv_list; + struct list_head fpriv_ctrl_list; + struct mutex fpriv_list_lock; + struct mutex fpriv_ctrl_list_lock; + + struct hl_cs_counters_atomic aggregated_cs_counters; + + struct hl_mmu_priv mmu_priv; + struct hl_mmu_funcs mmu_func[MMU_NUM_PGT_LOCATIONS]; + + struct hl_dec *dec; + + struct fw_load_mgr fw_loader; + + struct pci_mem_region pci_mem_region[PCI_REGION_NUMBER]; + + struct hl_state_dump_specs state_dump_specs; + + struct multi_cs_completion multi_cs_completion[ + MULTI_CS_MAX_USER_CTX]; + struct hl_clk_throttle clk_throttling; + struct hl_error_info captured_err_info; + + struct hl_reset_info reset_info; + + u32 *stream_master_qid_arr; + u32 fw_major_version; + u32 fw_minor_version; + atomic64_t dram_used_mem; + u64 memory_scrub_val; + u64 timeout_jiffies; + u64 max_power; + u64 boot_error_status_mask; + u64 dram_pci_bar_start; + u64 last_successful_open_jif; + u64 last_open_session_duration_jif; + u64 open_counter; + u64 fw_poll_interval_usec; + ktime_t last_successful_open_ktime; + u64 fw_comms_poll_interval_usec; + u64 dram_binning; + u64 tpc_binning; + + enum cpucp_card_types card_type; + u32 major; + u32 high_pll; + u32 decoder_binning; + u32 edma_binning; + u16 id; + u16 id_control; + u16 cdev_idx; + u16 cpu_pci_msb_addr; + u8 is_in_dram_scrub; + u8 disabled; + u8 late_init_done; + u8 hwmon_initialized; + u8 reset_on_lockup; + u8 dram_default_page_mapping; + u8 memory_scrub; + u8 pmmu_huge_range; + u8 init_done; + u8 device_cpu_disabled; + u8 in_debug; + u8 cdev_sysfs_created; + u8 stop_on_err; + u8 supports_sync_stream; + u8 sync_stream_queue_idx; + u8 collective_mon_idx; + u8 supports_coresight; + u8 supports_cb_mapping; + u8 process_kill_trial_cnt; + u8 device_fini_pending; + u8 supports_staged_submission; + u8 
device_cpu_is_halted; + u8 supports_wait_for_multi_cs; + u8 stream_master_qid_arr_size; + u8 is_compute_ctx_active; + u8 compute_ctx_in_release; + u8 supports_mmu_prefetch; + u8 reset_upon_device_release; + u8 supports_ctx_switch; + + /* Parameters for bring-up */ + u64 nic_ports_mask; + u64 fw_components; + u8 mmu_enable; + u8 cpu_queues_enable; + u8 pldm; + u8 hard_reset_on_fw_events; + u8 bmc_enable; + u8 reset_on_preboot_fail; + u8 heartbeat; +}; + + +/** + * struct hl_cs_encaps_sig_handle - encapsulated signals handle structure + * @refcount: refcount used to protect removing this id when several + * wait cs are used to wait of the reserved encaps signals. + * @hdev: pointer to habanalabs device structure. + * @hw_sob: pointer to H/W SOB used in the reservation. + * @ctx: pointer to the user's context data structure + * @cs_seq: staged cs sequence which contains encapsulated signals + * @id: idr handler id to be used to fetch the handler info + * @q_idx: stream queue index + * @pre_sob_val: current SOB value before reservation + * @count: signals number + */ +struct hl_cs_encaps_sig_handle { + struct kref refcount; + struct hl_device *hdev; + struct hl_hw_sob *hw_sob; + struct hl_ctx *ctx; + u64 cs_seq; + u32 id; + u32 q_idx; + u32 pre_sob_val; + u32 count; +}; + +/* + * IOCTLs + */ + +/** + * typedef hl_ioctl_t - typedef for ioctl function in the driver + * @hpriv: pointer to the FD's private data, which contains state of + * user process + * @data: pointer to the input/output arguments structure of the IOCTL + * + * Return: 0 for success, negative value for error + */ +typedef int hl_ioctl_t(struct hl_fpriv *hpriv, void *data); + +/** + * struct hl_ioctl_desc - describes an IOCTL entry of the driver. + * @cmd: the IOCTL code as created by the kernel macros. + * @func: pointer to the driver's function that should be called for this IOCTL. + */ +struct hl_ioctl_desc { + unsigned int cmd; + hl_ioctl_t *func; +}; + + +/* + * Kernel module functions that can be accessed by entire module + */ + +/** + * hl_get_sg_info() - get number of pages and the DMA address from SG list. + * @sg: the SG list. + * @dma_addr: pointer to DMA address to return. + * + * Calculate the number of consecutive pages described by the SG list. Take the + * offset of the address in the first page, add to it the length and round it up + * to the number of needed pages. + */ +static inline u32 hl_get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr) +{ + *dma_addr = sg_dma_address(sg); + + return ((((*dma_addr) & (PAGE_SIZE - 1)) + sg_dma_len(sg)) + + (PAGE_SIZE - 1)) >> PAGE_SHIFT; +} + +/** + * hl_mem_area_inside_range() - Checks whether address+size are inside a range. + * @address: The start address of the area we want to validate. + * @size: The size in bytes of the area we want to validate. + * @range_start_address: The start address of the valid range. + * @range_end_address: The end address of the valid range. + * + * Return: true if the area is inside the valid range, false otherwise. + */ +static inline bool hl_mem_area_inside_range(u64 address, u64 size, + u64 range_start_address, u64 range_end_address) +{ + u64 end_address = address + size; + + if ((address >= range_start_address) && + (end_address <= range_end_address) && + (end_address > address)) + return true; + + return false; +} + +/** + * hl_mem_area_crosses_range() - Checks whether address+size crossing a range. + * @address: The start address of the area we want to validate. + * @size: The size in bytes of the area we want to validate. 
+ * @range_start_address: The start address of the valid range. + * @range_end_address: The end address of the valid range. + * + * Return: true if the area overlaps part or all of the valid range, + * false otherwise. + */ +static inline bool hl_mem_area_crosses_range(u64 address, u32 size, + u64 range_start_address, u64 range_end_address) +{ + u64 end_address = address + size - 1; + + return ((address <= range_end_address) && (range_start_address <= end_address)); +} + +uint64_t hl_set_dram_bar_default(struct hl_device *hdev, u64 addr); +void *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, + gfp_t flag, const char *caller); +void hl_asic_dma_free_coherent_caller(struct hl_device *hdev, size_t size, void *cpu_addr, + dma_addr_t dma_handle, const char *caller); +void *hl_cpu_accessible_dma_pool_alloc_caller(struct hl_device *hdev, size_t size, + dma_addr_t *dma_handle, const char *caller); +void hl_cpu_accessible_dma_pool_free_caller(struct hl_device *hdev, size_t size, void *vaddr, + const char *caller); +void *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags, + dma_addr_t *dma_handle, const char *caller); +void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_t dma_addr, + const char *caller); +int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir); +void hl_dma_unmap_sgtable(struct hl_device *hdev, struct sg_table *sgt, + enum dma_data_direction dir); +int hl_access_cfg_region(struct hl_device *hdev, u64 addr, u64 *val, + enum debugfs_access_type acc_type); +int hl_access_dev_mem(struct hl_device *hdev, enum pci_region region_type, + u64 addr, u64 *val, enum debugfs_access_type acc_type); +int hl_device_open(struct inode *inode, struct file *filp); +int hl_device_open_ctrl(struct inode *inode, struct file *filp); +bool hl_device_operational(struct hl_device *hdev, + enum hl_device_status *status); +enum hl_device_status hl_device_status(struct hl_device *hdev); +int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable); +int hl_hw_queues_create(struct hl_device *hdev); +void hl_hw_queues_destroy(struct hl_device *hdev); +int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, + u32 cb_size, u64 cb_ptr); +void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, + u32 ctl, u32 len, u64 ptr); +int hl_hw_queue_schedule_cs(struct hl_cs *cs); +u32 hl_hw_queue_add_ptr(u32 ptr, u16 val); +void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id); +void hl_hw_queue_update_ci(struct hl_cs *cs); +void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset); + +#define hl_queue_inc_ptr(p) hl_hw_queue_add_ptr(p, 1) +#define hl_pi_2_offset(pi) ((pi) & (HL_QUEUE_LENGTH - 1)) + +int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id); +void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q); +int hl_eq_init(struct hl_device *hdev, struct hl_eq *q); +void hl_eq_fini(struct hl_device *hdev, struct hl_eq *q); +void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q); +void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q); +irqreturn_t hl_irq_handler_cq(int irq, void *arg); +irqreturn_t hl_irq_handler_eq(int irq, void *arg); +irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg); +irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg); +irqreturn_t hl_irq_handler_default(int irq, void *arg); +u32 hl_cq_inc_ptr(u32 ptr); + +int hl_asid_init(struct 
hl_device *hdev); +void hl_asid_fini(struct hl_device *hdev); +unsigned long hl_asid_alloc(struct hl_device *hdev); +void hl_asid_free(struct hl_device *hdev, unsigned long asid); + +int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv); +void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx); +int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx); +void hl_ctx_do_release(struct kref *ref); +void hl_ctx_get(struct hl_ctx *ctx); +int hl_ctx_put(struct hl_ctx *ctx); +struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev); +struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq); +int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr, + struct hl_fence **fence, u32 arr_len); +void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr); +void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr); + +int hl_device_init(struct hl_device *hdev, struct class *hclass); +void hl_device_fini(struct hl_device *hdev); +int hl_device_suspend(struct hl_device *hdev); +int hl_device_resume(struct hl_device *hdev); +int hl_device_reset(struct hl_device *hdev, u32 flags); +void hl_hpriv_get(struct hl_fpriv *hpriv); +int hl_hpriv_put(struct hl_fpriv *hpriv); +int hl_device_utilization(struct hl_device *hdev, u32 *utilization); + +int hl_build_hwmon_channel_info(struct hl_device *hdev, + struct cpucp_sensor *sensors_arr); + +void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask); + +int hl_sysfs_init(struct hl_device *hdev); +void hl_sysfs_fini(struct hl_device *hdev); + +int hl_hwmon_init(struct hl_device *hdev); +void hl_hwmon_fini(struct hl_device *hdev); +void hl_hwmon_release_resources(struct hl_device *hdev); + +int hl_cb_create(struct hl_device *hdev, struct hl_mem_mgr *mmg, + struct hl_ctx *ctx, u32 cb_size, bool internal_cb, + bool map_cb, u64 *handle); +int hl_cb_destroy(struct hl_mem_mgr *mmg, u64 cb_handle); +int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma); +struct hl_cb *hl_cb_get(struct hl_mem_mgr *mmg, u64 handle); +void hl_cb_put(struct hl_cb *cb); +struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size, + bool internal_cb); +int hl_cb_pool_init(struct hl_device *hdev); +int hl_cb_pool_fini(struct hl_device *hdev); +int hl_cb_va_pool_init(struct hl_ctx *ctx); +void hl_cb_va_pool_fini(struct hl_ctx *ctx); + +void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush); +struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, + enum hl_queue_type queue_type, bool is_kernel_allocated_cb); +void hl_sob_reset_error(struct kref *ref); +int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask); +void hl_fence_put(struct hl_fence *fence); +void hl_fences_put(struct hl_fence **fence, int len); +void hl_fence_get(struct hl_fence *fence); +void cs_get(struct hl_cs *cs); +bool cs_needs_completion(struct hl_cs *cs); +bool cs_needs_timeout(struct hl_cs *cs); +bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs); +struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq); +void hl_multi_cs_completion_init(struct hl_device *hdev); + +void goya_set_asic_funcs(struct hl_device *hdev); +void gaudi_set_asic_funcs(struct hl_device *hdev); +void gaudi2_set_asic_funcs(struct hl_device *hdev); + +int hl_vm_ctx_init(struct hl_ctx *ctx); +void hl_vm_ctx_fini(struct hl_ctx *ctx); + +int hl_vm_init(struct hl_device *hdev); +void hl_vm_fini(struct hl_device *hdev); + +void hl_hw_block_mem_init(struct hl_ctx *ctx); +void hl_hw_block_mem_fini(struct hl_ctx *ctx); + +u64 
hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx, + enum hl_va_range_type type, u64 size, u32 alignment); +int hl_unreserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx, + u64 start_addr, u64 size); +int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size, + struct hl_userptr *userptr); +void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr); +void hl_userptr_delete_list(struct hl_device *hdev, + struct list_head *userptr_list); +bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, u32 size, + struct list_head *userptr_list, + struct hl_userptr **userptr); + +int hl_mmu_init(struct hl_device *hdev); +void hl_mmu_fini(struct hl_device *hdev); +int hl_mmu_ctx_init(struct hl_ctx *ctx); +void hl_mmu_ctx_fini(struct hl_ctx *ctx); +int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, + u32 page_size, bool flush_pte); +int hl_mmu_get_real_page_size(struct hl_device *hdev, struct hl_mmu_properties *mmu_prop, + u32 page_size, u32 *real_page_size, bool is_dram_addr); +int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, + bool flush_pte); +int hl_mmu_map_contiguous(struct hl_ctx *ctx, u64 virt_addr, + u64 phys_addr, u32 size); +int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size); +int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags); +int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard, + u32 flags, u32 asid, u64 va, u64 size); +int hl_mmu_prefetch_cache_range(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size); +u64 hl_mmu_get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte); +u64 hl_mmu_get_hop_pte_phys_addr(struct hl_ctx *ctx, struct hl_mmu_properties *mmu_prop, + u8 hop_idx, u64 hop_addr, u64 virt_addr); +void hl_mmu_hr_flush(struct hl_ctx *ctx); +int hl_mmu_hr_init(struct hl_device *hdev, struct hl_mmu_hr_priv *hr_priv, u32 hop_table_size, + u64 pgt_size); +void hl_mmu_hr_fini(struct hl_device *hdev, struct hl_mmu_hr_priv *hr_priv, u32 hop_table_size); +void hl_mmu_hr_free_hop_remove_pgt(struct pgt_info *pgt_info, struct hl_mmu_hr_priv *hr_priv, + u32 hop_table_size); +u64 hl_mmu_hr_pte_phys_to_virt(struct hl_ctx *ctx, struct pgt_info *pgt, u64 phys_pte_addr, + u32 hop_table_size); +void hl_mmu_hr_write_pte(struct hl_ctx *ctx, struct pgt_info *pgt_info, u64 phys_pte_addr, + u64 val, u32 hop_table_size); +void hl_mmu_hr_clear_pte(struct hl_ctx *ctx, struct pgt_info *pgt_info, u64 phys_pte_addr, + u32 hop_table_size); +int hl_mmu_hr_put_pte(struct hl_ctx *ctx, struct pgt_info *pgt_info, struct hl_mmu_hr_priv *hr_priv, + u32 hop_table_size); +void hl_mmu_hr_get_pte(struct hl_ctx *ctx, struct hl_hr_mmu_funcs *hr_func, u64 phys_hop_addr); +struct pgt_info *hl_mmu_hr_get_next_hop_pgt_info(struct hl_ctx *ctx, + struct hl_hr_mmu_funcs *hr_func, + u64 curr_pte); +struct pgt_info *hl_mmu_hr_alloc_hop(struct hl_ctx *ctx, struct hl_mmu_hr_priv *hr_priv, + struct hl_hr_mmu_funcs *hr_func, + struct hl_mmu_properties *mmu_prop); +struct pgt_info *hl_mmu_hr_get_alloc_next_hop(struct hl_ctx *ctx, + struct hl_mmu_hr_priv *hr_priv, + struct hl_hr_mmu_funcs *hr_func, + struct hl_mmu_properties *mmu_prop, + u64 curr_pte, bool *is_new_hop); +int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops, + struct hl_hr_mmu_funcs *hr_func); +void hl_mmu_swap_out(struct hl_ctx *ctx); +void hl_mmu_swap_in(struct hl_ctx *ctx); +int hl_mmu_if_set_funcs(struct hl_device *hdev); +void hl_mmu_v1_set_funcs(struct hl_device *hdev, 
struct hl_mmu_funcs *mmu); +void hl_mmu_v2_hr_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); +int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr); +int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, + struct hl_mmu_hop_info *hops); +u64 hl_mmu_scramble_addr(struct hl_device *hdev, u64 addr); +u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr); +bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr); + +int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name, + void __iomem *dst, u32 src_offset, u32 size); +int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode, u64 value); +int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, + u16 len, u32 timeout, u64 *result); +int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type); +int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr, + size_t irq_arr_size); +int hl_fw_test_cpu_queue(struct hl_device *hdev); +void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, + dma_addr_t *dma_handle); +void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, + void *vaddr); +int hl_fw_send_heartbeat(struct hl_device *hdev); +int hl_fw_cpucp_info_get(struct hl_device *hdev, + u32 sts_boot_dev_sts0_reg, + u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg, + u32 boot_err1_reg); +int hl_fw_cpucp_handshake(struct hl_device *hdev, + u32 sts_boot_dev_sts0_reg, + u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg, + u32 boot_err1_reg); +int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size); +int hl_fw_get_monitor_dump(struct hl_device *hdev, void *data); +int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, + struct hl_info_pci_counters *counters); +int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, + u64 *total_energy); +int get_used_pll_index(struct hl_device *hdev, u32 input_pll_index, + enum pll_index *pll_index); +int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index, + u16 *pll_freq_arr); +int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power); +void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev); +void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev); +int hl_fw_init_cpu(struct hl_device *hdev); +int hl_fw_read_preboot_status(struct hl_device *hdev); +int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, + struct fw_load_mgr *fw_loader, + enum comms_cmd cmd, unsigned int size, + bool wait_ok, u32 timeout); +int hl_fw_dram_replaced_row_get(struct hl_device *hdev, + struct cpucp_hbm_row_info *info); +int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num); +int hl_fw_cpucp_engine_core_asid_set(struct hl_device *hdev, u32 asid); +int hl_fw_send_device_activity(struct hl_device *hdev, bool open); +int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3], + bool is_wc[3]); +int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data); +int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data); +int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region, + struct hl_inbound_pci_region *pci_region); +int hl_pci_set_outbound_region(struct hl_device *hdev, + struct hl_outbound_pci_region *pci_region); +enum pci_region hl_get_pci_memory_region(struct hl_device *hdev, u64 addr); +int hl_pci_init(struct hl_device *hdev); +void hl_pci_fini(struct hl_device *hdev); + +long hl_fw_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr); +void hl_fw_set_frequency(struct hl_device 
*hdev, u32 pll_index, u64 freq); +int hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr, long *value); +int hl_set_temperature(struct hl_device *hdev, int sensor_index, u32 attr, long value); +int hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr, long *value); +int hl_get_current(struct hl_device *hdev, int sensor_index, u32 attr, long *value); +int hl_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr, long *value); +int hl_get_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr, long *value); +void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr, long value); +long hl_fw_get_max_power(struct hl_device *hdev); +void hl_fw_set_max_power(struct hl_device *hdev); +int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_info *sec_attest_info, + u32 nonce); +int hl_set_voltage(struct hl_device *hdev, int sensor_index, u32 attr, long value); +int hl_set_current(struct hl_device *hdev, int sensor_index, u32 attr, long value); +int hl_set_power(struct hl_device *hdev, int sensor_index, u32 attr, long value); +int hl_get_power(struct hl_device *hdev, int sensor_index, u32 attr, long *value); +int hl_fw_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk); +void hl_fw_set_pll_profile(struct hl_device *hdev); +void hl_sysfs_add_dev_clk_attr(struct hl_device *hdev, struct attribute_group *dev_clk_attr_grp); +void hl_sysfs_add_dev_vrm_attr(struct hl_device *hdev, struct attribute_group *dev_vrm_attr_grp); + +void hw_sob_get(struct hl_hw_sob *hw_sob); +void hw_sob_put(struct hl_hw_sob *hw_sob); +void hl_encaps_handle_do_release(struct kref *ref); +void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev, + struct hl_cs *cs, struct hl_cs_job *job, + struct hl_cs_compl *cs_cmpl); + +int hl_dec_init(struct hl_device *hdev); +void hl_dec_fini(struct hl_device *hdev); +void hl_dec_ctx_fini(struct hl_ctx *ctx); + +void hl_release_pending_user_interrupts(struct hl_device *hdev); +int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx, + struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig); + +int hl_state_dump(struct hl_device *hdev); +const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id); +const char *hl_state_dump_get_monitor_name(struct hl_device *hdev, + struct hl_mon_state_dump *mon); +void hl_state_dump_free_sync_to_engine_map(struct hl_sync_to_engine_map *map); +__printf(4, 5) int hl_snprintf_resize(char **buf, size_t *size, size_t *offset, + const char *format, ...); +char *hl_format_as_binary(char *buf, size_t buf_len, u32 n); +const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type); + +void hl_mem_mgr_init(struct device *dev, struct hl_mem_mgr *mmg); +void hl_mem_mgr_fini(struct hl_mem_mgr *mmg); +int hl_mem_mgr_mmap(struct hl_mem_mgr *mmg, struct vm_area_struct *vma, + void *args); +struct hl_mmap_mem_buf *hl_mmap_mem_buf_get(struct hl_mem_mgr *mmg, + u64 handle); +int hl_mmap_mem_buf_put_handle(struct hl_mem_mgr *mmg, u64 handle); +int hl_mmap_mem_buf_put(struct hl_mmap_mem_buf *buf); +struct hl_mmap_mem_buf * +hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg, + struct hl_mmap_mem_buf_behavior *behavior, gfp_t gfp, + void *args); +__printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...); + +#ifdef CONFIG_DEBUG_FS + +void hl_debugfs_init(void); +void hl_debugfs_fini(void); +void hl_debugfs_add_device(struct hl_device *hdev); +void hl_debugfs_remove_device(struct hl_device *hdev); +void 
hl_debugfs_add_file(struct hl_fpriv *hpriv); +void hl_debugfs_remove_file(struct hl_fpriv *hpriv); +void hl_debugfs_add_cb(struct hl_cb *cb); +void hl_debugfs_remove_cb(struct hl_cb *cb); +void hl_debugfs_add_cs(struct hl_cs *cs); +void hl_debugfs_remove_cs(struct hl_cs *cs); +void hl_debugfs_add_job(struct hl_device *hdev, struct hl_cs_job *job); +void hl_debugfs_remove_job(struct hl_device *hdev, struct hl_cs_job *job); +void hl_debugfs_add_userptr(struct hl_device *hdev, struct hl_userptr *userptr); +void hl_debugfs_remove_userptr(struct hl_device *hdev, + struct hl_userptr *userptr); +void hl_debugfs_add_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx); +void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx); +void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data, + unsigned long length); + +#else + +static inline void __init hl_debugfs_init(void) +{ +} + +static inline void hl_debugfs_fini(void) +{ +} + +static inline void hl_debugfs_add_device(struct hl_device *hdev) +{ +} + +static inline void hl_debugfs_remove_device(struct hl_device *hdev) +{ +} + +static inline void hl_debugfs_add_file(struct hl_fpriv *hpriv) +{ +} + +static inline void hl_debugfs_remove_file(struct hl_fpriv *hpriv) +{ +} + +static inline void hl_debugfs_add_cb(struct hl_cb *cb) +{ +} + +static inline void hl_debugfs_remove_cb(struct hl_cb *cb) +{ +} + +static inline void hl_debugfs_add_cs(struct hl_cs *cs) +{ +} + +static inline void hl_debugfs_remove_cs(struct hl_cs *cs) +{ +} + +static inline void hl_debugfs_add_job(struct hl_device *hdev, + struct hl_cs_job *job) +{ +} + +static inline void hl_debugfs_remove_job(struct hl_device *hdev, + struct hl_cs_job *job) +{ +} + +static inline void hl_debugfs_add_userptr(struct hl_device *hdev, + struct hl_userptr *userptr) +{ +} + +static inline void hl_debugfs_remove_userptr(struct hl_device *hdev, + struct hl_userptr *userptr) +{ +} + +static inline void hl_debugfs_add_ctx_mem_hash(struct hl_device *hdev, + struct hl_ctx *ctx) +{ +} + +static inline void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, + struct hl_ctx *ctx) +{ +} + +static inline void hl_debugfs_set_state_dump(struct hl_device *hdev, + char *data, unsigned long length) +{ +} + +#endif + +/* Security */ +int hl_unsecure_register(struct hl_device *hdev, u32 mm_reg_addr, int offset, + const u32 pb_blocks[], struct hl_block_glbl_sec sgs_array[], + int array_size); +int hl_unsecure_registers(struct hl_device *hdev, const u32 mm_reg_array[], + int mm_array_size, int offset, const u32 pb_blocks[], + struct hl_block_glbl_sec sgs_array[], int blocks_array_size); +void hl_config_glbl_sec(struct hl_device *hdev, const u32 pb_blocks[], + struct hl_block_glbl_sec sgs_array[], u32 block_offset, + int array_size); +void hl_secure_block(struct hl_device *hdev, + struct hl_block_glbl_sec sgs_array[], int array_size); +int hl_init_pb_with_mask(struct hl_device *hdev, u32 num_dcores, + u32 dcore_offset, u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const u32 *regs_array, u32 regs_array_size, u64 mask); +int hl_init_pb(struct hl_device *hdev, u32 num_dcores, u32 dcore_offset, + u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const u32 *regs_array, u32 regs_array_size); +int hl_init_pb_ranges_with_mask(struct hl_device *hdev, u32 num_dcores, + u32 dcore_offset, u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const struct range *regs_range_array, 
u32 regs_range_array_size, + u64 mask); +int hl_init_pb_ranges(struct hl_device *hdev, u32 num_dcores, + u32 dcore_offset, u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const struct range *regs_range_array, + u32 regs_range_array_size); +int hl_init_pb_single_dcore(struct hl_device *hdev, u32 dcore_offset, + u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const u32 *regs_array, u32 regs_array_size); +int hl_init_pb_ranges_single_dcore(struct hl_device *hdev, u32 dcore_offset, + u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const struct range *regs_range_array, + u32 regs_range_array_size); +void hl_ack_pb(struct hl_device *hdev, u32 num_dcores, u32 dcore_offset, + u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size); +void hl_ack_pb_with_mask(struct hl_device *hdev, u32 num_dcores, + u32 dcore_offset, u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, u64 mask); +void hl_ack_pb_single_dcore(struct hl_device *hdev, u32 dcore_offset, + u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size); + +/* IOCTLs */ +long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); +long hl_ioctl_control(struct file *filep, unsigned int cmd, unsigned long arg); +int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data); +int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data); +int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data); +int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data); + +#endif /* HABANALABSP_H_ */ diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c new file mode 100644 index 000000000..ae3cab3f4 --- /dev/null +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -0,0 +1,735 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2021 HabanaLabs, Ltd. + * All Rights Reserved. + * + */ + +#define pr_fmt(fmt) "habanalabs: " fmt + +#include "habanalabs.h" + +#include <linux/pci.h> +#include <linux/aer.h> +#include <linux/module.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/habanalabs.h> + +#define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team" + +#define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators" + +MODULE_AUTHOR(HL_DRIVER_AUTHOR); +MODULE_DESCRIPTION(HL_DRIVER_DESC); +MODULE_LICENSE("GPL v2"); + +static int hl_major; +static struct class *hl_class; +static DEFINE_IDR(hl_devs_idr); +static DEFINE_MUTEX(hl_devs_idr_lock); + +#define HL_DEFAULT_TIMEOUT_LOCKED 30 /* 30 seconds */ +#define GAUDI_DEFAULT_TIMEOUT_LOCKED 600 /* 10 minutes */ + +static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED; +static int reset_on_lockup = 1; +static int memory_scrub; +static ulong boot_error_status_mask = ULONG_MAX; + +module_param(timeout_locked, int, 0444); +MODULE_PARM_DESC(timeout_locked, + "Device lockup timeout in seconds (0 = disabled, default 30s)"); + +module_param(reset_on_lockup, int, 0444); +MODULE_PARM_DESC(reset_on_lockup, + "Do device reset on lockup (0 = no, 1 = yes, default yes)"); + +module_param(memory_scrub, int, 0444); +MODULE_PARM_DESC(memory_scrub, + "Scrub device memory in various states (0 = no, 1 = yes, default no)"); + +module_param(boot_error_status_mask, ulong, 0444); +MODULE_PARM_DESC(boot_error_status_mask, + "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. 
Default all 1's)"); + +#define PCI_IDS_GOYA 0x0001 +#define PCI_IDS_GAUDI 0x1000 +#define PCI_IDS_GAUDI_SEC 0x1010 + +#define PCI_IDS_GAUDI2 0x1020 + +static const struct pci_device_id ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), }, + { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), }, + { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), }, + { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, ids); + +/* + * get_asic_type - translate device id to asic type + * + * @device: id of the PCI device + * + * Translate device id to asic type. + * In case of unidentified device, return -1 + */ +static enum hl_asic_type get_asic_type(u16 device) +{ + enum hl_asic_type asic_type; + + switch (device) { + case PCI_IDS_GOYA: + asic_type = ASIC_GOYA; + break; + case PCI_IDS_GAUDI: + asic_type = ASIC_GAUDI; + break; + case PCI_IDS_GAUDI_SEC: + asic_type = ASIC_GAUDI_SEC; + break; + case PCI_IDS_GAUDI2: + asic_type = ASIC_GAUDI2; + break; + default: + asic_type = ASIC_INVALID; + break; + } + + return asic_type; +} + +static bool is_asic_secured(enum hl_asic_type asic_type) +{ + switch (asic_type) { + case ASIC_GAUDI_SEC: + return true; + default: + return false; + } +} + +/* + * hl_device_open - open function for habanalabs device + * + * @inode: pointer to inode structure + * @filp: pointer to file structure + * + * Called when process opens an habanalabs device. + */ +int hl_device_open(struct inode *inode, struct file *filp) +{ + enum hl_device_status status; + struct hl_device *hdev; + struct hl_fpriv *hpriv; + int rc; + + mutex_lock(&hl_devs_idr_lock); + hdev = idr_find(&hl_devs_idr, iminor(inode)); + mutex_unlock(&hl_devs_idr_lock); + + if (!hdev) { + pr_err("Couldn't find device %d:%d\n", + imajor(inode), iminor(inode)); + return -ENXIO; + } + + hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL); + if (!hpriv) + return -ENOMEM; + + hpriv->hdev = hdev; + filp->private_data = hpriv; + hpriv->filp = filp; + + mutex_init(&hpriv->notifier_event.lock); + mutex_init(&hpriv->restore_phase_mutex); + mutex_init(&hpriv->ctx_lock); + kref_init(&hpriv->refcount); + nonseekable_open(inode, filp); + + hl_ctx_mgr_init(&hpriv->ctx_mgr); + hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr); + + hpriv->taskpid = get_task_pid(current, PIDTYPE_PID); + + mutex_lock(&hdev->fpriv_list_lock); + + if (!hl_device_operational(hdev, &status)) { + dev_dbg_ratelimited(hdev->dev, + "Can't open %s because it is %s\n", + dev_name(hdev->dev), hdev->status[status]); + + if (status == HL_DEVICE_STATUS_IN_RESET || + status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE) + rc = -EAGAIN; + else + rc = -EPERM; + + goto out_err; + } + + if (hdev->is_in_dram_scrub) { + dev_dbg_ratelimited(hdev->dev, + "Can't open %s during dram scrub\n", + dev_name(hdev->dev)); + rc = -EAGAIN; + goto out_err; + } + + if (hdev->compute_ctx_in_release) { + dev_dbg_ratelimited(hdev->dev, + "Can't open %s because another user is still releasing it\n", + dev_name(hdev->dev)); + rc = -EAGAIN; + goto out_err; + } + + if (hdev->is_compute_ctx_active) { + dev_dbg_ratelimited(hdev->dev, + "Can't open %s because another user is working on it\n", + dev_name(hdev->dev)); + rc = -EBUSY; + goto out_err; + } + + rc = hl_ctx_create(hdev, hpriv); + if (rc) { + dev_err(hdev->dev, "Failed to create context %d\n", rc); + goto out_err; + } + + list_add(&hpriv->dev_node, &hdev->fpriv_list); + mutex_unlock(&hdev->fpriv_list_lock); + + hdev->asic_funcs->send_device_activity(hdev, true); + + 
hl_debugfs_add_file(hpriv); + + atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); + atomic_set(&hdev->captured_err_info.razwi.write_enable, 1); + hdev->captured_err_info.undef_opcode.write_enable = true; + + hdev->open_counter++; + hdev->last_successful_open_jif = jiffies; + hdev->last_successful_open_ktime = ktime_get(); + + return 0; + +out_err: + mutex_unlock(&hdev->fpriv_list_lock); + hl_mem_mgr_fini(&hpriv->mem_mgr); + hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); + filp->private_data = NULL; + mutex_destroy(&hpriv->ctx_lock); + mutex_destroy(&hpriv->restore_phase_mutex); + mutex_destroy(&hpriv->notifier_event.lock); + put_pid(hpriv->taskpid); + + kfree(hpriv); + + return rc; +} + +int hl_device_open_ctrl(struct inode *inode, struct file *filp) +{ + struct hl_device *hdev; + struct hl_fpriv *hpriv; + int rc; + + mutex_lock(&hl_devs_idr_lock); + hdev = idr_find(&hl_devs_idr, iminor(inode)); + mutex_unlock(&hl_devs_idr_lock); + + if (!hdev) { + pr_err("Couldn't find device %d:%d\n", + imajor(inode), iminor(inode)); + return -ENXIO; + } + + hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL); + if (!hpriv) + return -ENOMEM; + + /* Prevent other routines from reading partial hpriv data by + * initializing hpriv fields before inserting it to the list + */ + hpriv->hdev = hdev; + filp->private_data = hpriv; + hpriv->filp = filp; + + mutex_init(&hpriv->notifier_event.lock); + nonseekable_open(inode, filp); + + hpriv->taskpid = get_task_pid(current, PIDTYPE_PID); + + mutex_lock(&hdev->fpriv_ctrl_list_lock); + + if (!hl_device_operational(hdev, NULL)) { + dev_dbg_ratelimited(hdev->dev_ctrl, + "Can't open %s because it is disabled or in reset\n", + dev_name(hdev->dev_ctrl)); + rc = -EPERM; + goto out_err; + } + + list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list); + mutex_unlock(&hdev->fpriv_ctrl_list_lock); + + return 0; + +out_err: + mutex_unlock(&hdev->fpriv_ctrl_list_lock); + filp->private_data = NULL; + put_pid(hpriv->taskpid); + + kfree(hpriv); + + return rc; +} + +static void set_driver_behavior_per_device(struct hl_device *hdev) +{ + hdev->nic_ports_mask = 0; + hdev->fw_components = FW_TYPE_ALL_TYPES; + hdev->mmu_enable = MMU_EN_ALL; + hdev->cpu_queues_enable = 1; + hdev->pldm = 0; + hdev->hard_reset_on_fw_events = 1; + hdev->bmc_enable = 1; + hdev->reset_on_preboot_fail = 1; + hdev->heartbeat = 1; +} + +static void copy_kernel_module_params_to_device(struct hl_device *hdev) +{ + hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type); + + hdev->major = hl_major; + hdev->memory_scrub = memory_scrub; + hdev->reset_on_lockup = reset_on_lockup; + hdev->boot_error_status_mask = boot_error_status_mask; +} + +static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout) +{ + switch (hdev->asic_type) { + case ASIC_GAUDI: + case ASIC_GAUDI_SEC: + /* If user didn't request a different timeout than the default one, we have + * a different default timeout for Gaudi + */ + if (timeout == HL_DEFAULT_TIMEOUT_LOCKED) + hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED * + MSEC_PER_SEC); + + hdev->reset_upon_device_release = 0; + break; + + case ASIC_GOYA: + hdev->reset_upon_device_release = 0; + break; + + default: + hdev->reset_upon_device_release = 1; + break; + } +} + +static int fixup_device_params(struct hl_device *hdev) +{ + int tmp_timeout; + + tmp_timeout = timeout_locked; + + hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC; + hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC; + + if (tmp_timeout) + 
hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC); + else + hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT; + + hdev->stop_on_err = true; + hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; + + /* Enable only after the initialization of the device */ + hdev->disabled = true; + + if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) && + (hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) { + pr_err("Preboot must be set along with other components"); + return -EINVAL; + } + + /* If CPU queues not enabled, no way to do heartbeat */ + if (!hdev->cpu_queues_enable) + hdev->heartbeat = 0; + + fixup_device_params_per_asic(hdev, tmp_timeout); + + return 0; +} + +/** + * create_hdev - create habanalabs device instance + * + * @dev: will hold the pointer to the new habanalabs device structure + * @pdev: pointer to the pci device + * + * Allocate memory for habanalabs device and initialize basic fields + * Identify the ASIC type + * Allocate ID (minor) for the device (only for real devices) + */ +static int create_hdev(struct hl_device **dev, struct pci_dev *pdev) +{ + int main_id, ctrl_id = 0, rc = 0; + struct hl_device *hdev; + + *dev = NULL; + + hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); + if (!hdev) + return -ENOMEM; + + /* Will be NULL in case of simulator device */ + hdev->pdev = pdev; + + /* Assign status description string */ + strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION], + "in device creation", HL_STR_MAX); + strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE], + "in reset after device release", HL_STR_MAX); + + + /* First, we must find out which ASIC are we handling. This is needed + * to configure the behavior of the driver (kernel parameters) + */ + hdev->asic_type = get_asic_type(pdev->device); + if (hdev->asic_type == ASIC_INVALID) { + dev_err(&pdev->dev, "Unsupported ASIC\n"); + rc = -ENODEV; + goto free_hdev; + } + + copy_kernel_module_params_to_device(hdev); + + set_driver_behavior_per_device(hdev); + + fixup_device_params(hdev); + + mutex_lock(&hl_devs_idr_lock); + + /* Always save 2 numbers, 1 for main device and 1 for control. 
+ * They must be consecutive + */ + main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL); + + if (main_id >= 0) + ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1, + main_id + 2, GFP_KERNEL); + + mutex_unlock(&hl_devs_idr_lock); + + if ((main_id < 0) || (ctrl_id < 0)) { + if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC)) + pr_err("too many devices in the system\n"); + + if (main_id >= 0) { + mutex_lock(&hl_devs_idr_lock); + idr_remove(&hl_devs_idr, main_id); + mutex_unlock(&hl_devs_idr_lock); + } + + rc = -EBUSY; + goto free_hdev; + } + + hdev->id = main_id; + hdev->id_control = ctrl_id; + + *dev = hdev; + + return 0; + +free_hdev: + kfree(hdev); + return rc; +} + +/* + * destroy_hdev - destroy habanalabs device instance + * + * @dev: pointer to the habanalabs device structure + * + */ +static void destroy_hdev(struct hl_device *hdev) +{ + /* Remove device from the device list */ + mutex_lock(&hl_devs_idr_lock); + idr_remove(&hl_devs_idr, hdev->id); + idr_remove(&hl_devs_idr, hdev->id_control); + mutex_unlock(&hl_devs_idr_lock); + + kfree(hdev); +} + +static int hl_pmops_suspend(struct device *dev) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + pr_debug("Going to suspend PCI device\n"); + + if (!hdev) { + pr_err("device pointer is NULL in suspend\n"); + return 0; + } + + return hl_device_suspend(hdev); +} + +static int hl_pmops_resume(struct device *dev) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + pr_debug("Going to resume PCI device\n"); + + if (!hdev) { + pr_err("device pointer is NULL in resume\n"); + return 0; + } + + return hl_device_resume(hdev); +} + +/** + * hl_pci_probe - probe PCI habanalabs devices + * + * @pdev: pointer to pci device + * @id: pointer to pci device id structure + * + * Standard PCI probe function for habanalabs device. 
+ * Create a new habanalabs device and initialize it according to the + * device's type + */ +static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct hl_device *hdev; + int rc; + + dev_info(&pdev->dev, HL_NAME + " device found [%04x:%04x] (rev %x)\n", + (int)pdev->vendor, (int)pdev->device, (int)pdev->revision); + + rc = create_hdev(&hdev, pdev); + if (rc) + return rc; + + pci_set_drvdata(pdev, hdev); + + pci_enable_pcie_error_reporting(pdev); + + rc = hl_device_init(hdev, hl_class); + if (rc) { + dev_err(&pdev->dev, "Fatal error during habanalabs device init\n"); + rc = -ENODEV; + goto disable_device; + } + + return 0; + +disable_device: + pci_disable_pcie_error_reporting(pdev); + pci_set_drvdata(pdev, NULL); + destroy_hdev(hdev); + + return rc; +} + +/* + * hl_pci_remove - remove PCI habanalabs devices + * + * @pdev: pointer to pci device + * + * Standard PCI remove function for habanalabs device + */ +static void hl_pci_remove(struct pci_dev *pdev) +{ + struct hl_device *hdev; + + hdev = pci_get_drvdata(pdev); + if (!hdev) + return; + + hl_device_fini(hdev); + pci_disable_pcie_error_reporting(pdev); + pci_set_drvdata(pdev, NULL); + destroy_hdev(hdev); +} + +/** + * hl_pci_err_detected - a PCI bus error detected on this device + * + * @pdev: pointer to pci device + * @state: PCI error type + * + * Called by the PCI subsystem whenever a non-correctable + * PCI bus error is detected + */ +static pci_ers_result_t +hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct hl_device *hdev = pci_get_drvdata(pdev); + enum pci_ers_result result; + + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + + case pci_channel_io_frozen: + dev_warn(hdev->dev, "frozen state error detected\n"); + result = PCI_ERS_RESULT_NEED_RESET; + break; + + case pci_channel_io_perm_failure: + dev_warn(hdev->dev, "failure state error detected\n"); + result = PCI_ERS_RESULT_DISCONNECT; + break; + + default: + result = PCI_ERS_RESULT_NONE; + } + + hdev->asic_funcs->halt_engines(hdev, true, false); + + return result; +} + +/** + * hl_pci_err_resume - resume after a PCI slot reset + * + * @pdev: pointer to pci device + * + */ +static void hl_pci_err_resume(struct pci_dev *pdev) +{ + struct hl_device *hdev = pci_get_drvdata(pdev); + + dev_warn(hdev->dev, "Resuming device after PCI slot reset\n"); + hl_device_resume(hdev); +} + +/** + * hl_pci_err_slot_reset - a PCI slot reset has just happened + * + * @pdev: pointer to pci device + * + * Determine if the driver can recover from the PCI slot reset + */ +static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev) +{ + return PCI_ERS_RESULT_RECOVERED; +} + +static const struct dev_pm_ops hl_pm_ops = { + .suspend = hl_pmops_suspend, + .resume = hl_pmops_resume, +}; + +static const struct pci_error_handlers hl_pci_err_handler = { + .error_detected = hl_pci_err_detected, + .slot_reset = hl_pci_err_slot_reset, + .resume = hl_pci_err_resume, +}; + +static struct pci_driver hl_pci_driver = { + .name = HL_NAME, + .id_table = ids, + .probe = hl_pci_probe, + .remove = hl_pci_remove, + .shutdown = hl_pci_remove, + .driver = { + .name = HL_NAME, + .pm = &hl_pm_ops, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + }, + .err_handler = &hl_pci_err_handler, +}; + +/* + * hl_init - Initialize the habanalabs kernel driver + */ +static int __init hl_init(void) +{ + int rc; + dev_t dev; + + pr_info("loading driver\n"); + + rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME); + if (rc < 0) { + 
pr_err("unable to get major\n"); + return rc; + } + + hl_major = MAJOR(dev); + + hl_class = class_create(THIS_MODULE, HL_NAME); + if (IS_ERR(hl_class)) { + pr_err("failed to allocate class\n"); + rc = PTR_ERR(hl_class); + goto remove_major; + } + + hl_debugfs_init(); + + rc = pci_register_driver(&hl_pci_driver); + if (rc) { + pr_err("failed to register pci device\n"); + goto remove_debugfs; + } + + pr_debug("driver loaded\n"); + + return 0; + +remove_debugfs: + hl_debugfs_fini(); + class_destroy(hl_class); +remove_major: + unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS); + return rc; +} + +/* + * hl_exit - Release all resources of the habanalabs kernel driver + */ +static void __exit hl_exit(void) +{ + pci_unregister_driver(&hl_pci_driver); + + /* + * Removing debugfs must be after all devices or simulator devices + * have been removed because otherwise we get a bug in the + * debugfs module for referencing NULL objects + */ + hl_debugfs_fini(); + + class_destroy(hl_class); + unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS); + + idr_destroy(&hl_devs_idr); + + pr_debug("driver removed\n"); +} + +module_init(hl_init); +module_exit(hl_exit); diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c new file mode 100644 index 000000000..1ea1ae34b --- /dev/null +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -0,0 +1,1097 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2022 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#define pr_fmt(fmt) "habanalabs: " fmt + +#include <uapi/misc/habanalabs.h> +#include "habanalabs.h" + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = { + [HL_DEBUG_OP_ETR] = sizeof(struct hl_debug_params_etr), + [HL_DEBUG_OP_ETF] = sizeof(struct hl_debug_params_etf), + [HL_DEBUG_OP_STM] = sizeof(struct hl_debug_params_stm), + [HL_DEBUG_OP_FUNNEL] = 0, + [HL_DEBUG_OP_BMON] = sizeof(struct hl_debug_params_bmon), + [HL_DEBUG_OP_SPMU] = sizeof(struct hl_debug_params_spmu), + [HL_DEBUG_OP_TIMESTAMP] = 0 + +}; + +static int device_status_info(struct hl_device *hdev, struct hl_info_args *args) +{ + struct hl_info_device_status dev_stat = {0}; + u32 size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!size) || (!out)) + return -EINVAL; + + dev_stat.status = hl_device_status(hdev); + + return copy_to_user(out, &dev_stat, + min((size_t)size, sizeof(dev_stat))) ? -EFAULT : 0; +} + +static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) +{ + struct hl_info_hw_ip_info hw_ip = {0}; + u32 size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 sram_kmd_size, dram_kmd_size, dram_available_size; + + if ((!size) || (!out)) + return -EINVAL; + + sram_kmd_size = (prop->sram_user_base_address - + prop->sram_base_address); + dram_kmd_size = (prop->dram_user_base_address - + prop->dram_base_address); + + hw_ip.device_id = hdev->asic_funcs->get_pci_id(hdev); + hw_ip.sram_base_address = prop->sram_user_base_address; + hw_ip.dram_base_address = + hdev->mmu_enable && prop->dram_supports_virtual_memory ? 
+ prop->dmmu.start_addr : prop->dram_user_base_address; + hw_ip.tpc_enabled_mask = prop->tpc_enabled_mask & 0xFF; + hw_ip.tpc_enabled_mask_ext = prop->tpc_enabled_mask; + + hw_ip.sram_size = prop->sram_size - sram_kmd_size; + + dram_available_size = prop->dram_size - dram_kmd_size; + + if (hdev->mmu_enable == MMU_EN_ALL) + hw_ip.dram_size = DIV_ROUND_DOWN_ULL(dram_available_size, + prop->dram_page_size) * prop->dram_page_size; + else + hw_ip.dram_size = dram_available_size; + + if (hw_ip.dram_size > PAGE_SIZE) + hw_ip.dram_enabled = 1; + + hw_ip.dram_page_size = prop->dram_page_size; + hw_ip.device_mem_alloc_default_page_size = prop->device_mem_alloc_default_page_size; + hw_ip.num_of_events = prop->num_of_events; + + memcpy(hw_ip.cpucp_version, prop->cpucp_info.cpucp_version, + min(VERSION_MAX_LEN, HL_INFO_VERSION_MAX_LEN)); + + memcpy(hw_ip.card_name, prop->cpucp_info.card_name, + min(CARD_NAME_MAX_LEN, HL_INFO_CARD_NAME_MAX_LEN)); + + hw_ip.cpld_version = le32_to_cpu(prop->cpucp_info.cpld_version); + hw_ip.module_id = le32_to_cpu(prop->cpucp_info.card_location); + + hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr; + hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf; + hw_ip.psoc_pci_pll_od = prop->psoc_pci_pll_od; + hw_ip.psoc_pci_pll_div_factor = prop->psoc_pci_pll_div_factor; + + hw_ip.decoder_enabled_mask = prop->decoder_enabled_mask; + hw_ip.mme_master_slave_mode = prop->mme_master_slave_mode; + hw_ip.first_available_interrupt_id = prop->first_available_user_interrupt; + hw_ip.number_of_user_interrupts = prop->user_interrupt_count; + + hw_ip.edma_enabled_mask = prop->edma_enabled_mask; + hw_ip.server_type = prop->server_type; + hw_ip.security_enabled = prop->fw_security_enabled; + + return copy_to_user(out, &hw_ip, + min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0; +} + +static int hw_events_info(struct hl_device *hdev, bool aggregate, + struct hl_info_args *args) +{ + u32 size, max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + void *arr; + + if ((!max_size) || (!out)) + return -EINVAL; + + arr = hdev->asic_funcs->get_events_stat(hdev, aggregate, &size); + + return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0; +} + +static int events_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + u32 max_size = args->return_size; + u64 events_mask; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((max_size < sizeof(u64)) || (!out)) + return -EINVAL; + + mutex_lock(&hpriv->notifier_event.lock); + events_mask = hpriv->notifier_event.events_mask; + hpriv->notifier_event.events_mask = 0; + mutex_unlock(&hpriv->notifier_event.lock); + + return copy_to_user(out, &events_mask, sizeof(u64)) ? -EFAULT : 0; +} + +static int dram_usage_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_info_dram_usage dram_usage = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 dram_kmd_size; + + if ((!max_size) || (!out)) + return -EINVAL; + + dram_kmd_size = (prop->dram_user_base_address - + prop->dram_base_address); + dram_usage.dram_free_mem = (prop->dram_size - dram_kmd_size) - + atomic64_read(&hdev->dram_used_mem); + if (hpriv->ctx) + dram_usage.ctx_dram_mem = + atomic64_read(&hpriv->ctx->dram_phys_mem); + + return copy_to_user(out, &dram_usage, + min((size_t) max_size, sizeof(dram_usage))) ? 
-EFAULT : 0; +} + +static int hw_idle(struct hl_device *hdev, struct hl_info_args *args) +{ + struct hl_info_hw_idle hw_idle = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev, + hw_idle.busy_engines_mask_ext, + HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL); + hw_idle.busy_engines_mask = + lower_32_bits(hw_idle.busy_engines_mask_ext[0]); + + return copy_to_user(out, &hw_idle, + min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0; +} + +static int debug_coresight(struct hl_device *hdev, struct hl_ctx *ctx, struct hl_debug_args *args) +{ + struct hl_debug_params *params; + void *input = NULL, *output = NULL; + int rc; + + params = kzalloc(sizeof(*params), GFP_KERNEL); + if (!params) + return -ENOMEM; + + params->reg_idx = args->reg_idx; + params->enable = args->enable; + params->op = args->op; + + if (args->input_ptr && args->input_size) { + input = kzalloc(hl_debug_struct_size[args->op], GFP_KERNEL); + if (!input) { + rc = -ENOMEM; + goto out; + } + + if (copy_from_user(input, u64_to_user_ptr(args->input_ptr), + args->input_size)) { + rc = -EFAULT; + dev_err(hdev->dev, "failed to copy input debug data\n"); + goto out; + } + + params->input = input; + } + + if (args->output_ptr && args->output_size) { + output = kzalloc(args->output_size, GFP_KERNEL); + if (!output) { + rc = -ENOMEM; + goto out; + } + + params->output = output; + params->output_size = args->output_size; + } + + rc = hdev->asic_funcs->debug_coresight(hdev, ctx, params); + if (rc) { + dev_err(hdev->dev, + "debug coresight operation failed %d\n", rc); + goto out; + } + + if (output && copy_to_user((void __user *) (uintptr_t) args->output_ptr, + output, args->output_size)) { + dev_err(hdev->dev, "copy to user failed in debug ioctl\n"); + rc = -EFAULT; + goto out; + } + + +out: + kfree(params); + kfree(output); + kfree(input); + + return rc; +} + +static int device_utilization(struct hl_device *hdev, struct hl_info_args *args) +{ + struct hl_info_device_utilization device_util = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_device_utilization(hdev, &device_util.utilization); + if (rc) + return -EINVAL; + + return copy_to_user(out, &device_util, + min((size_t) max_size, sizeof(device_util))) ? -EFAULT : 0; +} + +static int get_clk_rate(struct hl_device *hdev, struct hl_info_args *args) +{ + struct hl_info_clk_rate clk_rate = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_get_clk_rate(hdev, &clk_rate.cur_clk_rate_mhz, &clk_rate.max_clk_rate_mhz); + if (rc) + return rc; + + return copy_to_user(out, &clk_rate, min_t(size_t, max_size, sizeof(clk_rate))) + ? -EFAULT : 0; +} + +static int get_reset_count(struct hl_device *hdev, struct hl_info_args *args) +{ + struct hl_info_reset_count reset_count = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + reset_count.hard_reset_cnt = hdev->reset_info.hard_reset_cnt; + reset_count.soft_reset_cnt = hdev->reset_info.compute_reset_cnt; + + return copy_to_user(out, &reset_count, + min((size_t) max_size, sizeof(reset_count))) ? 
-EFAULT : 0; +} + +static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args) +{ + struct hl_info_time_sync time_sync = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + time_sync.device_time = hdev->asic_funcs->get_device_time(hdev); + time_sync.host_time = ktime_get_raw_ns(); + + return copy_to_user(out, &time_sync, + min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0; +} + +static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_info_pci_counters pci_counters = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_cpucp_pci_counters_get(hdev, &pci_counters); + if (rc) + return rc; + + return copy_to_user(out, &pci_counters, + min((size_t) max_size, sizeof(pci_counters))) ? -EFAULT : 0; +} + +static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + struct hl_device *hdev = hpriv->hdev; + struct hl_info_clk_throttle clk_throttle = {0}; + ktime_t end_time, zero_time = ktime_set(0, 0); + u32 max_size = args->return_size; + int i; + + if ((!max_size) || (!out)) + return -EINVAL; + + mutex_lock(&hdev->clk_throttling.lock); + + clk_throttle.clk_throttling_reason = hdev->clk_throttling.current_reason; + + for (i = 0 ; i < HL_CLK_THROTTLE_TYPE_MAX ; i++) { + if (!(hdev->clk_throttling.aggregated_reason & BIT(i))) + continue; + + clk_throttle.clk_throttling_timestamp_us[i] = + ktime_to_us(hdev->clk_throttling.timestamp[i].start); + + if (ktime_compare(hdev->clk_throttling.timestamp[i].end, zero_time)) + end_time = hdev->clk_throttling.timestamp[i].end; + else + end_time = ktime_get(); + + clk_throttle.clk_throttling_duration_ns[i] = + ktime_to_ns(ktime_sub(end_time, + hdev->clk_throttling.timestamp[i].start)); + + } + mutex_unlock(&hdev->clk_throttling.lock); + + return copy_to_user(out, &clk_throttle, + min((size_t) max_size, sizeof(clk_throttle))) ? 
-EFAULT : 0; +} + +static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + struct hl_info_cs_counters cs_counters = {0}; + struct hl_device *hdev = hpriv->hdev; + struct hl_cs_counters_atomic *cntr; + u32 max_size = args->return_size; + + cntr = &hdev->aggregated_cs_counters; + + if ((!max_size) || (!out)) + return -EINVAL; + + cs_counters.total_out_of_mem_drop_cnt = + atomic64_read(&cntr->out_of_mem_drop_cnt); + cs_counters.total_parsing_drop_cnt = + atomic64_read(&cntr->parsing_drop_cnt); + cs_counters.total_queue_full_drop_cnt = + atomic64_read(&cntr->queue_full_drop_cnt); + cs_counters.total_device_in_reset_drop_cnt = + atomic64_read(&cntr->device_in_reset_drop_cnt); + cs_counters.total_max_cs_in_flight_drop_cnt = + atomic64_read(&cntr->max_cs_in_flight_drop_cnt); + cs_counters.total_validation_drop_cnt = + atomic64_read(&cntr->validation_drop_cnt); + + if (hpriv->ctx) { + cs_counters.ctx_out_of_mem_drop_cnt = + atomic64_read( + &hpriv->ctx->cs_counters.out_of_mem_drop_cnt); + cs_counters.ctx_parsing_drop_cnt = + atomic64_read( + &hpriv->ctx->cs_counters.parsing_drop_cnt); + cs_counters.ctx_queue_full_drop_cnt = + atomic64_read( + &hpriv->ctx->cs_counters.queue_full_drop_cnt); + cs_counters.ctx_device_in_reset_drop_cnt = + atomic64_read( + &hpriv->ctx->cs_counters.device_in_reset_drop_cnt); + cs_counters.ctx_max_cs_in_flight_drop_cnt = + atomic64_read( + &hpriv->ctx->cs_counters.max_cs_in_flight_drop_cnt); + cs_counters.ctx_validation_drop_cnt = + atomic64_read( + &hpriv->ctx->cs_counters.validation_drop_cnt); + } + + return copy_to_user(out, &cs_counters, + min((size_t) max_size, sizeof(cs_counters))) ? -EFAULT : 0; +} + +static int sync_manager_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_info_sync_manager sm_info = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + if (args->dcore_id >= HL_MAX_DCORES) + return -EINVAL; + + sm_info.first_available_sync_object = + prop->first_available_user_sob[args->dcore_id]; + sm_info.first_available_monitor = + prop->first_available_user_mon[args->dcore_id]; + sm_info.first_available_cq = + prop->first_available_cq[args->dcore_id]; + + return copy_to_user(out, &sm_info, min_t(size_t, (size_t) max_size, + sizeof(sm_info))) ? -EFAULT : 0; +} + +static int total_energy_consumption_info(struct hl_fpriv *hpriv, + struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_info_energy total_energy = {0}; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_cpucp_total_energy_get(hdev, + &total_energy.total_energy_consumption); + if (rc) + return rc; + + return copy_to_user(out, &total_energy, + min((size_t) max_size, sizeof(total_energy))) ? 
-EFAULT : 0; +} + +static int pll_frequency_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_pll_frequency_info freq_info = { {0} }; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_cpucp_pll_info_get(hdev, args->pll_index, freq_info.output); + if (rc) + return rc; + + return copy_to_user(out, &freq_info, + min((size_t) max_size, sizeof(freq_info))) ? -EFAULT : 0; +} + +static int power_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + struct hl_power_info power_info = {0}; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_cpucp_power_get(hdev, &power_info.power); + if (rc) + return rc; + + return copy_to_user(out, &power_info, + min((size_t) max_size, sizeof(power_info))) ? -EFAULT : 0; +} + +static int open_stats_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + struct hl_open_stats_info open_stats_info = {0}; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + open_stats_info.last_open_period_ms = jiffies64_to_msecs( + hdev->last_open_session_duration_jif); + open_stats_info.open_counter = hdev->open_counter; + open_stats_info.is_compute_ctx_active = hdev->is_compute_ctx_active; + open_stats_info.compute_ctx_in_release = hdev->compute_ctx_in_release; + + return copy_to_user(out, &open_stats_info, + min((size_t) max_size, sizeof(open_stats_info))) ? -EFAULT : 0; +} + +static int dram_pending_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + u32 pend_rows_num = 0; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_dram_pending_row_get(hdev, &pend_rows_num); + if (rc) + return rc; + + return copy_to_user(out, &pend_rows_num, + min_t(size_t, max_size, sizeof(pend_rows_num))) ? -EFAULT : 0; +} + +static int dram_replaced_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + struct cpucp_hbm_row_info info = {0}; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_dram_replaced_row_get(hdev, &info); + if (rc) + return rc; + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; +} + +static int last_err_open_dev_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_info_last_err_open_dev_time info = {0}; + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + info.timestamp = ktime_to_ns(hdev->last_successful_open_ktime); + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? 
-EFAULT : 0; +} + +static int cs_timeout_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_info_cs_timeout_event info = {0}; + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + info.seq = hdev->captured_err_info.cs_timeout.seq; + info.timestamp = ktime_to_ns(hdev->captured_err_info.cs_timeout.timestamp); + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; +} + +static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + struct hl_info_razwi_event info = {0}; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + info.timestamp = ktime_to_ns(hdev->captured_err_info.razwi.timestamp); + info.addr = hdev->captured_err_info.razwi.addr; + info.engine_id_1 = hdev->captured_err_info.razwi.engine_id_1; + info.engine_id_2 = hdev->captured_err_info.razwi.engine_id_2; + info.no_engine_id = hdev->captured_err_info.razwi.non_engine_initiator; + info.error_type = hdev->captured_err_info.razwi.type; + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; +} + +static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + struct hl_info_undefined_opcode_event info = {0}; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + info.timestamp = ktime_to_ns(hdev->captured_err_info.undef_opcode.timestamp); + info.engine_id = hdev->captured_err_info.undef_opcode.engine_id; + info.cq_addr = hdev->captured_err_info.undef_opcode.cq_addr; + info.cq_size = hdev->captured_err_info.undef_opcode.cq_size; + info.stream_id = hdev->captured_err_info.undef_opcode.stream_id; + info.cb_addr_streams_len = hdev->captured_err_info.undef_opcode.cb_addr_streams_len; + memcpy(info.cb_addr_streams, hdev->captured_err_info.undef_opcode.cb_addr_streams, + sizeof(info.cb_addr_streams)); + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; +} + +static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + struct hl_info_dev_memalloc_page_sizes info = {0}; + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + + if ((!max_size) || (!out)) + return -EINVAL; + + /* + * Future ASICs that will support multiple DRAM page sizes will support only "powers of 2" + * pages (unlike some of the earlier ASICs that support multiple page sizes). + * For this reason, for all ASICs that do not support multiple page sizes, the function + * returns an empty bitmask, indicating that multiple page sizes are not supported. + */ + info.page_order_bitmask = hdev->asic_prop.dmmu.supported_pages_mask; + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ?
-EFAULT : 0; +} + +static int sec_attest_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + struct cpucp_sec_attest_info *sec_attest_info; + struct hl_info_sec_attest *info; + u32 max_size = args->return_size; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + sec_attest_info = kmalloc(sizeof(*sec_attest_info), GFP_KERNEL); + if (!sec_attest_info) + return -ENOMEM; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + rc = -ENOMEM; + goto free_sec_attest_info; + } + + rc = hl_fw_get_sec_attest_info(hpriv->hdev, sec_attest_info, args->sec_attest_nonce); + if (rc) + goto free_info; + + info->nonce = le32_to_cpu(sec_attest_info->nonce); + info->pcr_quote_len = le16_to_cpu(sec_attest_info->pcr_quote_len); + info->pub_data_len = le16_to_cpu(sec_attest_info->pub_data_len); + info->certificate_len = le16_to_cpu(sec_attest_info->certificate_len); + info->pcr_num_reg = sec_attest_info->pcr_num_reg; + info->pcr_reg_len = sec_attest_info->pcr_reg_len; + info->quote_sig_len = sec_attest_info->quote_sig_len; + memcpy(&info->pcr_data, &sec_attest_info->pcr_data, sizeof(info->pcr_data)); + memcpy(&info->pcr_quote, &sec_attest_info->pcr_quote, sizeof(info->pcr_quote)); + memcpy(&info->public_data, &sec_attest_info->public_data, sizeof(info->public_data)); + memcpy(&info->certificate, &sec_attest_info->certificate, sizeof(info->certificate)); + memcpy(&info->quote_sig, &sec_attest_info->quote_sig, sizeof(info->quote_sig)); + + rc = copy_to_user(out, info, + min_t(size_t, max_size, sizeof(*info))) ? -EFAULT : 0; + +free_info: + kfree(info); +free_sec_attest_info: + kfree(sec_attest_info); + + return rc; +} + +static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + int rc; + + /* check if there is already a registered on that process */ + mutex_lock(&hpriv->notifier_event.lock); + if (hpriv->notifier_event.eventfd) { + mutex_unlock(&hpriv->notifier_event.lock); + return -EINVAL; + } + + hpriv->notifier_event.eventfd = eventfd_ctx_fdget(args->eventfd); + if (IS_ERR(hpriv->notifier_event.eventfd)) { + rc = PTR_ERR(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = NULL; + mutex_unlock(&hpriv->notifier_event.lock); + return rc; + } + + mutex_unlock(&hpriv->notifier_event.lock); + return 0; +} + +static int eventfd_unregister(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + mutex_lock(&hpriv->notifier_event.lock); + if (!hpriv->notifier_event.eventfd) { + mutex_unlock(&hpriv->notifier_event.lock); + return -EINVAL; + } + + eventfd_ctx_put(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = NULL; + mutex_unlock(&hpriv->notifier_event.lock); + return 0; +} + +static int engine_status_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + u32 status_buf_size = args->return_size; + struct hl_device *hdev = hpriv->hdev; + struct engines_data eng_data; + int rc; + + if ((status_buf_size < SZ_1K) || (status_buf_size > HL_ENGINES_DATA_MAX_SIZE) || (!out)) + return -EINVAL; + + eng_data.actual_size = 0; + eng_data.allocated_buf_size = status_buf_size; + eng_data.buf = vmalloc(status_buf_size); + if (!eng_data.buf) + return -ENOMEM; + + hdev->asic_funcs->is_device_idle(hdev, NULL, 0, &eng_data); + + if (eng_data.actual_size > eng_data.allocated_buf_size) { + dev_err(hdev->dev, + "Engines data size (%d Bytes) is bigger than allocated size (%u Bytes)\n", + eng_data.actual_size, 
status_buf_size); + vfree(eng_data.buf); + return -ENOMEM; + } + + args->user_buffer_actual_size = eng_data.actual_size; + rc = copy_to_user(out, eng_data.buf, min_t(size_t, status_buf_size, eng_data.actual_size)) ? + -EFAULT : 0; + + vfree(eng_data.buf); + + return rc; +} + +static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, + struct device *dev) +{ + enum hl_device_status status; + struct hl_info_args *args = data; + struct hl_device *hdev = hpriv->hdev; + + int rc; + + /* + * Information is returned for the following opcodes even if the device + * is disabled or in reset. + */ + switch (args->op) { + case HL_INFO_HW_IP_INFO: + return hw_ip_info(hdev, args); + + case HL_INFO_DEVICE_STATUS: + return device_status_info(hdev, args); + + case HL_INFO_RESET_COUNT: + return get_reset_count(hdev, args); + + case HL_INFO_HW_EVENTS: + return hw_events_info(hdev, false, args); + + case HL_INFO_HW_EVENTS_AGGREGATE: + return hw_events_info(hdev, true, args); + + case HL_INFO_CS_COUNTERS: + return cs_counters_info(hpriv, args); + + case HL_INFO_CLK_THROTTLE_REASON: + return clk_throttle_info(hpriv, args); + + case HL_INFO_SYNC_MANAGER: + return sync_manager_info(hpriv, args); + + case HL_INFO_OPEN_STATS: + return open_stats_info(hpriv, args); + + case HL_INFO_LAST_ERR_OPEN_DEV_TIME: + return last_err_open_dev_info(hpriv, args); + + case HL_INFO_CS_TIMEOUT_EVENT: + return cs_timeout_info(hpriv, args); + + case HL_INFO_RAZWI_EVENT: + return razwi_info(hpriv, args); + + case HL_INFO_UNDEFINED_OPCODE_EVENT: + return undefined_opcode_info(hpriv, args); + + case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES: + return dev_mem_alloc_page_sizes_info(hpriv, args); + + case HL_INFO_GET_EVENTS: + return events_info(hpriv, args); + + default: + break; + } + + if (!hl_device_operational(hdev, &status)) { + dev_warn_ratelimited(dev, + "Device is %s. 
Can't execute INFO IOCTL\n", + hdev->status[status]); + return -EBUSY; + } + + switch (args->op) { + case HL_INFO_DRAM_USAGE: + rc = dram_usage_info(hpriv, args); + break; + + case HL_INFO_HW_IDLE: + rc = hw_idle(hdev, args); + break; + + case HL_INFO_DEVICE_UTILIZATION: + rc = device_utilization(hdev, args); + break; + + case HL_INFO_CLK_RATE: + rc = get_clk_rate(hdev, args); + break; + + case HL_INFO_TIME_SYNC: + return time_sync_info(hdev, args); + + case HL_INFO_PCI_COUNTERS: + return pci_counters_info(hpriv, args); + + case HL_INFO_TOTAL_ENERGY: + return total_energy_consumption_info(hpriv, args); + + case HL_INFO_PLL_FREQUENCY: + return pll_frequency_info(hpriv, args); + + case HL_INFO_POWER: + return power_info(hpriv, args); + + + case HL_INFO_DRAM_REPLACED_ROWS: + return dram_replaced_rows_info(hpriv, args); + + case HL_INFO_DRAM_PENDING_ROWS: + return dram_pending_rows_info(hpriv, args); + + case HL_INFO_SECURED_ATTESTATION: + return sec_attest_info(hpriv, args); + + case HL_INFO_REGISTER_EVENTFD: + return eventfd_register(hpriv, args); + + case HL_INFO_UNREGISTER_EVENTFD: + return eventfd_unregister(hpriv, args); + + case HL_INFO_ENGINE_STATUS: + return engine_status_info(hpriv, args); + + default: + dev_err(dev, "Invalid request %d\n", args->op); + rc = -EINVAL; + break; + } + + return rc; +} + +static int hl_info_ioctl(struct hl_fpriv *hpriv, void *data) +{ + return _hl_info_ioctl(hpriv, data, hpriv->hdev->dev); +} + +static int hl_info_ioctl_control(struct hl_fpriv *hpriv, void *data) +{ + return _hl_info_ioctl(hpriv, data, hpriv->hdev->dev_ctrl); +} + +static int hl_debug_ioctl(struct hl_fpriv *hpriv, void *data) +{ + struct hl_debug_args *args = data; + struct hl_device *hdev = hpriv->hdev; + enum hl_device_status status; + + int rc = 0; + + if (!hl_device_operational(hdev, &status)) { + dev_warn_ratelimited(hdev->dev, + "Device is %s. 
Can't execute DEBUG IOCTL\n", + hdev->status[status]); + return -EBUSY; + } + + switch (args->op) { + case HL_DEBUG_OP_ETR: + case HL_DEBUG_OP_ETF: + case HL_DEBUG_OP_STM: + case HL_DEBUG_OP_FUNNEL: + case HL_DEBUG_OP_BMON: + case HL_DEBUG_OP_SPMU: + case HL_DEBUG_OP_TIMESTAMP: + if (!hdev->in_debug) { + dev_err_ratelimited(hdev->dev, + "Rejecting debug configuration request because device not in debug mode\n"); + return -EFAULT; + } + args->input_size = min(args->input_size, hl_debug_struct_size[args->op]); + rc = debug_coresight(hdev, hpriv->ctx, args); + break; + + case HL_DEBUG_OP_SET_MODE: + rc = hl_device_set_debug_mode(hdev, hpriv->ctx, (bool) args->enable); + break; + + default: + dev_err(hdev->dev, "Invalid request %d\n", args->op); + rc = -EINVAL; + break; + } + + return rc; +} + +#define HL_IOCTL_DEF(ioctl, _func) \ + [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func} + +static const struct hl_ioctl_desc hl_ioctls[] = { + HL_IOCTL_DEF(HL_IOCTL_INFO, hl_info_ioctl), + HL_IOCTL_DEF(HL_IOCTL_CB, hl_cb_ioctl), + HL_IOCTL_DEF(HL_IOCTL_CS, hl_cs_ioctl), + HL_IOCTL_DEF(HL_IOCTL_WAIT_CS, hl_wait_ioctl), + HL_IOCTL_DEF(HL_IOCTL_MEMORY, hl_mem_ioctl), + HL_IOCTL_DEF(HL_IOCTL_DEBUG, hl_debug_ioctl) +}; + +static const struct hl_ioctl_desc hl_ioctls_control[] = { + HL_IOCTL_DEF(HL_IOCTL_INFO, hl_info_ioctl_control) +}; + +static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg, + const struct hl_ioctl_desc *ioctl, struct device *dev) +{ + struct hl_fpriv *hpriv = filep->private_data; + unsigned int nr = _IOC_NR(cmd); + char stack_kdata[128] = {0}; + char *kdata = NULL; + unsigned int usize, asize; + hl_ioctl_t *func; + u32 hl_size; + int retcode; + + /* Do not trust userspace, use our own definition */ + func = ioctl->func; + + if (unlikely(!func)) { + dev_dbg(dev, "no function\n"); + retcode = -ENOTTY; + goto out_err; + } + + hl_size = _IOC_SIZE(ioctl->cmd); + usize = asize = _IOC_SIZE(cmd); + if (hl_size > asize) + asize = hl_size; + + cmd = ioctl->cmd; + + if (cmd & (IOC_IN | IOC_OUT)) { + if (asize <= sizeof(stack_kdata)) { + kdata = stack_kdata; + } else { + kdata = kzalloc(asize, GFP_KERNEL); + if (!kdata) { + retcode = -ENOMEM; + goto out_err; + } + } + } + + if (cmd & IOC_IN) { + if (copy_from_user(kdata, (void __user *)arg, usize)) { + retcode = -EFAULT; + goto out_err; + } + } else if (cmd & IOC_OUT) { + memset(kdata, 0, usize); + } + + retcode = func(hpriv, kdata); + + if ((cmd & IOC_OUT) && copy_to_user((void __user *)arg, kdata, usize)) + retcode = -EFAULT; + +out_err: + if (retcode) + dev_dbg(dev, "error in ioctl: pid=%d, cmd=0x%02x, nr=0x%02x\n", + task_pid_nr(current), cmd, nr); + + if (kdata != stack_kdata) + kfree(kdata); + + return retcode; +} + +long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct hl_fpriv *hpriv = filep->private_data; + struct hl_device *hdev = hpriv->hdev; + const struct hl_ioctl_desc *ioctl = NULL; + unsigned int nr = _IOC_NR(cmd); + + if (!hdev) { + pr_err_ratelimited("Sending ioctl after device was removed! 
Please close FD\n"); + return -ENODEV; + } + + if ((nr >= HL_COMMAND_START) && (nr < HL_COMMAND_END)) { + ioctl = &hl_ioctls[nr]; + } else { + dev_err(hdev->dev, "invalid ioctl: pid=%d, nr=0x%02x\n", + task_pid_nr(current), nr); + return -ENOTTY; + } + + return _hl_ioctl(filep, cmd, arg, ioctl, hdev->dev); +} + +long hl_ioctl_control(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct hl_fpriv *hpriv = filep->private_data; + struct hl_device *hdev = hpriv->hdev; + const struct hl_ioctl_desc *ioctl = NULL; + unsigned int nr = _IOC_NR(cmd); + + if (!hdev) { + pr_err_ratelimited("Sending ioctl after device was removed! Please close FD\n"); + return -ENODEV; + } + + if (nr == _IOC_NR(HL_IOCTL_INFO)) { + ioctl = &hl_ioctls_control[nr]; + } else { + dev_err(hdev->dev_ctrl, "invalid ioctl: pid=%d, nr=0x%02x\n", + task_pid_nr(current), nr); + return -ENOTTY; + } + + return _hl_ioctl(filep, cmd, arg, ioctl, hdev->dev_ctrl); +} diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c new file mode 100644 index 000000000..d0087c0ec --- /dev/null +++ b/drivers/misc/habanalabs/common/hw_queue.c @@ -0,0 +1,1137 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +#include <linux/slab.h> + +/* + * hl_queue_add_ptr - add to pi or ci and checks if it wraps around + * + * @ptr: the current pi/ci value + * @val: the amount to add + * + * Add val to ptr. It can go until twice the queue length. + */ +inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val) +{ + ptr += val; + ptr &= ((HL_QUEUE_LENGTH << 1) - 1); + return ptr; +} +static inline int queue_ci_get(atomic_t *ci, u32 queue_len) +{ + return atomic_read(ci) & ((queue_len << 1) - 1); +} + +static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len) +{ + int delta = (q->pi - queue_ci_get(&q->ci, queue_len)); + + if (delta >= 0) + return (queue_len - delta); + else + return (abs(delta) - queue_len); +} + +void hl_hw_queue_update_ci(struct hl_cs *cs) +{ + struct hl_device *hdev = cs->ctx->hdev; + struct hl_hw_queue *q; + int i; + + if (hdev->disabled) + return; + + q = &hdev->kernel_queues[0]; + + /* There are no internal queues if H/W queues are being used */ + if (!hdev->asic_prop.max_queues || q->queue_type == QUEUE_TYPE_HW) + return; + + /* We must increment CI for every queue that will never get a + * completion, there are 2 scenarios this can happen: + * 1. All queues of a non completion CS will never get a completion. + * 2. Internal queues never gets completion. + */ + for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) { + if (!cs_needs_completion(cs) || q->queue_type == QUEUE_TYPE_INT) + atomic_add(cs->jobs_in_queue_cnt[i], &q->ci); + } +} + +/* + * hl_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a + * H/W queue. + * @hdev: pointer to habanalabs device structure + * @q: pointer to habanalabs queue structure + * @ctl: BD's control word + * @len: BD's length + * @ptr: BD's pointer + * + * This function assumes there is enough space on the queue to submit a new + * BD to it. 
It initializes the next BD and calls the device specific + * function to set the pi (and doorbell) + * + * This function must be called when the scheduler mutex is taken + * + */ +void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, + u32 ctl, u32 len, u64 ptr) +{ + struct hl_bd *bd; + + bd = q->kernel_address; + bd += hl_pi_2_offset(q->pi); + bd->ctl = cpu_to_le32(ctl); + bd->len = cpu_to_le32(len); + bd->ptr = cpu_to_le64(ptr); + + q->pi = hl_queue_inc_ptr(q->pi); + hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi); +} + +/* + * ext_queue_sanity_checks - perform some sanity checks on external queue + * + * @hdev : pointer to hl_device structure + * @q : pointer to hl_hw_queue structure + * @num_of_entries : how many entries to check for space + * @reserve_cq_entry : whether to reserve an entry in the cq + * + * H/W queues spinlock should be taken before calling this function + * + * Perform the following: + * - Make sure we have enough space in the h/w queue + * - Make sure we have enough space in the completion queue + * - Reserve space in the completion queue (needs to be reversed if there + * is a failure down the road before the actual submission of work). Only + * do this action if reserve_cq_entry is true + * + */ +static int ext_queue_sanity_checks(struct hl_device *hdev, + struct hl_hw_queue *q, int num_of_entries, + bool reserve_cq_entry) +{ + atomic_t *free_slots = + &hdev->completion_queue[q->cq_id].free_slots_cnt; + int free_slots_cnt; + + /* Check we have enough space in the queue */ + free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH); + + if (free_slots_cnt < num_of_entries) { + dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n", + q->hw_queue_id, num_of_entries); + return -EAGAIN; + } + + if (reserve_cq_entry) { + /* + * Check we have enough space in the completion queue + * Add -1 to counter (decrement) unless counter was already 0 + * In that case, CQ is full so we can't submit a new CB because + * we won't get ack on its completion + * atomic_add_unless will return 0 if counter was already 0 + */ + if (atomic_add_negative(num_of_entries * -1, free_slots)) { + dev_dbg(hdev->dev, "No space for %d on CQ %d\n", + num_of_entries, q->hw_queue_id); + atomic_add(num_of_entries, free_slots); + return -EAGAIN; + } + } + + return 0; +} + +/* + * int_queue_sanity_checks - perform some sanity checks on internal queue + * + * @hdev : pointer to hl_device structure + * @q : pointer to hl_hw_queue structure + * @num_of_entries : how many entries to check for space + * + * H/W queues spinlock should be taken before calling this function + * + * Perform the following: + * - Make sure we have enough space in the h/w queue + * + */ +static int int_queue_sanity_checks(struct hl_device *hdev, + struct hl_hw_queue *q, + int num_of_entries) +{ + int free_slots_cnt; + + if (num_of_entries > q->int_queue_len) { + dev_err(hdev->dev, + "Cannot populate queue %u with %u jobs\n", + q->hw_queue_id, num_of_entries); + return -ENOMEM; + } + + /* Check we have enough space in the queue */ + free_slots_cnt = queue_free_slots(q, q->int_queue_len); + + if (free_slots_cnt < num_of_entries) { + dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n", + q->hw_queue_id, num_of_entries); + return -EAGAIN; + } + + return 0; +} + +/* + * hw_queue_sanity_checks() - Make sure we have enough space in the h/w queue + * @hdev: Pointer to hl_device structure. + * @q: Pointer to hl_hw_queue structure. + * @num_of_entries: How many entries to check for space. 
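+ *
+ * As an illustrative example of the space check (hypothetical numbers,
+ * assuming a queue length of 256): the PI and CI counters run over twice
+ * the queue length, so PI = 300 and CI = 260 give delta = 40 and
+ * 256 - 40 = 216 free slots, while PI = 10 and CI = 500 give delta = -490
+ * and 490 - 256 = 234 free slots. A CS needing more entries than the free
+ * count is rejected with -EAGAIN.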
+ * + * Notice: We do not reserve queue entries so this function mustn't be called + * more than once per CS for the same queue + * + */ +static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q, + int num_of_entries) +{ + int free_slots_cnt; + + /* Check we have enough space in the queue */ + free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH); + + if (free_slots_cnt < num_of_entries) { + dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n", + q->hw_queue_id, num_of_entries); + return -EAGAIN; + } + + return 0; +} + +/* + * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion + * + * @hdev: pointer to hl_device structure + * @hw_queue_id: Queue's type + * @cb_size: size of CB + * @cb_ptr: pointer to CB location + * + * This function sends a single CB, that must NOT generate a completion entry. + * Sending CPU messages can be done instead via 'hl_hw_queue_submit_bd()' + */ +int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, + u32 cb_size, u64 cb_ptr) +{ + struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id]; + int rc = 0; + + hdev->asic_funcs->hw_queues_lock(hdev); + + if (hdev->disabled) { + rc = -EPERM; + goto out; + } + + /* + * hl_hw_queue_send_cb_no_cmpl() is called for queues of a H/W queue + * type only on init phase, when the queues are empty and being tested, + * so there is no need for sanity checks. + */ + if (q->queue_type != QUEUE_TYPE_HW) { + rc = ext_queue_sanity_checks(hdev, q, 1, false); + if (rc) + goto out; + } + + hl_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr); + +out: + hdev->asic_funcs->hw_queues_unlock(hdev); + + return rc; +} + +/* + * ext_queue_schedule_job - submit a JOB to an external queue + * + * @job: pointer to the job that needs to be submitted to the queue + * + * This function must be called when the scheduler mutex is taken + * + */ +static void ext_queue_schedule_job(struct hl_cs_job *job) +{ + struct hl_device *hdev = job->cs->ctx->hdev; + struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; + struct hl_cq_entry cq_pkt; + struct hl_cq *cq; + u64 cq_addr; + struct hl_cb *cb; + u32 ctl; + u32 len; + u64 ptr; + + /* + * Update the JOB ID inside the BD CTL so the device would know what + * to write in the completion queue + */ + ctl = ((q->pi << BD_CTL_SHADOW_INDEX_SHIFT) & BD_CTL_SHADOW_INDEX_MASK); + + cb = job->patched_cb; + len = job->job_cb_size; + ptr = cb->bus_address; + + /* Skip completion flow in case this is a non completion CS */ + if (!cs_needs_completion(job->cs)) + goto submit_bd; + + cq_pkt.data = cpu_to_le32( + ((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT) + & CQ_ENTRY_SHADOW_INDEX_MASK) | + FIELD_PREP(CQ_ENTRY_SHADOW_INDEX_VALID_MASK, 1) | + FIELD_PREP(CQ_ENTRY_READY_MASK, 1)); + + /* + * No need to protect pi_offset because scheduling to the + * H/W queues is done under the scheduler mutex + * + * No need to check if CQ is full because it was already + * checked in ext_queue_sanity_checks + */ + cq = &hdev->completion_queue[q->cq_id]; + cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry); + + hdev->asic_funcs->add_end_of_cb_packets(hdev, cb->kernel_address, len, + job->user_cb_size, + cq_addr, + le32_to_cpu(cq_pkt.data), + q->msi_vec, + job->contains_dma_pkt); + + q->shadow_queue[hl_pi_2_offset(q->pi)] = job; + + cq->pi = hl_cq_inc_ptr(cq->pi); + +submit_bd: + hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr); +} + +/* + * int_queue_schedule_job - submit a JOB to an internal queue + * + * @job: pointer to the job that needs to be 
submitted to the queue + * + * This function must be called when the scheduler mutex is taken + * + */ +static void int_queue_schedule_job(struct hl_cs_job *job) +{ + struct hl_device *hdev = job->cs->ctx->hdev; + struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; + struct hl_bd bd; + __le64 *pi; + + bd.ctl = 0; + bd.len = cpu_to_le32(job->job_cb_size); + + if (job->is_kernel_allocated_cb) + /* bus_address is actually a mmu mapped address + * allocated from an internal pool + */ + bd.ptr = cpu_to_le64(job->user_cb->bus_address); + else + bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb); + + pi = q->kernel_address + (q->pi & (q->int_queue_len - 1)) * sizeof(bd); + + q->pi++; + q->pi &= ((q->int_queue_len << 1) - 1); + + hdev->asic_funcs->pqe_write(hdev, pi, &bd); + + hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi); +} + +/* + * hw_queue_schedule_job - submit a JOB to a H/W queue + * + * @job: pointer to the job that needs to be submitted to the queue + * + * This function must be called when the scheduler mutex is taken + * + */ +static void hw_queue_schedule_job(struct hl_cs_job *job) +{ + struct hl_device *hdev = job->cs->ctx->hdev; + struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; + u64 ptr; + u32 offset, ctl, len; + + /* + * Upon PQE completion, COMP_DATA is used as the write data to the + * completion queue (QMAN HBW message), and COMP_OFFSET is used as the + * write address offset in the SM block (QMAN LBW message). + * The write address offset is calculated as "COMP_OFFSET << 2". + */ + offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1); + ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) | + ((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK); + + len = job->job_cb_size; + + /* + * A patched CB is created only if a user CB was allocated by driver and + * MMU is disabled. If MMU is enabled, the user CB should be used + * instead. If the user CB wasn't allocated by driver, assume that it + * holds an address. 
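+	 *
+	 * Purely illustrative example of the completion routing described
+	 * above (assuming max_pending_cs is 64): a CS with sequence 70 gets
+	 * COMP_OFFSET = 70 & 63 = 6, so on PQE completion the device writes
+	 * COMP_DATA (the queue PI at submission time) to the SM block at
+	 * byte offset 6 << 2 = 24.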
+	 */
+	if (job->patched_cb)
+		ptr = job->patched_cb->bus_address;
+	else if (job->is_kernel_allocated_cb)
+		ptr = job->user_cb->bus_address;
+	else
+		ptr = (u64) (uintptr_t) job->user_cb;
+
+	hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
+}
+
+static int init_signal_cs(struct hl_device *hdev,
+		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
+{
+	struct hl_sync_stream_properties *prop;
+	struct hl_hw_sob *hw_sob;
+	u32 q_idx;
+	int rc = 0;
+
+	q_idx = job->hw_queue_id;
+	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
+	hw_sob = &prop->hw_sob[prop->curr_sob_offset];
+
+	cs_cmpl->hw_sob = hw_sob;
+	cs_cmpl->sob_val = prop->next_sob_val;
+
+	dev_dbg(hdev->dev,
+		"generate signal CB, sob_id: %d, sob val: %u, q_idx: %d, seq: %llu\n",
+		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx,
+		cs_cmpl->cs_seq);
+
+	/* we set an EB since we must make sure all operations are done
+	 * when sending the signal
+	 */
+	hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb,
+				cs_cmpl->hw_sob->sob_id, 0, true);
+
+	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1,
+								false);
+
+	job->cs->sob_addr_offset = hw_sob->sob_addr;
+	job->cs->initial_sob_count = prop->next_sob_val - 1;
+
+	return rc;
+}
+
+void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev,
+			struct hl_cs *cs, struct hl_cs_job *job,
+			struct hl_cs_compl *cs_cmpl)
+{
+	struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl;
+	u32 offset = 0;
+
+	cs_cmpl->hw_sob = handle->hw_sob;
+
+	/* Note that encaps_sig_wait_offset was validated earlier in the flow
+	 * against the max reserved signal count.
+	 * We always decrement the offset by 1: if the user sets offset 1, it
+	 * means "wait only for the first signal", which is pre_sob_val; if
+	 * the user sets offset 2, the required value is (pre_sob_val + 1),
+	 * and so on. If the user sets a wait offset of 0, treat it as a
+	 * legacy wait CS, i.e. wait for the next signal.
+	 */
+	if (job->encaps_sig_wait_offset)
+		offset = job->encaps_sig_wait_offset - 1;
+
+	cs_cmpl->sob_val = handle->pre_sob_val + offset;
+}
+
+static int init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
+		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
+{
+	struct hl_gen_wait_properties wait_prop;
+	struct hl_sync_stream_properties *prop;
+	struct hl_cs_compl *signal_cs_cmpl;
+	u32 q_idx;
+
+	q_idx = job->hw_queue_id;
+	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
+
+	signal_cs_cmpl = container_of(cs->signal_fence,
+					struct hl_cs_compl,
+					base_fence);
+
+	if (cs->encaps_signals) {
+		/* use the encaps signal handle stored earlier in the flow
+		 * and set the SOB information from the encaps
+		 * signals handle
+		 */
+		hl_hw_queue_encaps_sig_set_sob_info(hdev, cs, job, cs_cmpl);
+
+		dev_dbg(hdev->dev, "Wait for encaps signals handle, qidx(%u), CS sequence(%llu), sob val: 0x%x, offset: %u\n",
+				cs->encaps_sig_hdl->q_idx,
+				cs->encaps_sig_hdl->cs_seq,
+				cs_cmpl->sob_val,
+				job->encaps_sig_wait_offset);
+	} else {
+		/* Copy the SOB id and value of the signal CS */
+		cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
+		cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+	}
+
+	/* Check again whether the signal CS has already completed.
+	 * If it has, don't send any wait CS since the hw_sob could already
+	 * be in reset. If the signal has not completed, take a refcount on
+	 * the hw_sob to prevent it from being reset while the wait CS is
+	 * not yet submitted.
+ * note that this check is protected by two locks, + * hw queue lock and completion object lock, + * and the same completion object lock also protects + * the hw_sob reset handler function. + * The hw_queue lock prevent out of sync of hw_sob + * refcount value, changed by signal/wait flows. + */ + spin_lock(&signal_cs_cmpl->lock); + + if (completion_done(&cs->signal_fence->completion)) { + spin_unlock(&signal_cs_cmpl->lock); + return -EINVAL; + } + + kref_get(&cs_cmpl->hw_sob->kref); + + spin_unlock(&signal_cs_cmpl->lock); + + dev_dbg(hdev->dev, + "generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d, seq: %llu\n", + cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, + prop->base_mon_id, q_idx, cs->sequence); + + wait_prop.data = (void *) job->patched_cb; + wait_prop.sob_base = cs_cmpl->hw_sob->sob_id; + wait_prop.sob_mask = 0x1; + wait_prop.sob_val = cs_cmpl->sob_val; + wait_prop.mon_id = prop->base_mon_id; + wait_prop.q_idx = q_idx; + wait_prop.size = 0; + + hdev->asic_funcs->gen_wait_cb(hdev, &wait_prop); + + mb(); + hl_fence_put(cs->signal_fence); + cs->signal_fence = NULL; + + return 0; +} + +/* + * init_signal_wait_cs - initialize a signal/wait CS + * @cs: pointer to the signal/wait CS + * + * H/W queues spinlock should be taken before calling this function + */ +static int init_signal_wait_cs(struct hl_cs *cs) +{ + struct hl_ctx *ctx = cs->ctx; + struct hl_device *hdev = ctx->hdev; + struct hl_cs_job *job; + struct hl_cs_compl *cs_cmpl = + container_of(cs->fence, struct hl_cs_compl, base_fence); + int rc = 0; + + /* There is only one job in a signal/wait CS */ + job = list_first_entry(&cs->job_list, struct hl_cs_job, + cs_node); + + if (cs->type & CS_TYPE_SIGNAL) + rc = init_signal_cs(hdev, job, cs_cmpl); + else if (cs->type & CS_TYPE_WAIT) + rc = init_wait_cs(hdev, cs, job, cs_cmpl); + + return rc; +} + +static int encaps_sig_first_staged_cs_handler + (struct hl_device *hdev, struct hl_cs *cs) +{ + struct hl_cs_compl *cs_cmpl = + container_of(cs->fence, + struct hl_cs_compl, base_fence); + struct hl_cs_encaps_sig_handle *encaps_sig_hdl; + struct hl_encaps_signals_mgr *mgr; + int rc = 0; + + mgr = &cs->ctx->sig_mgr; + + spin_lock(&mgr->lock); + encaps_sig_hdl = idr_find(&mgr->handles, cs->encaps_sig_hdl_id); + if (encaps_sig_hdl) { + /* + * Set handler CS sequence, + * the CS which contains the encapsulated signals. + */ + encaps_sig_hdl->cs_seq = cs->sequence; + /* store the handle and set encaps signal indication, + * to be used later in cs_do_release to put the last + * reference to encaps signals handlers. 
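+		 * As an illustrative example (hypothetical numbers): a handle
+		 * that reserved count = 4 signals on top of pre_sob_val = 10
+		 * publishes sob_val = 14 below, while a wait CS using that
+		 * handle with encaps_sig_wait_offset = 2 waits for
+		 * sob_val = 10 + (2 - 1) = 11.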
+ */ + cs_cmpl->encaps_signals = true; + cs_cmpl->encaps_sig_hdl = encaps_sig_hdl; + + /* set hw_sob pointer in completion object + * since it's used in cs_do_release flow to put + * refcount to sob + */ + cs_cmpl->hw_sob = encaps_sig_hdl->hw_sob; + cs_cmpl->sob_val = encaps_sig_hdl->pre_sob_val + + encaps_sig_hdl->count; + + dev_dbg(hdev->dev, "CS seq (%llu) added to encaps signal handler id (%u), count(%u), qidx(%u), sob(%u), val(%u)\n", + cs->sequence, encaps_sig_hdl->id, + encaps_sig_hdl->count, + encaps_sig_hdl->q_idx, + cs_cmpl->hw_sob->sob_id, + cs_cmpl->sob_val); + + } else { + dev_err(hdev->dev, "encaps handle id(%u) wasn't found!\n", + cs->encaps_sig_hdl_id); + rc = -EINVAL; + } + + spin_unlock(&mgr->lock); + + return rc; +} + +/* + * hl_hw_queue_schedule_cs - schedule a command submission + * @cs: pointer to the CS + */ +int hl_hw_queue_schedule_cs(struct hl_cs *cs) +{ + enum hl_device_status status; + struct hl_cs_counters_atomic *cntr; + struct hl_ctx *ctx = cs->ctx; + struct hl_device *hdev = ctx->hdev; + struct hl_cs_job *job, *tmp; + struct hl_hw_queue *q; + int rc = 0, i, cq_cnt; + bool first_entry; + u32 max_queues; + + cntr = &hdev->aggregated_cs_counters; + + hdev->asic_funcs->hw_queues_lock(hdev); + + if (!hl_device_operational(hdev, &status)) { + atomic64_inc(&cntr->device_in_reset_drop_cnt); + atomic64_inc(&ctx->cs_counters.device_in_reset_drop_cnt); + dev_err(hdev->dev, + "device is %s, CS rejected!\n", hdev->status[status]); + rc = -EPERM; + goto out; + } + + max_queues = hdev->asic_prop.max_queues; + + q = &hdev->kernel_queues[0]; + for (i = 0, cq_cnt = 0 ; i < max_queues ; i++, q++) { + if (cs->jobs_in_queue_cnt[i]) { + switch (q->queue_type) { + case QUEUE_TYPE_EXT: + rc = ext_queue_sanity_checks(hdev, q, + cs->jobs_in_queue_cnt[i], + cs_needs_completion(cs) ? 
+ true : false); + break; + case QUEUE_TYPE_INT: + rc = int_queue_sanity_checks(hdev, q, + cs->jobs_in_queue_cnt[i]); + break; + case QUEUE_TYPE_HW: + rc = hw_queue_sanity_checks(hdev, q, + cs->jobs_in_queue_cnt[i]); + break; + default: + dev_err(hdev->dev, "Queue type %d is invalid\n", + q->queue_type); + rc = -EINVAL; + break; + } + + if (rc) { + atomic64_inc( + &ctx->cs_counters.queue_full_drop_cnt); + atomic64_inc(&cntr->queue_full_drop_cnt); + goto unroll_cq_resv; + } + + if (q->queue_type == QUEUE_TYPE_EXT) + cq_cnt++; + } + } + + if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT)) { + rc = init_signal_wait_cs(cs); + if (rc) + goto unroll_cq_resv; + } else if (cs->type == CS_TYPE_COLLECTIVE_WAIT) { + rc = hdev->asic_funcs->collective_wait_init_cs(cs); + if (rc) + goto unroll_cq_resv; + } + + rc = hdev->asic_funcs->pre_schedule_cs(cs); + if (rc) { + dev_err(hdev->dev, + "Failed in pre-submission operations of CS %d.%llu\n", + ctx->asid, cs->sequence); + goto unroll_cq_resv; + } + + hdev->shadow_cs_queue[cs->sequence & + (hdev->asic_prop.max_pending_cs - 1)] = cs; + + if (cs->encaps_signals && cs->staged_first) { + rc = encaps_sig_first_staged_cs_handler(hdev, cs); + if (rc) + goto unroll_cq_resv; + } + + spin_lock(&hdev->cs_mirror_lock); + + /* Verify staged CS exists and add to the staged list */ + if (cs->staged_cs && !cs->staged_first) { + struct hl_cs *staged_cs; + + staged_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence); + if (!staged_cs) { + dev_err(hdev->dev, + "Cannot find staged submission sequence %llu", + cs->staged_sequence); + rc = -EINVAL; + goto unlock_cs_mirror; + } + + if (is_staged_cs_last_exists(hdev, staged_cs)) { + dev_err(hdev->dev, + "Staged submission sequence %llu already submitted", + cs->staged_sequence); + rc = -EINVAL; + goto unlock_cs_mirror; + } + + list_add_tail(&cs->staged_cs_node, &staged_cs->staged_cs_node); + + /* update stream map of the first CS */ + if (hdev->supports_wait_for_multi_cs) + staged_cs->fence->stream_master_qid_map |= + cs->fence->stream_master_qid_map; + } + + list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list); + + /* Queue TDR if the CS is the first entry and if timeout is wanted */ + first_entry = list_first_entry(&hdev->cs_mirror_list, + struct hl_cs, mirror_node) == cs; + if ((hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) && + first_entry && cs_needs_timeout(cs)) { + cs->tdr_active = true; + schedule_delayed_work(&cs->work_tdr, cs->timeout_jiffies); + + } + + spin_unlock(&hdev->cs_mirror_lock); + + list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node) + switch (job->queue_type) { + case QUEUE_TYPE_EXT: + ext_queue_schedule_job(job); + break; + case QUEUE_TYPE_INT: + int_queue_schedule_job(job); + break; + case QUEUE_TYPE_HW: + hw_queue_schedule_job(job); + break; + default: + break; + } + + cs->submitted = true; + + goto out; + +unlock_cs_mirror: + spin_unlock(&hdev->cs_mirror_lock); +unroll_cq_resv: + q = &hdev->kernel_queues[0]; + for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) { + if ((q->queue_type == QUEUE_TYPE_EXT) && + (cs->jobs_in_queue_cnt[i])) { + atomic_t *free_slots = + &hdev->completion_queue[i].free_slots_cnt; + atomic_add(cs->jobs_in_queue_cnt[i], free_slots); + cq_cnt--; + } + } + +out: + hdev->asic_funcs->hw_queues_unlock(hdev); + + return rc; +} + +/* + * hl_hw_queue_inc_ci_kernel - increment ci for kernel's queue + * + * @hdev: pointer to hl_device structure + * @hw_queue_id: which queue to increment its ci + */ +void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 
hw_queue_id) +{ + struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id]; + + atomic_inc(&q->ci); +} + +static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, + bool is_cpu_queue) +{ + void *p; + int rc; + + if (is_cpu_queue) + p = hl_cpu_accessible_dma_pool_alloc(hdev, HL_QUEUE_SIZE_IN_BYTES, &q->bus_address); + else + p = hl_asic_dma_alloc_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, &q->bus_address, + GFP_KERNEL | __GFP_ZERO); + if (!p) + return -ENOMEM; + + q->kernel_address = p; + + q->shadow_queue = kmalloc_array(HL_QUEUE_LENGTH, sizeof(struct hl_cs_job *), GFP_KERNEL); + if (!q->shadow_queue) { + dev_err(hdev->dev, + "Failed to allocate shadow queue for H/W queue %d\n", + q->hw_queue_id); + rc = -ENOMEM; + goto free_queue; + } + + /* Make sure read/write pointers are initialized to start of queue */ + atomic_set(&q->ci, 0); + q->pi = 0; + + return 0; + +free_queue: + if (is_cpu_queue) + hl_cpu_accessible_dma_pool_free(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address); + else + hl_asic_dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address, + q->bus_address); + + return rc; +} + +static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) +{ + void *p; + + p = hdev->asic_funcs->get_int_queue_base(hdev, q->hw_queue_id, + &q->bus_address, &q->int_queue_len); + if (!p) { + dev_err(hdev->dev, + "Failed to get base address for internal queue %d\n", + q->hw_queue_id); + return -EFAULT; + } + + q->kernel_address = p; + q->pi = 0; + atomic_set(&q->ci, 0); + + return 0; +} + +static int cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) +{ + return ext_and_cpu_queue_init(hdev, q, true); +} + +static int ext_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) +{ + return ext_and_cpu_queue_init(hdev, q, false); +} + +static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) +{ + void *p; + + p = hl_asic_dma_alloc_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, &q->bus_address, + GFP_KERNEL | __GFP_ZERO); + if (!p) + return -ENOMEM; + + q->kernel_address = p; + + /* Make sure read/write pointers are initialized to start of queue */ + atomic_set(&q->ci, 0); + q->pi = 0; + + return 0; +} + +static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx) +{ + struct hl_sync_stream_properties *sync_stream_prop; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_hw_sob *hw_sob; + int sob, reserved_mon_idx, queue_idx; + + sync_stream_prop = &hdev->kernel_queues[q_idx].sync_stream_prop; + + /* We use 'collective_mon_idx' as a running index in order to reserve + * monitors for collective master/slave queues. 
+ * collective master queue gets 2 reserved monitors + * collective slave queue gets 1 reserved monitor + */ + if (hdev->kernel_queues[q_idx].collective_mode == + HL_COLLECTIVE_MASTER) { + reserved_mon_idx = hdev->collective_mon_idx; + + /* reserve the first monitor for collective master queue */ + sync_stream_prop->collective_mstr_mon_id[0] = + prop->collective_first_mon + reserved_mon_idx; + + /* reserve the second monitor for collective master queue */ + sync_stream_prop->collective_mstr_mon_id[1] = + prop->collective_first_mon + reserved_mon_idx + 1; + + hdev->collective_mon_idx += HL_COLLECTIVE_RSVD_MSTR_MONS; + } else if (hdev->kernel_queues[q_idx].collective_mode == + HL_COLLECTIVE_SLAVE) { + reserved_mon_idx = hdev->collective_mon_idx++; + + /* reserve a monitor for collective slave queue */ + sync_stream_prop->collective_slave_mon_id = + prop->collective_first_mon + reserved_mon_idx; + } + + if (!hdev->kernel_queues[q_idx].supports_sync_stream) + return; + + queue_idx = hdev->sync_stream_queue_idx++; + + sync_stream_prop->base_sob_id = prop->sync_stream_first_sob + + (queue_idx * HL_RSVD_SOBS); + sync_stream_prop->base_mon_id = prop->sync_stream_first_mon + + (queue_idx * HL_RSVD_MONS); + sync_stream_prop->next_sob_val = 1; + sync_stream_prop->curr_sob_offset = 0; + + for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) { + hw_sob = &sync_stream_prop->hw_sob[sob]; + hw_sob->hdev = hdev; + hw_sob->sob_id = sync_stream_prop->base_sob_id + sob; + hw_sob->sob_addr = + hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id); + hw_sob->q_idx = q_idx; + kref_init(&hw_sob->kref); + } +} + +static void sync_stream_queue_reset(struct hl_device *hdev, u32 q_idx) +{ + struct hl_sync_stream_properties *prop = + &hdev->kernel_queues[q_idx].sync_stream_prop; + + /* + * In case we got here due to a stuck CS, the refcnt might be bigger + * than 1 and therefore we reset it. + */ + kref_init(&prop->hw_sob[prop->curr_sob_offset].kref); + prop->curr_sob_offset = 0; + prop->next_sob_val = 1; +} + +/* + * queue_init - main initialization function for H/W queue object + * + * @hdev: pointer to hl_device device structure + * @q: pointer to hl_hw_queue queue structure + * @hw_queue_id: The id of the H/W queue + * + * Allocate dma-able memory for the queue and initialize fields + * Returns 0 on success + */ +static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q, + u32 hw_queue_id) +{ + int rc; + + q->hw_queue_id = hw_queue_id; + + switch (q->queue_type) { + case QUEUE_TYPE_EXT: + rc = ext_queue_init(hdev, q); + break; + case QUEUE_TYPE_INT: + rc = int_queue_init(hdev, q); + break; + case QUEUE_TYPE_CPU: + rc = cpu_queue_init(hdev, q); + break; + case QUEUE_TYPE_HW: + rc = hw_queue_init(hdev, q); + break; + case QUEUE_TYPE_NA: + q->valid = 0; + return 0; + default: + dev_crit(hdev->dev, "wrong queue type %d during init\n", + q->queue_type); + rc = -EINVAL; + break; + } + + sync_stream_queue_init(hdev, q->hw_queue_id); + + if (rc) + return rc; + + q->valid = 1; + + return 0; +} + +/* + * hw_queue_fini - destroy queue + * + * @hdev: pointer to hl_device device structure + * @q: pointer to hl_hw_queue queue structure + * + * Free the queue memory + */ +static void queue_fini(struct hl_device *hdev, struct hl_hw_queue *q) +{ + if (!q->valid) + return; + + /* + * If we arrived here, there are no jobs waiting on this queue + * so we can safely remove it. + * This is because this function can only called when: + * 1. Either a context is deleted, which only can occur if all its + * jobs were finished + * 2. 
A context wasn't able to be created due to failure or timeout, + * which means there are no jobs on the queue yet + * + * The only exception are the queues of the kernel context, but + * if they are being destroyed, it means that the entire module is + * being removed. If the module is removed, it means there is no open + * user context. It also means that if a job was submitted by + * the kernel driver (e.g. context creation), the job itself was + * released by the kernel driver when a timeout occurred on its + * Completion. Thus, we don't need to release it again. + */ + + if (q->queue_type == QUEUE_TYPE_INT) + return; + + kfree(q->shadow_queue); + + if (q->queue_type == QUEUE_TYPE_CPU) + hl_cpu_accessible_dma_pool_free(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address); + else + hl_asic_dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address, + q->bus_address); +} + +int hl_hw_queues_create(struct hl_device *hdev) +{ + struct asic_fixed_properties *asic = &hdev->asic_prop; + struct hl_hw_queue *q; + int i, rc, q_ready_cnt; + + hdev->kernel_queues = kcalloc(asic->max_queues, + sizeof(*hdev->kernel_queues), GFP_KERNEL); + + if (!hdev->kernel_queues) { + dev_err(hdev->dev, "Not enough memory for H/W queues\n"); + return -ENOMEM; + } + + /* Initialize the H/W queues */ + for (i = 0, q_ready_cnt = 0, q = hdev->kernel_queues; + i < asic->max_queues ; i++, q_ready_cnt++, q++) { + + q->queue_type = asic->hw_queues_props[i].type; + q->supports_sync_stream = + asic->hw_queues_props[i].supports_sync_stream; + q->collective_mode = asic->hw_queues_props[i].collective_mode; + rc = queue_init(hdev, q, i); + if (rc) { + dev_err(hdev->dev, + "failed to initialize queue %d\n", i); + goto release_queues; + } + } + + return 0; + +release_queues: + for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++) + queue_fini(hdev, q); + + kfree(hdev->kernel_queues); + + return rc; +} + +void hl_hw_queues_destroy(struct hl_device *hdev) +{ + struct hl_hw_queue *q; + u32 max_queues = hdev->asic_prop.max_queues; + int i; + + for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++) + queue_fini(hdev, q); + + kfree(hdev->kernel_queues); +} + +void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset) +{ + struct hl_hw_queue *q; + u32 max_queues = hdev->asic_prop.max_queues; + int i; + + for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++) { + if ((!q->valid) || + ((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU))) + continue; + q->pi = 0; + atomic_set(&q->ci, 0); + + if (q->supports_sync_stream) + sync_stream_queue_reset(hdev, q->hw_queue_id); + } +} diff --git a/drivers/misc/habanalabs/common/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c new file mode 100644 index 000000000..55eb02038 --- /dev/null +++ b/drivers/misc/habanalabs/common/hwmon.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. 
+ */ + +#include "habanalabs.h" + +#include <linux/pci.h> +#include <linux/hwmon.h> + +#define HWMON_NR_SENSOR_TYPES (hwmon_max) + +#ifdef _HAS_HWMON_HWMON_T_ENABLE + +static u32 fixup_flags_legacy_fw(struct hl_device *hdev, enum hwmon_sensor_types type, + u32 cpucp_flags) +{ + u32 flags; + + switch (type) { + case hwmon_temp: + flags = (cpucp_flags << 1) | HWMON_T_ENABLE; + break; + + case hwmon_in: + flags = (cpucp_flags << 1) | HWMON_I_ENABLE; + break; + + case hwmon_curr: + flags = (cpucp_flags << 1) | HWMON_C_ENABLE; + break; + + case hwmon_fan: + flags = (cpucp_flags << 1) | HWMON_F_ENABLE; + break; + + case hwmon_power: + flags = (cpucp_flags << 1) | HWMON_P_ENABLE; + break; + + case hwmon_pwm: + /* enable bit was here from day 1, so no need to adjust */ + flags = cpucp_flags; + break; + + default: + dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type); + flags = cpucp_flags; + break; + } + + return flags; +} + +static u32 fixup_attr_legacy_fw(u32 attr) +{ + return (attr - 1); +} + +#else + +static u32 fixup_flags_legacy_fw(struct hl_device *hdev, enum hwmon_sensor_types type, + u32 cpucp_flags) +{ + return cpucp_flags; +} + +static u32 fixup_attr_legacy_fw(u32 attr) +{ + return attr; +} + +#endif /* !_HAS_HWMON_HWMON_T_ENABLE */ + +static u32 adjust_hwmon_flags(struct hl_device *hdev, enum hwmon_sensor_types type, u32 cpucp_flags) +{ + u32 flags, cpucp_input_val; + bool use_cpucp_enum; + + use_cpucp_enum = (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_MAP_HWMON_EN) ? true : false; + + /* If f/w is using it's own enum, we need to check if the properties values are aligned. + * If not, it means we need to adjust the values to the new format that is used in the + * kernel since 5.6 (enum values were incremented by 1 by adding a new enable value). 
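+	 *
+	 * For example (assuming the newer layout, where the *_enable
+	 * attribute is bit 0 and *_input is bit 1): legacy f/w that reports
+	 * a temperature input flag in bit 0 is fixed up by shifting the
+	 * flags left by one, so that bit becomes HWMON_T_INPUT, and OR-ing
+	 * in HWMON_T_ENABLE so the channel stays enabled.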
+ */ + if (use_cpucp_enum) { + switch (type) { + case hwmon_temp: + cpucp_input_val = cpucp_temp_input; + if (cpucp_input_val == hwmon_temp_input) + flags = cpucp_flags; + else + flags = (cpucp_flags << 1) | HWMON_T_ENABLE; + break; + + case hwmon_in: + cpucp_input_val = cpucp_in_input; + if (cpucp_input_val == hwmon_in_input) + flags = cpucp_flags; + else + flags = (cpucp_flags << 1) | HWMON_I_ENABLE; + break; + + case hwmon_curr: + cpucp_input_val = cpucp_curr_input; + if (cpucp_input_val == hwmon_curr_input) + flags = cpucp_flags; + else + flags = (cpucp_flags << 1) | HWMON_C_ENABLE; + break; + + case hwmon_fan: + cpucp_input_val = cpucp_fan_input; + if (cpucp_input_val == hwmon_fan_input) + flags = cpucp_flags; + else + flags = (cpucp_flags << 1) | HWMON_F_ENABLE; + break; + + case hwmon_pwm: + /* enable bit was here from day 1, so no need to adjust */ + flags = cpucp_flags; + break; + + case hwmon_power: + cpucp_input_val = CPUCP_POWER_INPUT; + if (cpucp_input_val == hwmon_power_input) + flags = cpucp_flags; + else + flags = (cpucp_flags << 1) | HWMON_P_ENABLE; + break; + + default: + dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type); + flags = cpucp_flags; + break; + } + } else { + flags = fixup_flags_legacy_fw(hdev, type, cpucp_flags); + } + + return flags; +} + +int hl_build_hwmon_channel_info(struct hl_device *hdev, struct cpucp_sensor *sensors_arr) +{ + u32 num_sensors_for_type, flags, num_active_sensor_types = 0, arr_size = 0, *curr_arr; + u32 sensors_by_type_next_index[HWMON_NR_SENSOR_TYPES] = {0}; + u32 *sensors_by_type[HWMON_NR_SENSOR_TYPES] = {NULL}; + struct hwmon_channel_info **channels_info; + u32 counts[HWMON_NR_SENSOR_TYPES] = {0}; + enum hwmon_sensor_types type; + int rc, i, j; + + for (i = 0 ; i < CPUCP_MAX_SENSORS ; i++) { + type = le32_to_cpu(sensors_arr[i].type); + + if ((type == 0) && (sensors_arr[i].flags == 0)) + break; + + if (type >= HWMON_NR_SENSOR_TYPES) { + dev_err(hdev->dev, "Got wrong sensor type %d from device\n", type); + return -EINVAL; + } + + counts[type]++; + arr_size++; + } + + for (i = 0 ; i < HWMON_NR_SENSOR_TYPES ; i++) { + if (counts[i] == 0) + continue; + + num_sensors_for_type = counts[i] + 1; + dev_dbg(hdev->dev, "num_sensors_for_type %d = %d\n", i, num_sensors_for_type); + + curr_arr = kcalloc(num_sensors_for_type, sizeof(*curr_arr), GFP_KERNEL); + if (!curr_arr) { + rc = -ENOMEM; + goto sensors_type_err; + } + + num_active_sensor_types++; + sensors_by_type[i] = curr_arr; + } + + for (i = 0 ; i < arr_size ; i++) { + type = le32_to_cpu(sensors_arr[i].type); + curr_arr = sensors_by_type[type]; + flags = adjust_hwmon_flags(hdev, type, le32_to_cpu(sensors_arr[i].flags)); + curr_arr[sensors_by_type_next_index[type]++] = flags; + } + + channels_info = kcalloc(num_active_sensor_types + 1, sizeof(struct hwmon_channel_info *), + GFP_KERNEL); + if (!channels_info) { + rc = -ENOMEM; + goto channels_info_array_err; + } + + for (i = 0 ; i < num_active_sensor_types ; i++) { + channels_info[i] = kzalloc(sizeof(*channels_info[i]), GFP_KERNEL); + if (!channels_info[i]) { + rc = -ENOMEM; + goto channel_info_err; + } + } + + for (i = 0, j = 0 ; i < HWMON_NR_SENSOR_TYPES ; i++) { + if (!sensors_by_type[i]) + continue; + + channels_info[j]->type = i; + channels_info[j]->config = sensors_by_type[i]; + j++; + } + + hdev->hl_chip_info->info = (const struct hwmon_channel_info **)channels_info; + + return 0; + +channel_info_err: + for (i = 0 ; i < num_active_sensor_types ; i++) { + if (channels_info[i]) { + kfree(channels_info[i]->config); + 
kfree(channels_info[i]); + } + } + kfree(channels_info); + +channels_info_array_err: +sensors_type_err: + for (i = 0 ; i < HWMON_NR_SENSOR_TYPES ; i++) + kfree(sensors_by_type[i]); + + return rc; +} + +static int hl_read(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long *val) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + bool use_cpucp_enum; + u32 cpucp_attr; + int rc; + + if (!hl_device_operational(hdev, NULL)) + return -ENODEV; + + use_cpucp_enum = (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_MAP_HWMON_EN) ? true : false; + + switch (type) { + case hwmon_temp: + switch (attr) { + case hwmon_temp_input: + cpucp_attr = cpucp_temp_input; + break; + case hwmon_temp_max: + cpucp_attr = cpucp_temp_max; + break; + case hwmon_temp_crit: + cpucp_attr = cpucp_temp_crit; + break; + case hwmon_temp_max_hyst: + cpucp_attr = cpucp_temp_max_hyst; + break; + case hwmon_temp_crit_hyst: + cpucp_attr = cpucp_temp_crit_hyst; + break; + case hwmon_temp_offset: + cpucp_attr = cpucp_temp_offset; + break; + case hwmon_temp_highest: + cpucp_attr = cpucp_temp_highest; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + rc = hl_get_temperature(hdev, channel, cpucp_attr, val); + else + rc = hl_get_temperature(hdev, channel, fixup_attr_legacy_fw(attr), val); + break; + case hwmon_in: + switch (attr) { + case hwmon_in_input: + cpucp_attr = cpucp_in_input; + break; + case hwmon_in_min: + cpucp_attr = cpucp_in_min; + break; + case hwmon_in_max: + cpucp_attr = cpucp_in_max; + break; + case hwmon_in_highest: + cpucp_attr = cpucp_in_highest; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + rc = hl_get_voltage(hdev, channel, cpucp_attr, val); + else + rc = hl_get_voltage(hdev, channel, fixup_attr_legacy_fw(attr), val); + break; + case hwmon_curr: + switch (attr) { + case hwmon_curr_input: + cpucp_attr = cpucp_curr_input; + break; + case hwmon_curr_min: + cpucp_attr = cpucp_curr_min; + break; + case hwmon_curr_max: + cpucp_attr = cpucp_curr_max; + break; + case hwmon_curr_highest: + cpucp_attr = cpucp_curr_highest; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + rc = hl_get_current(hdev, channel, cpucp_attr, val); + else + rc = hl_get_current(hdev, channel, fixup_attr_legacy_fw(attr), val); + break; + case hwmon_fan: + switch (attr) { + case hwmon_fan_input: + cpucp_attr = cpucp_fan_input; + break; + case hwmon_fan_min: + cpucp_attr = cpucp_fan_min; + break; + case hwmon_fan_max: + cpucp_attr = cpucp_fan_max; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + rc = hl_get_fan_speed(hdev, channel, cpucp_attr, val); + else + rc = hl_get_fan_speed(hdev, channel, fixup_attr_legacy_fw(attr), val); + break; + case hwmon_pwm: + switch (attr) { + case hwmon_pwm_input: + cpucp_attr = cpucp_pwm_input; + break; + case hwmon_pwm_enable: + cpucp_attr = cpucp_pwm_enable; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + rc = hl_get_pwm_info(hdev, channel, cpucp_attr, val); + else + /* no need for fixup as pwm was aligned from day 1 */ + rc = hl_get_pwm_info(hdev, channel, attr, val); + break; + case hwmon_power: + switch (attr) { + case hwmon_power_input: + cpucp_attr = CPUCP_POWER_INPUT; + break; + case hwmon_power_input_highest: + cpucp_attr = CPUCP_POWER_INPUT_HIGHEST; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + rc = hl_get_power(hdev, channel, cpucp_attr, val); + else + rc = hl_get_power(hdev, channel, fixup_attr_legacy_fw(attr), val); + break; + default: + return 
-EINVAL; + } + return rc; +} + +static int hl_write(struct device *dev, enum hwmon_sensor_types type, + u32 attr, int channel, long val) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + u32 cpucp_attr; + bool use_cpucp_enum = (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_MAP_HWMON_EN) ? true : false; + + if (!hl_device_operational(hdev, NULL)) + return -ENODEV; + + switch (type) { + case hwmon_temp: + switch (attr) { + case hwmon_temp_offset: + cpucp_attr = cpucp_temp_offset; + break; + case hwmon_temp_reset_history: + cpucp_attr = cpucp_temp_reset_history; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + hl_set_temperature(hdev, channel, cpucp_attr, val); + else + hl_set_temperature(hdev, channel, fixup_attr_legacy_fw(attr), val); + break; + case hwmon_pwm: + switch (attr) { + case hwmon_pwm_input: + cpucp_attr = cpucp_pwm_input; + break; + case hwmon_pwm_enable: + cpucp_attr = cpucp_pwm_enable; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + hl_set_pwm_info(hdev, channel, cpucp_attr, val); + else + /* no need for fixup as pwm was aligned from day 1 */ + hl_set_pwm_info(hdev, channel, attr, val); + break; + case hwmon_in: + switch (attr) { + case hwmon_in_reset_history: + cpucp_attr = cpucp_in_reset_history; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + hl_set_voltage(hdev, channel, cpucp_attr, val); + else + hl_set_voltage(hdev, channel, fixup_attr_legacy_fw(attr), val); + break; + case hwmon_curr: + switch (attr) { + case hwmon_curr_reset_history: + cpucp_attr = cpucp_curr_reset_history; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + hl_set_current(hdev, channel, cpucp_attr, val); + else + hl_set_current(hdev, channel, fixup_attr_legacy_fw(attr), val); + break; + case hwmon_power: + switch (attr) { + case hwmon_power_reset_history: + cpucp_attr = CPUCP_POWER_RESET_INPUT_HISTORY; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + hl_set_power(hdev, channel, cpucp_attr, val); + else + hl_set_power(hdev, channel, fixup_attr_legacy_fw(attr), val); + break; + default: + return -EINVAL; + } + return 0; +} + +static umode_t hl_is_visible(const void *data, enum hwmon_sensor_types type, + u32 attr, int channel) +{ + switch (type) { + case hwmon_temp: + switch (attr) { + case hwmon_temp_input: + case hwmon_temp_max: + case hwmon_temp_max_hyst: + case hwmon_temp_crit: + case hwmon_temp_crit_hyst: + case hwmon_temp_highest: + return 0444; + case hwmon_temp_offset: + return 0644; + case hwmon_temp_reset_history: + return 0200; + } + break; + case hwmon_in: + switch (attr) { + case hwmon_in_input: + case hwmon_in_min: + case hwmon_in_max: + case hwmon_in_highest: + return 0444; + case hwmon_in_reset_history: + return 0200; + } + break; + case hwmon_curr: + switch (attr) { + case hwmon_curr_input: + case hwmon_curr_min: + case hwmon_curr_max: + case hwmon_curr_highest: + return 0444; + case hwmon_curr_reset_history: + return 0200; + } + break; + case hwmon_fan: + switch (attr) { + case hwmon_fan_input: + case hwmon_fan_min: + case hwmon_fan_max: + return 0444; + } + break; + case hwmon_pwm: + switch (attr) { + case hwmon_pwm_input: + case hwmon_pwm_enable: + return 0644; + } + break; + case hwmon_power: + switch (attr) { + case hwmon_power_input: + case hwmon_power_input_highest: + return 0444; + case hwmon_power_reset_history: + return 0200; + } + break; + default: + break; + } + return 0; +} + +static const struct hwmon_ops hl_hwmon_ops = { + .is_visible = hl_is_visible, + .read 
= hl_read, + .write = hl_write +}; + +int hl_get_temperature(struct hl_device *hdev, + int sensor_index, u32 attr, long *value) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEMPERATURE_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + + dev_dbg(hdev->dev, "get temp, ctl 0x%x, sensor %d, type %d\n", + pkt.ctl, pkt.sensor_index, pkt.type); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, &result); + + *value = (long) result; + + if (rc) { + dev_err(hdev->dev, + "Failed to get temperature from sensor %d, error %d\n", + sensor_index, rc); + *value = 0; + } + + return rc; +} + +int hl_set_temperature(struct hl_device *hdev, + int sensor_index, u32 attr, long value) +{ + struct cpucp_packet pkt; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEMPERATURE_SET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + pkt.value = __cpu_to_le64(value); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, NULL); + + if (rc) + dev_err(hdev->dev, + "Failed to set temperature of sensor %d, error %d\n", + sensor_index, rc); + + return rc; +} + +int hl_get_voltage(struct hl_device *hdev, + int sensor_index, u32 attr, long *value) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_VOLTAGE_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, &result); + + *value = (long) result; + + if (rc) { + dev_err(hdev->dev, + "Failed to get voltage from sensor %d, error %d\n", + sensor_index, rc); + *value = 0; + } + + return rc; +} + +int hl_get_current(struct hl_device *hdev, + int sensor_index, u32 attr, long *value) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_CURRENT_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, &result); + + *value = (long) result; + + if (rc) { + dev_err(hdev->dev, + "Failed to get current from sensor %d, error %d\n", + sensor_index, rc); + *value = 0; + } + + return rc; +} + +int hl_get_fan_speed(struct hl_device *hdev, + int sensor_index, u32 attr, long *value) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_FAN_SPEED_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, &result); + + *value = (long) result; + + if (rc) { + dev_err(hdev->dev, + "Failed to get fan speed from sensor %d, error %d\n", + sensor_index, rc); + *value = 0; + } + + return rc; +} + +int hl_get_pwm_info(struct hl_device *hdev, + int sensor_index, u32 attr, long *value) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_PWM_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + + rc = 
hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, &result); + + *value = (long) result; + + if (rc) { + dev_err(hdev->dev, + "Failed to get pwm info from sensor %d, error %d\n", + sensor_index, rc); + *value = 0; + } + + return rc; +} + +void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr, + long value) +{ + struct cpucp_packet pkt; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_PWM_SET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + pkt.value = cpu_to_le64(value); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, NULL); + + if (rc) + dev_err(hdev->dev, + "Failed to set pwm info to sensor %d, error %d\n", + sensor_index, rc); +} + +int hl_set_voltage(struct hl_device *hdev, + int sensor_index, u32 attr, long value) +{ + struct cpucp_packet pkt; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_VOLTAGE_SET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + pkt.value = __cpu_to_le64(value); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, NULL); + + if (rc) + dev_err(hdev->dev, + "Failed to set voltage of sensor %d, error %d\n", + sensor_index, rc); + + return rc; +} + +int hl_set_current(struct hl_device *hdev, + int sensor_index, u32 attr, long value) +{ + struct cpucp_packet pkt; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_CURRENT_SET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + pkt.value = __cpu_to_le64(value); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, NULL); + + if (rc) + dev_err(hdev->dev, + "Failed to set current of sensor %d, error %d\n", + sensor_index, rc); + + return rc; +} + +int hl_set_power(struct hl_device *hdev, + int sensor_index, u32 attr, long value) +{ + struct cpucp_packet pkt; + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + if (prop->use_get_power_for_reset_history) + pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + else + pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_SET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + pkt.value = __cpu_to_le64(value); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, NULL); + + if (rc) + dev_err(hdev->dev, + "Failed to set power of sensor %d, error %d\n", + sensor_index, rc); + + return rc; +} + +int hl_get_power(struct hl_device *hdev, + int sensor_index, u32 attr, long *value) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, &result); + + *value = (long) result; + + if (rc) { + dev_err(hdev->dev, + "Failed to get power of sensor %d, error %d\n", + sensor_index, rc); + *value = 0; + } + + return rc; +} + +int hl_hwmon_init(struct hl_device *hdev) +{ + struct device *dev = hdev->pdev ? 
&hdev->pdev->dev : hdev->dev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + if ((hdev->hwmon_initialized) || !(hdev->cpu_queues_enable)) + return 0; + + if (hdev->hl_chip_info->info) { + hdev->hl_chip_info->ops = &hl_hwmon_ops; + + hdev->hwmon_dev = hwmon_device_register_with_info(dev, + prop->cpucp_info.card_name, hdev, + hdev->hl_chip_info, NULL); + if (IS_ERR(hdev->hwmon_dev)) { + rc = PTR_ERR(hdev->hwmon_dev); + dev_err(hdev->dev, + "Unable to register hwmon device: %d\n", rc); + return rc; + } + + dev_info(hdev->dev, "%s: add sensors information\n", + dev_name(hdev->hwmon_dev)); + + hdev->hwmon_initialized = true; + } else { + dev_info(hdev->dev, "no available sensors\n"); + } + + return 0; +} + +void hl_hwmon_fini(struct hl_device *hdev) +{ + if (!hdev->hwmon_initialized) + return; + + hwmon_device_unregister(hdev->hwmon_dev); +} + +void hl_hwmon_release_resources(struct hl_device *hdev) +{ + const struct hwmon_channel_info **channel_info_arr; + int i = 0; + + if (!hdev->hl_chip_info->info) + return; + + channel_info_arr = hdev->hl_chip_info->info; + + while (channel_info_arr[i]) { + kfree(channel_info_arr[i]->config); + kfree(channel_info_arr[i]); + i++; + } + + kfree(channel_info_arr); + + hdev->hl_chip_info->info = NULL; +} diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c new file mode 100644 index 000000000..94d537fd4 --- /dev/null +++ b/drivers/misc/habanalabs/common/irq.c @@ -0,0 +1,571 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2022 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +#include <linux/slab.h> + +/** + * struct hl_eqe_work - This structure is used to schedule work of EQ + * entry and cpucp_reset event + * + * @eq_work: workqueue object to run when EQ entry is received + * @hdev: pointer to device structure + * @eq_entry: copy of the EQ entry + */ +struct hl_eqe_work { + struct work_struct eq_work; + struct hl_device *hdev; + struct hl_eq_entry eq_entry; +}; + +/** + * hl_cq_inc_ptr - increment ci or pi of cq + * + * @ptr: the current ci or pi value of the completion queue + * + * Increment ptr by 1. If it reaches the number of completion queue + * entries, set it to 0 + */ +inline u32 hl_cq_inc_ptr(u32 ptr) +{ + ptr++; + if (unlikely(ptr == HL_CQ_LENGTH)) + ptr = 0; + return ptr; +} + +/** + * hl_eq_inc_ptr - increment ci of eq + * + * @ptr: the current ci value of the event queue + * + * Increment ptr by 1. 
If it reaches the number of event queue + * entries, set it to 0 + */ +static inline u32 hl_eq_inc_ptr(u32 ptr) +{ + ptr++; + if (unlikely(ptr == HL_EQ_LENGTH)) + ptr = 0; + return ptr; +} + +static void irq_handle_eqe(struct work_struct *work) +{ + struct hl_eqe_work *eqe_work = container_of(work, struct hl_eqe_work, + eq_work); + struct hl_device *hdev = eqe_work->hdev; + + hdev->asic_funcs->handle_eqe(hdev, &eqe_work->eq_entry); + + kfree(eqe_work); +} + +/** + * job_finish - queue job finish work + * + * @hdev: pointer to device structure + * @cs_seq: command submission sequence + * @cq: completion queue + * + */ +static void job_finish(struct hl_device *hdev, u32 cs_seq, struct hl_cq *cq) +{ + struct hl_hw_queue *queue; + struct hl_cs_job *job; + + queue = &hdev->kernel_queues[cq->hw_queue_id]; + job = queue->shadow_queue[hl_pi_2_offset(cs_seq)]; + queue_work(hdev->cq_wq[cq->cq_idx], &job->finish_work); + + atomic_inc(&queue->ci); +} + +/** + * cs_finish - queue all cs jobs finish work + * + * @hdev: pointer to device structure + * @cs_seq: command submission sequence + * + */ +static void cs_finish(struct hl_device *hdev, u16 cs_seq) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_hw_queue *queue; + struct hl_cs *cs; + struct hl_cs_job *job; + + cs = hdev->shadow_cs_queue[cs_seq & (prop->max_pending_cs - 1)]; + if (!cs) { + dev_warn(hdev->dev, + "No pointer to CS in shadow array at index %d\n", + cs_seq); + return; + } + + list_for_each_entry(job, &cs->job_list, cs_node) { + queue = &hdev->kernel_queues[job->hw_queue_id]; + atomic_inc(&queue->ci); + } + + queue_work(hdev->cs_cmplt_wq, &cs->finish_work); +} + +/** + * hl_irq_handler_cq - irq handler for completion queue + * + * @irq: irq number + * @arg: pointer to completion queue structure + * + */ +irqreturn_t hl_irq_handler_cq(int irq, void *arg) +{ + struct hl_cq *cq = arg; + struct hl_device *hdev = cq->hdev; + bool shadow_index_valid, entry_ready; + u16 shadow_index; + struct hl_cq_entry *cq_entry, *cq_base; + + if (hdev->disabled) { + dev_dbg(hdev->dev, + "Device disabled but received IRQ %d for CQ %d\n", + irq, cq->hw_queue_id); + return IRQ_HANDLED; + } + + cq_base = cq->kernel_address; + + while (1) { + cq_entry = (struct hl_cq_entry *) &cq_base[cq->ci]; + + entry_ready = !!FIELD_GET(CQ_ENTRY_READY_MASK, + le32_to_cpu(cq_entry->data)); + if (!entry_ready) + break; + + /* Make sure we read CQ entry contents after we've + * checked the ownership bit. + */ + dma_rmb(); + + shadow_index_valid = + !!FIELD_GET(CQ_ENTRY_SHADOW_INDEX_VALID_MASK, + le32_to_cpu(cq_entry->data)); + + shadow_index = FIELD_GET(CQ_ENTRY_SHADOW_INDEX_MASK, + le32_to_cpu(cq_entry->data)); + + /* + * CQ interrupt handler has 2 modes of operation: + * 1. Interrupt per CS completion: (Single CQ for all queues) + * CQ entry represents a completed CS + * + * 2. Interrupt per CS job completion in queue: (CQ per queue) + * CQ entry represents a completed job in a certain queue + */ + if (shadow_index_valid && !hdev->disabled) { + if (hdev->asic_prop.completion_mode == + HL_COMPLETION_MODE_CS) + cs_finish(hdev, shadow_index); + else + job_finish(hdev, shadow_index, cq); + } + + /* Clear CQ entry ready bit */ + cq_entry->data = cpu_to_le32(le32_to_cpu(cq_entry->data) & + ~CQ_ENTRY_READY_MASK); + + cq->ci = hl_cq_inc_ptr(cq->ci); + + /* Increment free slots */ + atomic_inc(&cq->free_slots_cnt); + } + + return IRQ_HANDLED; +} + +/* + * hl_ts_free_objects - handler of the free objects workqueue. 
+ * This function puts the refcounts that the registration nodes took on
+ * their objects.
+ * @work: workqueue object pointer
+ */
+static void hl_ts_free_objects(struct work_struct *work)
+{
+	struct timestamp_reg_work_obj *job =
+			container_of(work, struct timestamp_reg_work_obj, free_obj);
+	struct timestamp_reg_free_node *free_obj, *temp_free_obj;
+	struct list_head *free_list_head = job->free_obj_head;
+	struct hl_device *hdev = job->hdev;
+
+	list_for_each_entry_safe(free_obj, temp_free_obj, free_list_head, free_objects_node) {
+		dev_dbg(hdev->dev, "About to put refcount to buf (%p) cq_cb(%p)\n",
+					free_obj->buf,
+					free_obj->cq_cb);
+
+		hl_mmap_mem_buf_put(free_obj->buf);
+		hl_cb_put(free_obj->cq_cb);
+		kfree(free_obj);
+	}
+
+	kfree(free_list_head);
+	kfree(job);
+}
+
+/*
+ * This function is called with the wait_list_lock spinlock taken.
+ * It sets the timestamp and deletes the registration node from the wait list.
+ * Because we hold a spinlock here, we cannot put the refcounts of the objects
+ * directly: the release functions may run long logic (and might even sleep),
+ * which cannot be done in IRQ context. Instead, we fill a list with "put"
+ * job nodes and hand that list to a dedicated workqueue which does the
+ * actual put.
+ */
+static int handle_registration_node(struct hl_device *hdev, struct hl_user_pending_interrupt *pend,
+						struct list_head **free_list)
+{
+	struct timestamp_reg_free_node *free_node;
+	u64 timestamp;
+
+	if (!(*free_list)) {
+		/* Alloc/Init the timestamp registration free objects list */
+		*free_list = kmalloc(sizeof(struct list_head), GFP_ATOMIC);
+		if (!(*free_list))
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(*free_list);
+	}
+
+	free_node = kmalloc(sizeof(*free_node), GFP_ATOMIC);
+	if (!free_node)
+		return -ENOMEM;
+
+	timestamp = ktime_get_ns();
+
+	*pend->ts_reg_info.timestamp_kernel_addr = timestamp;
+
+	dev_dbg(hdev->dev, "Timestamp is set to ts cb address (%p), ts: 0x%llx\n",
+			pend->ts_reg_info.timestamp_kernel_addr,
+			*(u64 *)pend->ts_reg_info.timestamp_kernel_addr);
+
+	list_del(&pend->wait_list_node);
+
+	/* Mark kernel CB node as free */
+	pend->ts_reg_info.in_use = 0;
+
+	/* Putting the refcount for ts_buff and cq_cb objects will be handled
+	 * in workqueue context, just add job to free_list.
+	 */
+	free_node->buf = pend->ts_reg_info.buf;
+	free_node->cq_cb = pend->ts_reg_info.cq_cb;
+	list_add(&free_node->free_objects_node, *free_list);
+
+	return 0;
+}
+
+static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interrupt *intr)
+{
+	struct hl_user_pending_interrupt *pend, *temp_pend;
+	struct list_head *ts_reg_free_list_head = NULL;
+	struct timestamp_reg_work_obj *job;
+	bool reg_node_handle_fail = false;
+	ktime_t now = ktime_get();
+	int rc;
+
+	/* For registration nodes:
+	 * As part of handling the registration nodes we need to put refcounts
+	 * on some objects. The problem is that we cannot do that under a
+	 * spinlock, or in IRQ handler context at all, since the release
+	 * functions are long and might sleep, so that part is deferred to
+	 * workqueue context. To avoid having to handle a kmalloc failure,
+	 * which would force us to roll back and move nodes from the free list
+	 * back to the interrupt wait list, we always allocate the workqueue
+	 * job object up front.
+ */ + job = kmalloc(sizeof(*job), GFP_ATOMIC); + if (!job) + return; + + spin_lock(&intr->wait_list_lock); + list_for_each_entry_safe(pend, temp_pend, &intr->wait_list_head, wait_list_node) { + if ((pend->cq_kernel_addr && *(pend->cq_kernel_addr) >= pend->cq_target_value) || + !pend->cq_kernel_addr) { + if (pend->ts_reg_info.buf) { + if (!reg_node_handle_fail) { + rc = handle_registration_node(hdev, pend, + &ts_reg_free_list_head); + if (rc) + reg_node_handle_fail = true; + } + } else { + /* Handle wait target value node */ + pend->fence.timestamp = now; + complete_all(&pend->fence.completion); + } + } + } + spin_unlock(&intr->wait_list_lock); + + if (ts_reg_free_list_head) { + INIT_WORK(&job->free_obj, hl_ts_free_objects); + job->free_obj_head = ts_reg_free_list_head; + job->hdev = hdev; + queue_work(hdev->ts_free_obj_wq, &job->free_obj); + } else { + kfree(job); + } +} + +/** + * hl_irq_handler_user_interrupt - irq handler for user interrupts + * + * @irq: irq number + * @arg: pointer to user interrupt structure + * + */ +irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg) +{ + struct hl_user_interrupt *user_int = arg; + struct hl_device *hdev = user_int->hdev; + + if (user_int->is_decoder) + handle_user_interrupt(hdev, &hdev->common_decoder_interrupt); + else + handle_user_interrupt(hdev, &hdev->common_user_cq_interrupt); + + /* Handle user cq or decoder interrupts registered on this specific irq */ + handle_user_interrupt(hdev, user_int); + + return IRQ_HANDLED; +} + +/** + * hl_irq_handler_default - default irq handler + * + * @irq: irq number + * @arg: pointer to user interrupt structure + * + */ +irqreturn_t hl_irq_handler_default(int irq, void *arg) +{ + struct hl_user_interrupt *user_interrupt = arg; + struct hl_device *hdev = user_interrupt->hdev; + u32 interrupt_id = user_interrupt->interrupt_id; + + dev_err(hdev->dev, "got invalid user interrupt %u", interrupt_id); + + return IRQ_HANDLED; +} + +/** + * hl_irq_handler_eq - irq handler for event queue + * + * @irq: irq number + * @arg: pointer to event queue structure + * + */ +irqreturn_t hl_irq_handler_eq(int irq, void *arg) +{ + struct hl_eq *eq = arg; + struct hl_device *hdev = eq->hdev; + struct hl_eq_entry *eq_entry; + struct hl_eq_entry *eq_base; + struct hl_eqe_work *handle_eqe_work; + bool entry_ready; + u32 cur_eqe; + u16 cur_eqe_index; + + eq_base = eq->kernel_address; + + while (1) { + cur_eqe = le32_to_cpu(eq_base[eq->ci].hdr.ctl); + entry_ready = !!FIELD_GET(EQ_CTL_READY_MASK, cur_eqe); + + if (!entry_ready) + break; + + cur_eqe_index = FIELD_GET(EQ_CTL_INDEX_MASK, cur_eqe); + if ((hdev->event_queue.check_eqe_index) && + (((eq->prev_eqe_index + 1) & EQ_CTL_INDEX_MASK) + != cur_eqe_index)) { + dev_dbg(hdev->dev, + "EQE 0x%x in queue is ready but index does not match %d!=%d", + eq_base[eq->ci].hdr.ctl, + ((eq->prev_eqe_index + 1) & EQ_CTL_INDEX_MASK), + cur_eqe_index); + break; + } + + eq->prev_eqe_index++; + + eq_entry = &eq_base[eq->ci]; + + /* + * Make sure we read EQ entry contents after we've + * checked the ownership bit. 
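+ * The dma_rmb() below orders the entry payload reads against the ready-bit
+ * read above.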
+ */ + dma_rmb(); + + if (hdev->disabled && !hdev->reset_info.in_compute_reset) { + dev_warn(hdev->dev, "Device disabled but received an EQ event\n"); + goto skip_irq; + } + + handle_eqe_work = kmalloc(sizeof(*handle_eqe_work), GFP_ATOMIC); + if (handle_eqe_work) { + INIT_WORK(&handle_eqe_work->eq_work, irq_handle_eqe); + handle_eqe_work->hdev = hdev; + + memcpy(&handle_eqe_work->eq_entry, eq_entry, + sizeof(*eq_entry)); + + queue_work(hdev->eq_wq, &handle_eqe_work->eq_work); + } +skip_irq: + /* Clear EQ entry ready bit */ + eq_entry->hdr.ctl = + cpu_to_le32(le32_to_cpu(eq_entry->hdr.ctl) & + ~EQ_CTL_READY_MASK); + + eq->ci = hl_eq_inc_ptr(eq->ci); + + hdev->asic_funcs->update_eq_ci(hdev, eq->ci); + } + + return IRQ_HANDLED; +} + +/** + * hl_irq_handler_dec_abnrm - Decoder error interrupt handler + * @irq: IRQ number + * @arg: pointer to decoder structure. + */ +irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg) +{ + struct hl_dec *dec = arg; + + schedule_work(&dec->completion_abnrm_work); + + return IRQ_HANDLED; +} + +/** + * hl_cq_init - main initialization function for an cq object + * + * @hdev: pointer to device structure + * @q: pointer to cq structure + * @hw_queue_id: The H/W queue ID this completion queue belongs to + * HL_INVALID_QUEUE if cq is not attached to any specific queue + * + * Allocate dma-able memory for the completion queue and initialize fields + * Returns 0 on success + */ +int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id) +{ + void *p; + + p = hl_asic_dma_alloc_coherent(hdev, HL_CQ_SIZE_IN_BYTES, &q->bus_address, + GFP_KERNEL | __GFP_ZERO); + if (!p) + return -ENOMEM; + + q->hdev = hdev; + q->kernel_address = p; + q->hw_queue_id = hw_queue_id; + q->ci = 0; + q->pi = 0; + + atomic_set(&q->free_slots_cnt, HL_CQ_LENGTH); + + return 0; +} + +/** + * hl_cq_fini - destroy completion queue + * + * @hdev: pointer to device structure + * @q: pointer to cq structure + * + * Free the completion queue memory + */ +void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q) +{ + hl_asic_dma_free_coherent(hdev, HL_CQ_SIZE_IN_BYTES, q->kernel_address, q->bus_address); +} + +void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q) +{ + q->ci = 0; + q->pi = 0; + + atomic_set(&q->free_slots_cnt, HL_CQ_LENGTH); + + /* + * It's not enough to just reset the PI/CI because the H/W may have + * written valid completion entries before it was halted and therefore + * we need to clean the actual queues so we won't process old entries + * when the device is operational again + */ + + memset(q->kernel_address, 0, HL_CQ_SIZE_IN_BYTES); +} + +/** + * hl_eq_init - main initialization function for an event queue object + * + * @hdev: pointer to device structure + * @q: pointer to eq structure + * + * Allocate dma-able memory for the event queue and initialize fields + * Returns 0 on success + */ +int hl_eq_init(struct hl_device *hdev, struct hl_eq *q) +{ + void *p; + + p = hl_cpu_accessible_dma_pool_alloc(hdev, HL_EQ_SIZE_IN_BYTES, &q->bus_address); + if (!p) + return -ENOMEM; + + q->hdev = hdev; + q->kernel_address = p; + q->ci = 0; + q->prev_eqe_index = 0; + + return 0; +} + +/** + * hl_eq_fini - destroy event queue + * + * @hdev: pointer to device structure + * @q: pointer to eq structure + * + * Free the event queue memory + */ +void hl_eq_fini(struct hl_device *hdev, struct hl_eq *q) +{ + flush_workqueue(hdev->eq_wq); + + hl_cpu_accessible_dma_pool_free(hdev, HL_EQ_SIZE_IN_BYTES, q->kernel_address); +} + +void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q) +{ 
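+	/* Reset the consumer index and the expected EQE index before clearing stale entries */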
+ q->ci = 0; + q->prev_eqe_index = 0; + + /* + * It's not enough to just reset the PI/CI because the H/W may have + * written valid completion entries before it was halted and therefore + * we need to clean the actual queues so we won't process old entries + * when the device is operational again + */ + + memset(q->kernel_address, 0, HL_EQ_SIZE_IN_BYTES); +} diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c new file mode 100644 index 000000000..a49038da3 --- /dev/null +++ b/drivers/misc/habanalabs/common/memory.c @@ -0,0 +1,2932 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2022 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include <uapi/misc/habanalabs.h> +#include "habanalabs.h" +#include "../include/hw_ip/mmu/mmu_general.h" + +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/pci-p2pdma.h> + +MODULE_IMPORT_NS(DMA_BUF); + +#define HL_MMU_DEBUG 0 + +/* use small pages for supporting non-pow2 (32M/40M/48M) DRAM phys page sizes */ +#define DRAM_POOL_PAGE_SIZE SZ_8M + +static int allocate_timestamps_buffers(struct hl_fpriv *hpriv, + struct hl_mem_in *args, u64 *handle); + +static int set_alloc_page_size(struct hl_device *hdev, struct hl_mem_in *args, u32 *page_size) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 psize; + + /* + * for ASIC that supports setting the allocation page size by user we will address + * user's choice only if it is not 0 (as 0 means taking the default page size) + */ + if (prop->supports_user_set_page_size && args->alloc.page_size) { + psize = args->alloc.page_size; + + if (!is_power_of_2(psize)) { + dev_err(hdev->dev, "user page size (%#llx) is not power of 2\n", psize); + return -EINVAL; + } + } else { + psize = prop->device_mem_alloc_default_page_size; + } + + *page_size = psize; + + return 0; +} + +/* + * The va ranges in context object contain a list with the available chunks of + * device virtual memory. + * There is one range for host allocations and one for DRAM allocations. + * + * On initialization each range contains one chunk of all of its available + * virtual range which is a half of the total device virtual range. + * + * On each mapping of physical pages, a suitable virtual range chunk (with a + * minimum size) is selected from the list. If the chunk size equals the + * requested size, the chunk is returned. Otherwise, the chunk is split into + * two chunks - one to return as result and a remainder to stay in the list. + * + * On each Unmapping of a virtual address, the relevant virtual chunk is + * returned to the list. The chunk is added to the list and if its edges match + * the edges of the adjacent chunks (means a contiguous chunk can be created), + * the chunks are merged. + * + * On finish, the list is checked to have only one chunk of all the relevant + * virtual range (which is a half of the device total virtual range). + * If not (means not all mappings were unmapped), a warning is printed. + */ + +/* + * alloc_device_memory() - allocate device memory. + * @ctx: pointer to the context structure. + * @args: host parameters containing the requested size. + * @ret_handle: result handle. + * + * This function does the following: + * - Allocate the requested size rounded up to 'dram_page_size' pages. + * - Return unique handle for later map/unmap/free. 
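+ * - On any failure, roll back and free whatever pages were already allocated.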
+ */ +static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args, + u32 *ret_handle) +{ + struct hl_device *hdev = ctx->hdev; + struct hl_vm *vm = &hdev->vm; + struct hl_vm_phys_pg_pack *phys_pg_pack; + u64 paddr = 0, total_size, num_pgs, i; + u32 num_curr_pgs, page_size; + bool contiguous; + int handle, rc; + + num_curr_pgs = 0; + + rc = set_alloc_page_size(hdev, args, &page_size); + if (rc) + return rc; + + num_pgs = DIV_ROUND_UP_ULL(args->alloc.mem_size, page_size); + total_size = num_pgs * page_size; + + if (!total_size) { + dev_err(hdev->dev, "Cannot allocate 0 bytes\n"); + return -EINVAL; + } + + contiguous = args->flags & HL_MEM_CONTIGUOUS; + + if (contiguous) { + if (is_power_of_2(page_size)) + paddr = (uintptr_t) gen_pool_dma_alloc_align(vm->dram_pg_pool, + total_size, NULL, page_size); + else + paddr = gen_pool_alloc(vm->dram_pg_pool, total_size); + if (!paddr) { + dev_err(hdev->dev, + "Cannot allocate %llu contiguous pages with total size of %llu\n", + num_pgs, total_size); + return -ENOMEM; + } + } + + phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL); + if (!phys_pg_pack) { + rc = -ENOMEM; + goto pages_pack_err; + } + + phys_pg_pack->vm_type = VM_TYPE_PHYS_PACK; + phys_pg_pack->asid = ctx->asid; + phys_pg_pack->npages = num_pgs; + phys_pg_pack->page_size = page_size; + phys_pg_pack->total_size = total_size; + phys_pg_pack->flags = args->flags; + phys_pg_pack->contiguous = contiguous; + + phys_pg_pack->pages = kvmalloc_array(num_pgs, sizeof(u64), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) { + rc = -ENOMEM; + goto pages_arr_err; + } + + if (phys_pg_pack->contiguous) { + for (i = 0 ; i < num_pgs ; i++) + phys_pg_pack->pages[i] = paddr + i * page_size; + } else { + for (i = 0 ; i < num_pgs ; i++) { + if (is_power_of_2(page_size)) + phys_pg_pack->pages[i] = + (uintptr_t)gen_pool_dma_alloc_align(vm->dram_pg_pool, + page_size, NULL, + page_size); + else + phys_pg_pack->pages[i] = gen_pool_alloc(vm->dram_pg_pool, + page_size); + + if (!phys_pg_pack->pages[i]) { + dev_err(hdev->dev, + "Cannot allocate device memory (out of memory)\n"); + rc = -ENOMEM; + goto page_err; + } + + num_curr_pgs++; + } + } + + spin_lock(&vm->idr_lock); + handle = idr_alloc(&vm->phys_pg_pack_handles, phys_pg_pack, 1, 0, + GFP_ATOMIC); + spin_unlock(&vm->idr_lock); + + if (handle < 0) { + dev_err(hdev->dev, "Failed to get handle for page\n"); + rc = -EFAULT; + goto idr_err; + } + + for (i = 0 ; i < num_pgs ; i++) + kref_get(&vm->dram_pg_pool_refcount); + + phys_pg_pack->handle = handle; + + atomic64_add(phys_pg_pack->total_size, &ctx->dram_phys_mem); + atomic64_add(phys_pg_pack->total_size, &hdev->dram_used_mem); + + *ret_handle = handle; + + return 0; + +idr_err: +page_err: + if (!phys_pg_pack->contiguous) + for (i = 0 ; i < num_curr_pgs ; i++) + gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[i], + page_size); + + kvfree(phys_pg_pack->pages); +pages_arr_err: + kfree(phys_pg_pack); +pages_pack_err: + if (contiguous) + gen_pool_free(vm->dram_pg_pool, paddr, total_size); + + return rc; +} + +/** + * dma_map_host_va() - DMA mapping of the given host virtual address. + * @hdev: habanalabs device structure. + * @addr: the host virtual address of the memory area. + * @size: the size of the memory area. + * @p_userptr: pointer to result userptr structure. + * + * This function does the following: + * - Allocate userptr structure. + * - Pin the given host memory using the userptr structure. + * - Perform DMA mapping to have the DMA addresses of the pages. 
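+ * - On failure, undo the pinning (if already done) and free the userptr
+ *   structure.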
+ */ +static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size, + struct hl_userptr **p_userptr) +{ + struct hl_userptr *userptr; + int rc; + + userptr = kzalloc(sizeof(*userptr), GFP_KERNEL); + if (!userptr) { + rc = -ENOMEM; + goto userptr_err; + } + + rc = hl_pin_host_memory(hdev, addr, size, userptr); + if (rc) { + dev_err(hdev->dev, "Failed to pin host memory\n"); + goto pin_err; + } + + userptr->dma_mapped = true; + userptr->dir = DMA_BIDIRECTIONAL; + userptr->vm_type = VM_TYPE_USERPTR; + + *p_userptr = userptr; + + rc = hdev->asic_funcs->asic_dma_map_sgtable(hdev, userptr->sgt, DMA_BIDIRECTIONAL); + if (rc) { + dev_err(hdev->dev, "failed to map sgt with DMA region\n"); + goto dma_map_err; + } + + return 0; + +dma_map_err: + hl_unpin_host_memory(hdev, userptr); +pin_err: + kfree(userptr); +userptr_err: + + return rc; +} + +/** + * dma_unmap_host_va() - DMA unmapping of the given host virtual address. + * @hdev: habanalabs device structure. + * @userptr: userptr to free. + * + * This function does the following: + * - Unpins the physical pages. + * - Frees the userptr structure. + */ +static void dma_unmap_host_va(struct hl_device *hdev, + struct hl_userptr *userptr) +{ + hl_unpin_host_memory(hdev, userptr); + kfree(userptr); +} + +/** + * dram_pg_pool_do_release() - free DRAM pages pool + * @ref: pointer to reference object. + * + * This function does the following: + * - Frees the idr structure of physical pages handles. + * - Frees the generic pool of DRAM physical pages. + */ +static void dram_pg_pool_do_release(struct kref *ref) +{ + struct hl_vm *vm = container_of(ref, struct hl_vm, + dram_pg_pool_refcount); + + /* + * free the idr here as only here we know for sure that there are no + * allocated physical pages and hence there are no handles in use + */ + idr_destroy(&vm->phys_pg_pack_handles); + gen_pool_destroy(vm->dram_pg_pool); +} + +/** + * free_phys_pg_pack() - free physical page pack. + * @hdev: habanalabs device structure. + * @phys_pg_pack: physical page pack to free. + * + * This function does the following: + * - For DRAM memory only + * - iterate over the pack, free each physical block structure by + * returning it to the general pool. + * - Free the hl_vm_phys_pg_pack structure. + */ +static void free_phys_pg_pack(struct hl_device *hdev, + struct hl_vm_phys_pg_pack *phys_pg_pack) +{ + struct hl_vm *vm = &hdev->vm; + u64 i; + + if (phys_pg_pack->created_from_userptr) + goto end; + + if (phys_pg_pack->contiguous) { + gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[0], + phys_pg_pack->total_size); + + for (i = 0; i < phys_pg_pack->npages ; i++) + kref_put(&vm->dram_pg_pool_refcount, + dram_pg_pool_do_release); + } else { + for (i = 0 ; i < phys_pg_pack->npages ; i++) { + gen_pool_free(vm->dram_pg_pool, + phys_pg_pack->pages[i], + phys_pg_pack->page_size); + kref_put(&vm->dram_pg_pool_refcount, + dram_pg_pool_do_release); + } + } + +end: + kvfree(phys_pg_pack->pages); + kfree(phys_pg_pack); + + return; +} + +/** + * free_device_memory() - free device memory. + * @ctx: pointer to the context structure. + * @args: host parameters containing the requested size. + * + * This function does the following: + * - Free the device memory related to the given handle. 
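+ * - Fail if the handle is still mapped or is currently being exported.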
+ */ +static int free_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args) +{ + struct hl_device *hdev = ctx->hdev; + struct hl_vm *vm = &hdev->vm; + struct hl_vm_phys_pg_pack *phys_pg_pack; + u32 handle = args->free.handle; + + spin_lock(&vm->idr_lock); + phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle); + if (!phys_pg_pack) { + spin_unlock(&vm->idr_lock); + dev_err(hdev->dev, "free device memory failed, no match for handle %u\n", handle); + return -EINVAL; + } + + if (atomic_read(&phys_pg_pack->mapping_cnt) > 0) { + spin_unlock(&vm->idr_lock); + dev_err(hdev->dev, "handle %u is mapped, cannot free\n", handle); + return -EINVAL; + } + + if (phys_pg_pack->exporting_cnt) { + spin_unlock(&vm->idr_lock); + dev_dbg(hdev->dev, "handle %u is exported, cannot free\n", handle); + return -EINVAL; + } + + /* must remove from idr before the freeing of the physical pages as the refcount of the pool + * is also the trigger of the idr destroy + */ + idr_remove(&vm->phys_pg_pack_handles, handle); + spin_unlock(&vm->idr_lock); + + atomic64_sub(phys_pg_pack->total_size, &ctx->dram_phys_mem); + atomic64_sub(phys_pg_pack->total_size, &hdev->dram_used_mem); + + free_phys_pg_pack(hdev, phys_pg_pack); + + return 0; +} + +/** + * clear_va_list_locked() - free virtual addresses list. + * @hdev: habanalabs device structure. + * @va_list: list of virtual addresses to free. + * + * This function does the following: + * - Iterate over the list and free each virtual addresses block. + * + * This function should be called only when va_list lock is taken. + */ +static void clear_va_list_locked(struct hl_device *hdev, + struct list_head *va_list) +{ + struct hl_vm_va_block *va_block, *tmp; + + list_for_each_entry_safe(va_block, tmp, va_list, node) { + list_del(&va_block->node); + kfree(va_block); + } +} + +/** + * print_va_list_locked() - print virtual addresses list. + * @hdev: habanalabs device structure. + * @va_list: list of virtual addresses to print. + * + * This function does the following: + * - Iterate over the list and print each virtual addresses block. + * + * This function should be called only when va_list lock is taken. + */ +static void print_va_list_locked(struct hl_device *hdev, + struct list_head *va_list) +{ +#if HL_MMU_DEBUG + struct hl_vm_va_block *va_block; + + dev_dbg(hdev->dev, "print va list:\n"); + + list_for_each_entry(va_block, va_list, node) + dev_dbg(hdev->dev, + "va block, start: 0x%llx, end: 0x%llx, size: %llu\n", + va_block->start, va_block->end, va_block->size); +#endif +} + +/** + * merge_va_blocks_locked() - merge a virtual block if possible. + * @hdev: pointer to the habanalabs device structure. + * @va_list: pointer to the virtual addresses block list. + * @va_block: virtual block to merge with adjacent blocks. + * + * This function does the following: + * - Merge the given blocks with the adjacent blocks if their virtual ranges + * create a contiguous virtual range. + * + * This Function should be called only when va_list lock is taken. 
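+ *
+ * Only the immediate previous and next blocks are checked, as the list is
+ * kept sorted by address.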
+ */ +static void merge_va_blocks_locked(struct hl_device *hdev, + struct list_head *va_list, struct hl_vm_va_block *va_block) +{ + struct hl_vm_va_block *prev, *next; + + prev = list_prev_entry(va_block, node); + if (&prev->node != va_list && prev->end + 1 == va_block->start) { + prev->end = va_block->end; + prev->size = prev->end - prev->start + 1; + list_del(&va_block->node); + kfree(va_block); + va_block = prev; + } + + next = list_next_entry(va_block, node); + if (&next->node != va_list && va_block->end + 1 == next->start) { + next->start = va_block->start; + next->size = next->end - next->start + 1; + list_del(&va_block->node); + kfree(va_block); + } +} + +/** + * add_va_block_locked() - add a virtual block to the virtual addresses list. + * @hdev: pointer to the habanalabs device structure. + * @va_list: pointer to the virtual addresses block list. + * @start: start virtual address. + * @end: end virtual address. + * + * This function does the following: + * - Add the given block to the virtual blocks list and merge with other blocks + * if a contiguous virtual block can be created. + * + * This Function should be called only when va_list lock is taken. + */ +static int add_va_block_locked(struct hl_device *hdev, + struct list_head *va_list, u64 start, u64 end) +{ + struct hl_vm_va_block *va_block, *res = NULL; + u64 size = end - start + 1; + + print_va_list_locked(hdev, va_list); + + list_for_each_entry(va_block, va_list, node) { + /* TODO: remove upon matureness */ + if (hl_mem_area_crosses_range(start, size, va_block->start, + va_block->end)) { + dev_err(hdev->dev, + "block crossing ranges at start 0x%llx, end 0x%llx\n", + va_block->start, va_block->end); + return -EINVAL; + } + + if (va_block->end < start) + res = va_block; + } + + va_block = kmalloc(sizeof(*va_block), GFP_KERNEL); + if (!va_block) + return -ENOMEM; + + va_block->start = start; + va_block->end = end; + va_block->size = size; + + if (!res) + list_add(&va_block->node, va_list); + else + list_add(&va_block->node, &res->node); + + merge_va_blocks_locked(hdev, va_list, va_block); + + print_va_list_locked(hdev, va_list); + + return 0; +} + +/** + * add_va_block() - wrapper for add_va_block_locked. + * @hdev: pointer to the habanalabs device structure. + * @va_range: pointer to the virtual addresses range object. + * @start: start virtual address. + * @end: end virtual address. + * + * This function does the following: + * - Takes the list lock and calls add_va_block_locked. + */ +static inline int add_va_block(struct hl_device *hdev, + struct hl_va_range *va_range, u64 start, u64 end) +{ + int rc; + + mutex_lock(&va_range->lock); + rc = add_va_block_locked(hdev, &va_range->list, start, end); + mutex_unlock(&va_range->lock); + + return rc; +} + +/** + * is_hint_crossing_range() - check if hint address crossing specified reserved. + * @range_type: virtual space range type. + * @start_addr: start virtual address. + * @size: block size. + * @prop: asic properties structure to retrieve reserved ranges from. 
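+ *
+ * Return: true if the area crosses the reserved hint range of the given
+ * range type, false otherwise.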
+ */ +static inline bool is_hint_crossing_range(enum hl_va_range_type range_type, + u64 start_addr, u32 size, struct asic_fixed_properties *prop) { + bool range_cross; + + if (range_type == HL_VA_RANGE_TYPE_DRAM) + range_cross = + hl_mem_area_crosses_range(start_addr, size, + prop->hints_dram_reserved_va_range.start_addr, + prop->hints_dram_reserved_va_range.end_addr); + else if (range_type == HL_VA_RANGE_TYPE_HOST) + range_cross = + hl_mem_area_crosses_range(start_addr, size, + prop->hints_host_reserved_va_range.start_addr, + prop->hints_host_reserved_va_range.end_addr); + else + range_cross = + hl_mem_area_crosses_range(start_addr, size, + prop->hints_host_hpage_reserved_va_range.start_addr, + prop->hints_host_hpage_reserved_va_range.end_addr); + + return range_cross; +} + +/** + * get_va_block() - get a virtual block for the given size and alignment. + * + * @hdev: pointer to the habanalabs device structure. + * @va_range: pointer to the virtual addresses range. + * @size: requested block size. + * @hint_addr: hint for requested address by the user. + * @va_block_align: required alignment of the virtual block start address. + * @range_type: va range type (host, dram) + * @flags: additional memory flags, currently only uses HL_MEM_FORCE_HINT + * + * This function does the following: + * - Iterate on the virtual block list to find a suitable virtual block for the + * given size, hint address and alignment. + * - Reserve the requested block and update the list. + * - Return the start address of the virtual block. + */ +static u64 get_va_block(struct hl_device *hdev, + struct hl_va_range *va_range, + u64 size, u64 hint_addr, u32 va_block_align, + enum hl_va_range_type range_type, + u32 flags) +{ + struct hl_vm_va_block *va_block, *new_va_block = NULL; + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 tmp_hint_addr, valid_start, valid_size, prev_start, prev_end, + align_mask, reserved_valid_start = 0, reserved_valid_size = 0, + dram_hint_mask = prop->dram_hints_align_mask; + bool add_prev = false; + bool is_align_pow_2 = is_power_of_2(va_range->page_size); + bool is_hint_dram_addr = hl_is_dram_va(hdev, hint_addr); + bool force_hint = flags & HL_MEM_FORCE_HINT; + + if (is_align_pow_2) + align_mask = ~((u64)va_block_align - 1); + else + /* + * with non-power-of-2 range we work only with page granularity + * and the start address is page aligned, + * so no need for alignment checking. 
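+ * (the requested size is simply rounded up to a whole number of pages
+ * below)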
+ */ + size = DIV_ROUND_UP_ULL(size, va_range->page_size) * + va_range->page_size; + + tmp_hint_addr = hint_addr & ~dram_hint_mask; + + /* Check if we need to ignore hint address */ + if ((is_align_pow_2 && (hint_addr & (va_block_align - 1))) || + (!is_align_pow_2 && is_hint_dram_addr && + do_div(tmp_hint_addr, va_range->page_size))) { + + if (force_hint) { + /* Hint must be respected, so here we just fail */ + dev_err(hdev->dev, + "Hint address 0x%llx is not page aligned - cannot be respected\n", + hint_addr); + return 0; + } + + dev_dbg(hdev->dev, + "Hint address 0x%llx will be ignored because it is not aligned\n", + hint_addr); + hint_addr = 0; + } + + mutex_lock(&va_range->lock); + + print_va_list_locked(hdev, &va_range->list); + + list_for_each_entry(va_block, &va_range->list, node) { + /* Calc the first possible aligned addr */ + valid_start = va_block->start; + + if (is_align_pow_2 && (valid_start & (va_block_align - 1))) { + valid_start &= align_mask; + valid_start += va_block_align; + if (valid_start > va_block->end) + continue; + } + + valid_size = va_block->end - valid_start + 1; + if (valid_size < size) + continue; + + /* + * In case hint address is 0, and hints_range_reservation + * property enabled, then avoid allocating va blocks from the + * range reserved for hint addresses + */ + if (prop->hints_range_reservation && !hint_addr) + if (is_hint_crossing_range(range_type, valid_start, + size, prop)) + continue; + + /* Pick the minimal length block which has the required size */ + if (!new_va_block || (valid_size < reserved_valid_size)) { + new_va_block = va_block; + reserved_valid_start = valid_start; + reserved_valid_size = valid_size; + } + + if (hint_addr && hint_addr >= valid_start && + (hint_addr + size) <= va_block->end) { + new_va_block = va_block; + reserved_valid_start = hint_addr; + reserved_valid_size = valid_size; + break; + } + } + + if (!new_va_block) { + dev_err(hdev->dev, "no available va block for size %llu\n", + size); + goto out; + } + + if (force_hint && reserved_valid_start != hint_addr) { + /* Hint address must be respected. If we are here - this means + * we could not respect it. + */ + dev_err(hdev->dev, + "Hint address 0x%llx could not be respected\n", + hint_addr); + reserved_valid_start = 0; + goto out; + } + + /* + * Check if there is some leftover range due to reserving the new + * va block, then return it to the main virtual addresses list. + */ + if (reserved_valid_start > new_va_block->start) { + prev_start = new_va_block->start; + prev_end = reserved_valid_start - 1; + + new_va_block->start = reserved_valid_start; + new_va_block->size = reserved_valid_size; + + add_prev = true; + } + + if (new_va_block->size > size) { + new_va_block->start += size; + new_va_block->size = new_va_block->end - new_va_block->start + 1; + } else { + list_del(&new_va_block->node); + kfree(new_va_block); + } + + if (add_prev) + add_va_block_locked(hdev, &va_range->list, prev_start, + prev_end); + + print_va_list_locked(hdev, &va_range->list); +out: + mutex_unlock(&va_range->lock); + + return reserved_valid_start; +} + +/* + * hl_reserve_va_block() - reserve a virtual block of a given size. + * @hdev: pointer to the habanalabs device structure. + * @ctx: current context + * @type: virtual addresses range type. + * @size: requested block size. + * @alignment: required alignment in bytes of the virtual block start address, + * 0 means no alignment. 
+ * + * This function does the following: + * - Iterate on the virtual block list to find a suitable virtual block for the + * given size and alignment. + * - Reserve the requested block and update the list. + * - Return the start address of the virtual block. + */ +u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx, + enum hl_va_range_type type, u64 size, u32 alignment) +{ + return get_va_block(hdev, ctx->va_range[type], size, 0, + max(alignment, ctx->va_range[type]->page_size), + type, 0); +} + +/** + * hl_get_va_range_type() - get va_range type for the given address and size. + * @ctx: context to fetch va_range from. + * @address: the start address of the area we want to validate. + * @size: the size in bytes of the area we want to validate. + * @type: returned va_range type. + * + * Return: true if the area is inside a valid range, false otherwise. + */ +static int hl_get_va_range_type(struct hl_ctx *ctx, u64 address, u64 size, + enum hl_va_range_type *type) +{ + int i; + + for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX; i++) { + if (hl_mem_area_inside_range(address, size, + ctx->va_range[i]->start_addr, + ctx->va_range[i]->end_addr)) { + *type = i; + return 0; + } + } + + return -EINVAL; +} + +/** + * hl_unreserve_va_block() - wrapper for add_va_block to unreserve a va block. + * @hdev: pointer to the habanalabs device structure + * @ctx: pointer to the context structure. + * @start_addr: start virtual address. + * @size: number of bytes to unreserve. + * + * This function does the following: + * - Takes the list lock and calls add_va_block_locked. + */ +int hl_unreserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx, + u64 start_addr, u64 size) +{ + enum hl_va_range_type type; + int rc; + + rc = hl_get_va_range_type(ctx, start_addr, size, &type); + if (rc) { + dev_err(hdev->dev, + "cannot find va_range for va %#llx size %llu", + start_addr, size); + return rc; + } + + rc = add_va_block(hdev, ctx->va_range[type], start_addr, + start_addr + size - 1); + if (rc) + dev_warn(hdev->dev, + "add va block failed for vaddr: 0x%llx\n", start_addr); + + return rc; +} + +/** + * init_phys_pg_pack_from_userptr() - initialize physical page pack from host + * memory + * @ctx: pointer to the context structure. + * @userptr: userptr to initialize from. + * @pphys_pg_pack: result pointer. + * @force_regular_page: tell the function to ignore huge page optimization, + * even if possible. Needed for cases where the device VA + * is allocated before we know the composition of the + * physical pages + * + * This function does the following: + * - Pin the physical pages related to the given virtual block. + * - Create a physical page pack from the physical pages related to the given + * virtual block. 
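+ * - Use the huge page size when every DMA chunk is aligned to it in both
+ *   address and size, unless @force_regular_page is set.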
+ */ +static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx, + struct hl_userptr *userptr, + struct hl_vm_phys_pg_pack **pphys_pg_pack, + bool force_regular_page) +{ + u32 npages, page_size = PAGE_SIZE, + huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size; + u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size); + struct hl_vm_phys_pg_pack *phys_pg_pack; + bool first = true, is_huge_page_opt; + u64 page_mask, total_npages; + struct scatterlist *sg; + dma_addr_t dma_addr; + int rc, i, j; + + phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL); + if (!phys_pg_pack) + return -ENOMEM; + + phys_pg_pack->vm_type = userptr->vm_type; + phys_pg_pack->created_from_userptr = true; + phys_pg_pack->asid = ctx->asid; + atomic_set(&phys_pg_pack->mapping_cnt, 1); + + is_huge_page_opt = (force_regular_page ? false : true); + + /* Only if all dma_addrs are aligned to 2MB and their + * sizes is at least 2MB, we can use huge page mapping. + * We limit the 2MB optimization to this condition, + * since later on we acquire the related VA range as one + * consecutive block. + */ + total_npages = 0; + for_each_sgtable_dma_sg(userptr->sgt, sg, i) { + npages = hl_get_sg_info(sg, &dma_addr); + + total_npages += npages; + + if ((npages % pgs_in_huge_page) || + (dma_addr & (huge_page_size - 1))) + is_huge_page_opt = false; + } + + if (is_huge_page_opt) { + page_size = huge_page_size; + do_div(total_npages, pgs_in_huge_page); + } + + page_mask = ~(((u64) page_size) - 1); + + phys_pg_pack->pages = kvmalloc_array(total_npages, sizeof(u64), + GFP_KERNEL); + if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) { + rc = -ENOMEM; + goto page_pack_arr_mem_err; + } + + phys_pg_pack->npages = total_npages; + phys_pg_pack->page_size = page_size; + phys_pg_pack->total_size = total_npages * page_size; + + j = 0; + for_each_sgtable_dma_sg(userptr->sgt, sg, i) { + npages = hl_get_sg_info(sg, &dma_addr); + + /* align down to physical page size and save the offset */ + if (first) { + first = false; + phys_pg_pack->offset = dma_addr & (page_size - 1); + dma_addr &= page_mask; + } + + while (npages) { + phys_pg_pack->pages[j++] = dma_addr; + dma_addr += page_size; + + if (is_huge_page_opt) + npages -= pgs_in_huge_page; + else + npages--; + } + } + + *pphys_pg_pack = phys_pg_pack; + + return 0; + +page_pack_arr_mem_err: + kfree(phys_pg_pack); + + return rc; +} + +/** + * map_phys_pg_pack() - maps the physical page pack.. + * @ctx: pointer to the context structure. + * @vaddr: start address of the virtual area to map from. + * @phys_pg_pack: the pack of physical pages to map to. + * + * This function does the following: + * - Maps each chunk of virtual memory to matching physical chunk. + * - Stores number of successful mappings in the given argument. + * - Returns 0 on success, error code otherwise. 
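+ * - On failure, unmap any pages that were already mapped.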
+ */ +static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr, + struct hl_vm_phys_pg_pack *phys_pg_pack) +{ + struct hl_device *hdev = ctx->hdev; + u64 next_vaddr = vaddr, paddr, mapped_pg_cnt = 0, i; + u32 page_size = phys_pg_pack->page_size; + int rc = 0; + bool is_host_addr; + + for (i = 0 ; i < phys_pg_pack->npages ; i++) { + paddr = phys_pg_pack->pages[i]; + + rc = hl_mmu_map_page(ctx, next_vaddr, paddr, page_size, + (i + 1) == phys_pg_pack->npages); + if (rc) { + dev_err(hdev->dev, + "map failed for handle %u, npages: %llu, mapped: %llu", + phys_pg_pack->handle, phys_pg_pack->npages, + mapped_pg_cnt); + goto err; + } + + mapped_pg_cnt++; + next_vaddr += page_size; + } + + return 0; + +err: + is_host_addr = !hl_is_dram_va(hdev, vaddr); + + next_vaddr = vaddr; + for (i = 0 ; i < mapped_pg_cnt ; i++) { + if (hl_mmu_unmap_page(ctx, next_vaddr, page_size, + (i + 1) == mapped_pg_cnt)) + dev_warn_ratelimited(hdev->dev, + "failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n", + phys_pg_pack->handle, next_vaddr, + phys_pg_pack->pages[i], page_size); + + next_vaddr += page_size; + + /* + * unmapping on Palladium can be really long, so avoid a CPU + * soft lockup bug by sleeping a little between unmapping pages + * + * In addition, on host num of pages could be huge, + * because page size could be 4KB, so when unmapping host + * pages sleep every 32K pages to avoid soft lockup + */ + if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0)) + usleep_range(50, 200); + } + + return rc; +} + +/** + * unmap_phys_pg_pack() - unmaps the physical page pack. + * @ctx: pointer to the context structure. + * @vaddr: start address of the virtual area to unmap. + * @phys_pg_pack: the pack of physical pages to unmap. + */ +static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr, + struct hl_vm_phys_pg_pack *phys_pg_pack) +{ + struct hl_device *hdev = ctx->hdev; + u64 next_vaddr, i; + bool is_host_addr; + u32 page_size; + + is_host_addr = !hl_is_dram_va(hdev, vaddr); + page_size = phys_pg_pack->page_size; + next_vaddr = vaddr; + + for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) { + if (hl_mmu_unmap_page(ctx, next_vaddr, page_size, + (i + 1) == phys_pg_pack->npages)) + dev_warn_ratelimited(hdev->dev, + "unmap failed for vaddr: 0x%llx\n", next_vaddr); + + /* + * unmapping on Palladium can be really long, so avoid a CPU + * soft lockup bug by sleeping a little between unmapping pages + * + * In addition, on host num of pages could be huge, + * because page size could be 4KB, so when unmapping host + * pages sleep every 32K pages to avoid soft lockup + */ + if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0)) + usleep_range(50, 200); + } +} + +static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args, + u64 *paddr) +{ + struct hl_device *hdev = ctx->hdev; + struct hl_vm *vm = &hdev->vm; + struct hl_vm_phys_pg_pack *phys_pg_pack; + u32 handle; + + handle = lower_32_bits(args->map_device.handle); + spin_lock(&vm->idr_lock); + phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle); + if (!phys_pg_pack) { + spin_unlock(&vm->idr_lock); + dev_err(hdev->dev, "no match for handle %u\n", handle); + return -EINVAL; + } + + *paddr = phys_pg_pack->pages[0]; + + spin_unlock(&vm->idr_lock); + + return 0; +} + +/** + * map_device_va() - map the given memory. + * @ctx: pointer to the context structure. + * @args: host parameters with handle/host virtual address. + * @device_addr: pointer to result device virtual address. 
+ * + * This function does the following: + * - If given a physical device memory handle, map to a device virtual block + * and return the start address of this block. + * - If given a host virtual address and size, find the related physical pages, + * map a device virtual block to this pages and return the start address of + * this block. + */ +static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device_addr) +{ + struct hl_vm_phys_pg_pack *phys_pg_pack; + enum hl_va_range_type va_range_type = 0; + struct hl_device *hdev = ctx->hdev; + struct hl_userptr *userptr = NULL; + u32 handle = 0, va_block_align; + struct hl_vm_hash_node *hnode; + struct hl_vm *vm = &hdev->vm; + struct hl_va_range *va_range; + bool is_userptr, do_prefetch; + u64 ret_vaddr, hint_addr; + enum vm_type *vm_type; + int rc; + + /* set map flags */ + is_userptr = args->flags & HL_MEM_USERPTR; + do_prefetch = hdev->supports_mmu_prefetch && (args->flags & HL_MEM_PREFETCH); + + /* Assume failure */ + *device_addr = 0; + + if (is_userptr) { + u64 addr = args->map_host.host_virt_addr, + size = args->map_host.mem_size; + u32 page_size = hdev->asic_prop.pmmu.page_size, + huge_page_size = hdev->asic_prop.pmmu_huge.page_size; + + rc = dma_map_host_va(hdev, addr, size, &userptr); + if (rc) { + dev_err(hdev->dev, "failed to get userptr from va\n"); + return rc; + } + + rc = init_phys_pg_pack_from_userptr(ctx, userptr, + &phys_pg_pack, false); + if (rc) { + dev_err(hdev->dev, + "unable to init page pack for vaddr 0x%llx\n", + addr); + goto init_page_pack_err; + } + + vm_type = (enum vm_type *) userptr; + hint_addr = args->map_host.hint_addr; + handle = phys_pg_pack->handle; + + /* get required alignment */ + if (phys_pg_pack->page_size == page_size) { + va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST]; + va_range_type = HL_VA_RANGE_TYPE_HOST; + /* + * huge page alignment may be needed in case of regular + * page mapping, depending on the host VA alignment + */ + if (addr & (huge_page_size - 1)) + va_block_align = page_size; + else + va_block_align = huge_page_size; + } else { + /* + * huge page alignment is needed in case of huge page + * mapping + */ + va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]; + va_range_type = HL_VA_RANGE_TYPE_HOST_HUGE; + va_block_align = huge_page_size; + } + } else { + handle = lower_32_bits(args->map_device.handle); + + spin_lock(&vm->idr_lock); + phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle); + if (!phys_pg_pack) { + spin_unlock(&vm->idr_lock); + dev_err(hdev->dev, + "no match for handle %u\n", handle); + return -EINVAL; + } + + /* increment now to avoid freeing device memory while mapping */ + atomic_inc(&phys_pg_pack->mapping_cnt); + + spin_unlock(&vm->idr_lock); + + vm_type = (enum vm_type *) phys_pg_pack; + + hint_addr = args->map_device.hint_addr; + + /* DRAM VA alignment is the same as the MMU page size */ + va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM]; + va_range_type = HL_VA_RANGE_TYPE_DRAM; + va_block_align = hdev->asic_prop.dmmu.page_size; + } + + /* + * relevant for mapping device physical memory only, as host memory is + * implicitly shared + */ + if (!is_userptr && !(phys_pg_pack->flags & HL_MEM_SHARED) && + phys_pg_pack->asid != ctx->asid) { + dev_err(hdev->dev, + "Failed to map memory, handle %u is not shared\n", + handle); + rc = -EPERM; + goto shared_err; + } + + hnode = kzalloc(sizeof(*hnode), GFP_KERNEL); + if (!hnode) { + rc = -ENOMEM; + goto hnode_err; + } + + if (hint_addr && phys_pg_pack->offset) { + if (args->flags & HL_MEM_FORCE_HINT) { + 
/* Fail if hint must be respected but it can't be */ + dev_err(hdev->dev, + "Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n", + hint_addr, phys_pg_pack->offset); + rc = -EINVAL; + goto va_block_err; + } + dev_dbg(hdev->dev, + "Hint address 0x%llx will be ignored because source memory is not aligned 0x%x\n", + hint_addr, phys_pg_pack->offset); + } + + ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size, + hint_addr, va_block_align, + va_range_type, args->flags); + if (!ret_vaddr) { + dev_err(hdev->dev, "no available va block for handle %u\n", + handle); + rc = -ENOMEM; + goto va_block_err; + } + + mutex_lock(&hdev->mmu_lock); + + rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack); + if (rc) { + dev_err(hdev->dev, "mapping page pack failed for handle %u\n", handle); + mutex_unlock(&hdev->mmu_lock); + goto map_err; + } + + rc = hl_mmu_invalidate_cache_range(hdev, false, *vm_type | MMU_OP_SKIP_LOW_CACHE_INV, + ctx->asid, ret_vaddr, phys_pg_pack->total_size); + mutex_unlock(&hdev->mmu_lock); + if (rc) + goto map_err; + + /* + * prefetch is done upon user's request. it is performed in WQ as and so can + * be outside the MMU lock. the operation itself is already protected by the mmu lock + */ + if (do_prefetch) { + rc = hl_mmu_prefetch_cache_range(ctx, *vm_type, ctx->asid, ret_vaddr, + phys_pg_pack->total_size); + if (rc) + goto map_err; + } + + ret_vaddr += phys_pg_pack->offset; + + hnode->ptr = vm_type; + hnode->vaddr = ret_vaddr; + + mutex_lock(&ctx->mem_hash_lock); + hash_add(ctx->mem_hash, &hnode->node, ret_vaddr); + mutex_unlock(&ctx->mem_hash_lock); + + *device_addr = ret_vaddr; + + if (is_userptr) + free_phys_pg_pack(hdev, phys_pg_pack); + + return rc; + +map_err: + if (add_va_block(hdev, va_range, ret_vaddr, + ret_vaddr + phys_pg_pack->total_size - 1)) + dev_warn(hdev->dev, + "release va block failed for handle 0x%x, vaddr: 0x%llx\n", + handle, ret_vaddr); + +va_block_err: + kfree(hnode); +hnode_err: +shared_err: + atomic_dec(&phys_pg_pack->mapping_cnt); + if (is_userptr) + free_phys_pg_pack(hdev, phys_pg_pack); +init_page_pack_err: + if (is_userptr) + dma_unmap_host_va(hdev, userptr); + + return rc; +} + +/** + * unmap_device_va() - unmap the given device virtual address. + * @ctx: pointer to the context structure. + * @args: host parameters with device virtual address to unmap. + * @ctx_free: true if in context free flow, false otherwise. + * + * This function does the following: + * - unmap the physical pages related to the given virtual address. + * - return the device virtual block to the virtual block list. 
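+ * - free the userptr structure when the mapping originated from host memory.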
+ */ +static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, + bool ctx_free) +{ + struct hl_vm_phys_pg_pack *phys_pg_pack = NULL; + u64 vaddr = args->unmap.device_virt_addr; + struct hl_vm_hash_node *hnode = NULL; + struct asic_fixed_properties *prop; + struct hl_device *hdev = ctx->hdev; + struct hl_userptr *userptr = NULL; + struct hl_va_range *va_range; + enum vm_type *vm_type; + bool is_userptr; + int rc = 0; + + prop = &hdev->asic_prop; + + /* protect from double entrance */ + mutex_lock(&ctx->mem_hash_lock); + hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)vaddr) + if (vaddr == hnode->vaddr) + break; + + if (!hnode) { + mutex_unlock(&ctx->mem_hash_lock); + dev_err(hdev->dev, + "unmap failed, no mem hnode for vaddr 0x%llx\n", + vaddr); + return -EINVAL; + } + + hash_del(&hnode->node); + mutex_unlock(&ctx->mem_hash_lock); + + vm_type = hnode->ptr; + + if (*vm_type == VM_TYPE_USERPTR) { + is_userptr = true; + userptr = hnode->ptr; + + rc = init_phys_pg_pack_from_userptr(ctx, userptr, &phys_pg_pack, + false); + if (rc) { + dev_err(hdev->dev, + "unable to init page pack for vaddr 0x%llx\n", + vaddr); + goto vm_type_err; + } + + if (phys_pg_pack->page_size == + hdev->asic_prop.pmmu.page_size) + va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST]; + else + va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]; + } else if (*vm_type == VM_TYPE_PHYS_PACK) { + is_userptr = false; + va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM]; + phys_pg_pack = hnode->ptr; + } else { + dev_warn(hdev->dev, + "unmap failed, unknown vm desc for vaddr 0x%llx\n", + vaddr); + rc = -EFAULT; + goto vm_type_err; + } + + if (atomic_read(&phys_pg_pack->mapping_cnt) == 0) { + dev_err(hdev->dev, "vaddr 0x%llx is not mapped\n", vaddr); + rc = -EINVAL; + goto mapping_cnt_err; + } + + if (!is_userptr && !is_power_of_2(phys_pg_pack->page_size)) + vaddr = prop->dram_base_address + + DIV_ROUND_DOWN_ULL(vaddr - prop->dram_base_address, + phys_pg_pack->page_size) * + phys_pg_pack->page_size; + else + vaddr &= ~(((u64) phys_pg_pack->page_size) - 1); + + mutex_lock(&hdev->mmu_lock); + + unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack); + + /* + * During context free this function is called in a loop to clean all + * the context mappings. Hence the cache invalidation can be called once + * at the loop end rather than for each iteration + */ + if (!ctx_free) + rc = hl_mmu_invalidate_cache_range(hdev, true, *vm_type, ctx->asid, vaddr, + phys_pg_pack->total_size); + + mutex_unlock(&hdev->mmu_lock); + + /* + * If the context is closing we don't need to check for the MMU cache + * invalidation return code and update the VA free list as in this flow + * we invalidate the MMU cache outside of this unmap function and the VA + * free list will be freed anyway. 
+ */ + if (!ctx_free) { + int tmp_rc; + + tmp_rc = add_va_block(hdev, va_range, vaddr, + vaddr + phys_pg_pack->total_size - 1); + if (tmp_rc) { + dev_warn(hdev->dev, + "add va block failed for vaddr: 0x%llx\n", + vaddr); + if (!rc) + rc = tmp_rc; + } + } + + atomic_dec(&phys_pg_pack->mapping_cnt); + kfree(hnode); + + if (is_userptr) { + free_phys_pg_pack(hdev, phys_pg_pack); + dma_unmap_host_va(hdev, userptr); + } + + return rc; + +mapping_cnt_err: + if (is_userptr) + free_phys_pg_pack(hdev, phys_pg_pack); +vm_type_err: + mutex_lock(&ctx->mem_hash_lock); + hash_add(ctx->mem_hash, &hnode->node, vaddr); + mutex_unlock(&ctx->mem_hash_lock); + + return rc; +} + +static int map_block(struct hl_device *hdev, u64 address, u64 *handle, u32 *size) +{ + u32 block_id; + int rc; + + *handle = 0; + if (size) + *size = 0; + + rc = hdev->asic_funcs->get_hw_block_id(hdev, address, size, &block_id); + if (rc) + return rc; + + *handle = block_id | HL_MMAP_TYPE_BLOCK; + *handle <<= PAGE_SHIFT; + + return 0; +} + +static void hw_block_vm_close(struct vm_area_struct *vma) +{ + struct hl_vm_hw_block_list_node *lnode = + (struct hl_vm_hw_block_list_node *) vma->vm_private_data; + struct hl_ctx *ctx = lnode->ctx; + long new_mmap_size; + + new_mmap_size = lnode->mapped_size - (vma->vm_end - vma->vm_start); + if (new_mmap_size > 0) { + lnode->mapped_size = new_mmap_size; + return; + } + + mutex_lock(&ctx->hw_block_list_lock); + list_del(&lnode->node); + mutex_unlock(&ctx->hw_block_list_lock); + hl_ctx_put(ctx); + kfree(lnode); + vma->vm_private_data = NULL; +} + +static const struct vm_operations_struct hw_block_vm_ops = { + .close = hw_block_vm_close +}; + +/** + * hl_hw_block_mmap() - mmap a hw block to user. + * @hpriv: pointer to the private data of the fd + * @vma: pointer to vm_area_struct of the process + * + * Driver increments context reference for every HW block mapped in order + * to prevent user from closing FD without unmapping first + */ +int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma) +{ + struct hl_vm_hw_block_list_node *lnode; + struct hl_device *hdev = hpriv->hdev; + struct hl_ctx *ctx = hpriv->ctx; + u32 block_id, block_size; + int rc; + + /* We use the page offset to hold the block id and thus we need to clear + * it before doing the mmap itself + */ + block_id = vma->vm_pgoff; + vma->vm_pgoff = 0; + + /* Driver only allows mapping of a complete HW block */ + block_size = vma->vm_end - vma->vm_start; + + if (!access_ok((void __user *) (uintptr_t) vma->vm_start, block_size)) { + dev_err(hdev->dev, + "user pointer is invalid - 0x%lx\n", + vma->vm_start); + + return -EINVAL; + } + + lnode = kzalloc(sizeof(*lnode), GFP_KERNEL); + if (!lnode) + return -ENOMEM; + + rc = hdev->asic_funcs->hw_block_mmap(hdev, vma, block_id, block_size); + if (rc) { + kfree(lnode); + return rc; + } + + hl_ctx_get(ctx); + + lnode->ctx = ctx; + lnode->vaddr = vma->vm_start; + lnode->block_size = block_size; + lnode->mapped_size = lnode->block_size; + lnode->id = block_id; + + vma->vm_private_data = lnode; + vma->vm_ops = &hw_block_vm_ops; + + mutex_lock(&ctx->hw_block_list_lock); + list_add_tail(&lnode->node, &ctx->hw_block_mem_list); + mutex_unlock(&ctx->hw_block_list_lock); + + vma->vm_pgoff = block_id; + + return 0; +} + +static int set_dma_sg(struct scatterlist *sg, u64 bar_address, u64 chunk_size, + struct device *dev, enum dma_data_direction dir) +{ + dma_addr_t addr; + int rc; + + addr = dma_map_resource(dev, bar_address, chunk_size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + rc = 
dma_mapping_error(dev, addr); + if (rc) + return rc; + + sg_set_page(sg, NULL, chunk_size, 0); + sg_dma_address(sg) = addr; + sg_dma_len(sg) = chunk_size; + + return 0; +} + +static struct sg_table *alloc_sgt_from_device_pages(struct hl_device *hdev, u64 *pages, u64 npages, + u64 page_size, struct device *dev, + enum dma_data_direction dir) +{ + u64 chunk_size, bar_address, dma_max_seg_size; + struct asic_fixed_properties *prop; + int rc, i, j, nents, cur_page; + struct scatterlist *sg; + struct sg_table *sgt; + + prop = &hdev->asic_prop; + + dma_max_seg_size = dma_get_max_seg_size(dev); + + /* We would like to align the max segment size to PAGE_SIZE, so the + * SGL will contain aligned addresses that can be easily mapped to + * an MMU + */ + dma_max_seg_size = ALIGN_DOWN(dma_max_seg_size, PAGE_SIZE); + if (dma_max_seg_size < PAGE_SIZE) { + dev_err_ratelimited(hdev->dev, + "dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n", + dma_max_seg_size); + return ERR_PTR(-EINVAL); + } + + sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) + return ERR_PTR(-ENOMEM); + + /* If the size of each page is larger than the dma max segment size, + * then we can't combine pages and the number of entries in the SGL + * will just be the + * <number of pages> * <chunks of max segment size in each page> + */ + if (page_size > dma_max_seg_size) + nents = npages * DIV_ROUND_UP_ULL(page_size, dma_max_seg_size); + else + /* Get number of non-contiguous chunks */ + for (i = 1, nents = 1, chunk_size = page_size ; i < npages ; i++) { + if (pages[i - 1] + page_size != pages[i] || + chunk_size + page_size > dma_max_seg_size) { + nents++; + chunk_size = page_size; + continue; + } + + chunk_size += page_size; + } + + rc = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO); + if (rc) + goto error_free; + + cur_page = 0; + + if (page_size > dma_max_seg_size) { + u64 size_left, cur_device_address = 0; + + size_left = page_size; + + /* Need to split each page into the number of chunks of + * dma_max_seg_size + */ + for_each_sgtable_dma_sg(sgt, sg, i) { + if (size_left == page_size) + cur_device_address = + pages[cur_page] - prop->dram_base_address; + else + cur_device_address += dma_max_seg_size; + + chunk_size = min(size_left, dma_max_seg_size); + + bar_address = hdev->dram_pci_bar_start + cur_device_address; + + rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir); + if (rc) + goto error_unmap; + + if (size_left > dma_max_seg_size) { + size_left -= dma_max_seg_size; + } else { + cur_page++; + size_left = page_size; + } + } + } else { + /* Merge pages and put them into the scatterlist */ + for_each_sgtable_dma_sg(sgt, sg, i) { + chunk_size = page_size; + for (j = cur_page + 1 ; j < npages ; j++) { + if (pages[j - 1] + page_size != pages[j] || + chunk_size + page_size > dma_max_seg_size) + break; + + chunk_size += page_size; + } + + bar_address = hdev->dram_pci_bar_start + + (pages[cur_page] - prop->dram_base_address); + + rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir); + if (rc) + goto error_unmap; + + cur_page = j; + } + } + + /* Because we are not going to include a CPU list we want to have some + * chance that other users will detect this by setting the orig_nents + * to 0 and using only nents (length of DMA list) when going over the + * sgl + */ + sgt->orig_nents = 0; + + return sgt; + +error_unmap: + for_each_sgtable_dma_sg(sgt, sg, i) { + if (!sg_dma_len(sg)) + continue; + + dma_unmap_resource(dev, sg_dma_address(sg), + sg_dma_len(sg), dir, + DMA_ATTR_SKIP_CPU_SYNC); + } + + sg_free_table(sgt); + 
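+	/* The sgt wrapper itself is freed in the error_free path below */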
+error_free: + kfree(sgt); + return ERR_PTR(rc); +} + +static int hl_dmabuf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct hl_dmabuf_priv *hl_dmabuf; + struct hl_device *hdev; + int rc; + + hl_dmabuf = dmabuf->priv; + hdev = hl_dmabuf->ctx->hdev; + + rc = pci_p2pdma_distance_many(hdev->pdev, &attachment->dev, 1, true); + + if (rc < 0) + attachment->peer2peer = false; + return 0; +} + +static struct sg_table *hl_map_dmabuf(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct dma_buf *dma_buf = attachment->dmabuf; + struct hl_vm_phys_pg_pack *phys_pg_pack; + struct hl_dmabuf_priv *hl_dmabuf; + struct hl_device *hdev; + struct sg_table *sgt; + + hl_dmabuf = dma_buf->priv; + hdev = hl_dmabuf->ctx->hdev; + phys_pg_pack = hl_dmabuf->phys_pg_pack; + + if (!attachment->peer2peer) { + dev_dbg(hdev->dev, "Failed to map dmabuf because p2p is disabled\n"); + return ERR_PTR(-EPERM); + } + + if (phys_pg_pack) + sgt = alloc_sgt_from_device_pages(hdev, + phys_pg_pack->pages, + phys_pg_pack->npages, + phys_pg_pack->page_size, + attachment->dev, + dir); + else + sgt = alloc_sgt_from_device_pages(hdev, + &hl_dmabuf->device_address, + 1, + hl_dmabuf->dmabuf->size, + attachment->dev, + dir); + + if (IS_ERR(sgt)) + dev_err(hdev->dev, "failed (%ld) to initialize sgt for dmabuf\n", PTR_ERR(sgt)); + + return sgt; +} + +static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + /* The memory behind the dma-buf has *always* resided on the device itself, i.e. it lives + * only in the 'device' domain (after all, it maps a PCI bar address which points to the + * device memory). + * + * Therefore, it was never in the 'CPU' domain and hence, there is no need to perform + * a sync of the memory to the CPU's cache, as it never resided inside that cache. 
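+ *
+ * For the same reason, DMA_ATTR_SKIP_CPU_SYNC is passed when unmapping the
+ * resource below.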
+ */ + for_each_sgtable_dma_sg(sgt, sg, i) + dma_unmap_resource(attachment->dev, sg_dma_address(sg), + sg_dma_len(sg), dir, + DMA_ATTR_SKIP_CPU_SYNC); + + /* Need to restore orig_nents because sg_free_table use that field */ + sgt->orig_nents = sgt->nents; + sg_free_table(sgt); + kfree(sgt); +} + +static void hl_release_dmabuf(struct dma_buf *dmabuf) +{ + struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv; + struct hl_ctx *ctx = hl_dmabuf->ctx; + struct hl_device *hdev = ctx->hdev; + struct hl_vm *vm = &hdev->vm; + + if (hl_dmabuf->phys_pg_pack) { + spin_lock(&vm->idr_lock); + hl_dmabuf->phys_pg_pack->exporting_cnt--; + spin_unlock(&vm->idr_lock); + } + + hl_ctx_put(hl_dmabuf->ctx); + + kfree(hl_dmabuf); +} + +static const struct dma_buf_ops habanalabs_dmabuf_ops = { + .attach = hl_dmabuf_attach, + .map_dma_buf = hl_map_dmabuf, + .unmap_dma_buf = hl_unmap_dmabuf, + .release = hl_release_dmabuf, +}; + +static int export_dmabuf_common(struct hl_ctx *ctx, + struct hl_dmabuf_priv *hl_dmabuf, + u64 total_size, int flags, int *dmabuf_fd) +{ + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct hl_device *hdev = ctx->hdev; + int rc, fd; + + exp_info.ops = &habanalabs_dmabuf_ops; + exp_info.size = total_size; + exp_info.flags = flags; + exp_info.priv = hl_dmabuf; + + hl_dmabuf->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(hl_dmabuf->dmabuf)) { + dev_err(hdev->dev, "failed to export dma-buf\n"); + return PTR_ERR(hl_dmabuf->dmabuf); + } + + fd = dma_buf_fd(hl_dmabuf->dmabuf, flags); + if (fd < 0) { + dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf\n"); + rc = fd; + goto err_dma_buf_put; + } + + hl_dmabuf->ctx = ctx; + hl_ctx_get(hl_dmabuf->ctx); + + *dmabuf_fd = fd; + + return 0; + +err_dma_buf_put: + dma_buf_put(hl_dmabuf->dmabuf); + return rc; +} + +/** + * export_dmabuf_from_addr() - export a dma-buf object for the given memory + * address and size. + * @ctx: pointer to the context structure. + * @device_addr: device memory physical address. + * @size: size of device memory. + * @flags: DMA-BUF file/FD flags. + * @dmabuf_fd: pointer to result FD that represents the dma-buf object. + * + * Create and export a dma-buf object for an existing memory allocation inside + * the device memory, and return a FD which is associated with the dma-buf + * object. + * + * Return: 0 on success, non-zero for failure. 
+ */ +static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 device_addr, + u64 size, int flags, int *dmabuf_fd) +{ + struct hl_dmabuf_priv *hl_dmabuf; + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop; + u64 bar_address; + int rc; + + prop = &hdev->asic_prop; + + if (!IS_ALIGNED(device_addr, PAGE_SIZE)) { + dev_dbg(hdev->dev, + "exported device memory address 0x%llx should be aligned to 0x%lx\n", + device_addr, PAGE_SIZE); + return -EINVAL; + } + + if (size < PAGE_SIZE) { + dev_dbg(hdev->dev, + "exported device memory size %llu should be equal to or greater than %lu\n", + size, PAGE_SIZE); + return -EINVAL; + } + + if (device_addr < prop->dram_user_base_address || + device_addr + size > prop->dram_end_address || + device_addr + size < device_addr) { + dev_dbg(hdev->dev, + "DRAM memory range 0x%llx (+0x%llx) is outside of DRAM boundaries\n", + device_addr, size); + return -EINVAL; + } + + bar_address = hdev->dram_pci_bar_start + + (device_addr - prop->dram_base_address); + + if (bar_address + size > + hdev->dram_pci_bar_start + prop->dram_pci_bar_size || + bar_address + size < bar_address) { + dev_dbg(hdev->dev, + "DRAM memory range 0x%llx (+0x%llx) is outside of PCI BAR boundaries\n", + device_addr, size); + return -EINVAL; + } + + hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL); + if (!hl_dmabuf) + return -ENOMEM; + + hl_dmabuf->device_address = device_addr; + + rc = export_dmabuf_common(ctx, hl_dmabuf, size, flags, dmabuf_fd); + if (rc) + goto err_free_dmabuf_wrapper; + + return 0; + +err_free_dmabuf_wrapper: + kfree(hl_dmabuf); + return rc; +} + +/** + * export_dmabuf_from_handle() - export a dma-buf object for the given memory + * handle. + * @ctx: pointer to the context structure. + * @handle: device memory allocation handle. + * @flags: DMA-BUF file/FD flags. + * @dmabuf_fd: pointer to result FD that represents the dma-buf object. + * + * Create and export a dma-buf object for an existing memory allocation inside + * the device memory, and return a FD which is associated with the dma-buf + * object. + * + * Return: 0 on success, non-zero for failure. 
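The interval part of the validation done in export_dmabuf_from_addr() reduces to simple range arithmetic: the exported span must sit inside user DRAM, and its image in the PCI BAR must sit inside the BAR, with explicit wrap-around checks. A self-contained sketch of that check follows; the parameter names are illustrative, not the driver's structures.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative bounds check; base/size values are hypothetical. */
static bool range_fits(uint64_t addr, uint64_t size, uint64_t base, uint64_t end)
{
	/* reject zero size and wrap-around before comparing against the window */
	if (size == 0 || addr + size < addr)
		return false;
	return addr >= base && addr + size <= end;
}

static bool dram_export_ok(uint64_t device_addr, uint64_t size,
			   uint64_t dram_base, uint64_t dram_user_base,
			   uint64_t dram_end, uint64_t bar_start, uint64_t bar_size)
{
	uint64_t bar_address;

	if (!range_fits(device_addr, size, dram_user_base, dram_end))
		return false;

	/* the BAR maps DRAM starting at dram_base, so translate by that offset */
	bar_address = bar_start + (device_addr - dram_base);
	return range_fits(bar_address, size, bar_start, bar_start + bar_size);
}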
+ */ +static int export_dmabuf_from_handle(struct hl_ctx *ctx, u64 handle, int flags, + int *dmabuf_fd) +{ + struct hl_vm_phys_pg_pack *phys_pg_pack; + struct hl_dmabuf_priv *hl_dmabuf; + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop; + struct hl_vm *vm = &hdev->vm; + u64 bar_address; + int rc, i; + + prop = &hdev->asic_prop; + + if (upper_32_bits(handle)) { + dev_dbg(hdev->dev, "no match for handle 0x%llx\n", handle); + return -EINVAL; + } + + spin_lock(&vm->idr_lock); + + phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, (u32) handle); + if (!phys_pg_pack) { + spin_unlock(&vm->idr_lock); + dev_dbg(hdev->dev, "no match for handle 0x%x\n", (u32) handle); + return -EINVAL; + } + + /* increment now to avoid freeing device memory while exporting */ + phys_pg_pack->exporting_cnt++; + + spin_unlock(&vm->idr_lock); + + if (phys_pg_pack->vm_type != VM_TYPE_PHYS_PACK) { + dev_dbg(hdev->dev, "handle 0x%llx does not represent DRAM memory\n", handle); + rc = -EINVAL; + goto err_dec_exporting_cnt; + } + + for (i = 0 ; i < phys_pg_pack->npages ; i++) { + + bar_address = hdev->dram_pci_bar_start + + (phys_pg_pack->pages[i] - + prop->dram_base_address); + + if (bar_address + phys_pg_pack->page_size > + hdev->dram_pci_bar_start + prop->dram_pci_bar_size || + bar_address + phys_pg_pack->page_size < bar_address) { + + dev_dbg(hdev->dev, + "DRAM memory range 0x%llx (+0x%x) is outside of PCI BAR boundaries\n", + phys_pg_pack->pages[i], + phys_pg_pack->page_size); + + rc = -EINVAL; + goto err_dec_exporting_cnt; + } + } + + hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL); + if (!hl_dmabuf) { + rc = -ENOMEM; + goto err_dec_exporting_cnt; + } + + hl_dmabuf->phys_pg_pack = phys_pg_pack; + + rc = export_dmabuf_common(ctx, hl_dmabuf, phys_pg_pack->total_size, + flags, dmabuf_fd); + if (rc) + goto err_free_dmabuf_wrapper; + + return 0; + +err_free_dmabuf_wrapper: + kfree(hl_dmabuf); + +err_dec_exporting_cnt: + spin_lock(&vm->idr_lock); + phys_pg_pack->exporting_cnt--; + spin_unlock(&vm->idr_lock); + + return rc; +} + +static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u64 block_handle, device_addr = 0; + struct hl_ctx *ctx = hpriv->ctx; + u32 handle = 0, block_size; + int rc; + + switch (args->in.op) { + case HL_MEM_OP_ALLOC: + if (args->in.alloc.mem_size == 0) { + dev_err(hdev->dev, "alloc size must be larger than 0\n"); + rc = -EINVAL; + goto out; + } + + /* Force contiguous as there are no real MMU + * translations to overcome physical memory gaps + */ + args->in.flags |= HL_MEM_CONTIGUOUS; + rc = alloc_device_memory(ctx, &args->in, &handle); + + memset(args, 0, sizeof(*args)); + args->out.handle = (__u64) handle; + break; + + case HL_MEM_OP_FREE: + rc = free_device_memory(ctx, &args->in); + break; + + case HL_MEM_OP_MAP: + if (args->in.flags & HL_MEM_USERPTR) { + dev_err(hdev->dev, "Failed to map host memory when MMU is disabled\n"); + rc = -EPERM; + } else { + rc = get_paddr_from_handle(ctx, &args->in, &device_addr); + memset(args, 0, sizeof(*args)); + args->out.device_virt_addr = device_addr; + } + + break; + + case HL_MEM_OP_UNMAP: + rc = 0; + break; + + case HL_MEM_OP_MAP_BLOCK: + rc = map_block(hdev, args->in.map_block.block_addr, &block_handle, &block_size); + args->out.block_handle = block_handle; + args->out.block_size = block_size; + break; + + case HL_MEM_OP_EXPORT_DMABUF_FD: + dev_err(hdev->dev, "Failed to export dma-buf object when MMU is disabled\n"); + rc = -EPERM; + break; + + case 
HL_MEM_OP_TS_ALLOC: + rc = allocate_timestamps_buffers(hpriv, &args->in, &args->out.handle); + break; + default: + dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n"); + rc = -EINVAL; + break; + } + +out: + return rc; +} + +static void ts_buff_release(struct hl_mmap_mem_buf *buf) +{ + struct hl_ts_buff *ts_buff = buf->private; + + vfree(ts_buff->kernel_buff_address); + vfree(ts_buff->user_buff_address); + kfree(ts_buff); +} + +static int hl_ts_mmap(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, void *args) +{ + struct hl_ts_buff *ts_buff = buf->private; + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE; + return remap_vmalloc_range(vma, ts_buff->user_buff_address, 0); +} + +static int hl_ts_alloc_buf(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args) +{ + struct hl_ts_buff *ts_buff = NULL; + u32 num_elements; + size_t size; + void *p; + + num_elements = *(u32 *)args; + + ts_buff = kzalloc(sizeof(*ts_buff), gfp); + if (!ts_buff) + return -ENOMEM; + + /* Allocate the user buffer */ + size = num_elements * sizeof(u64); + p = vmalloc_user(size); + if (!p) + goto free_mem; + + ts_buff->user_buff_address = p; + buf->mappable_size = size; + + /* Allocate the internal kernel buffer */ + size = num_elements * sizeof(struct hl_user_pending_interrupt); + p = vmalloc(size); + if (!p) + goto free_user_buff; + + ts_buff->kernel_buff_address = p; + ts_buff->kernel_buff_size = size; + + buf->private = ts_buff; + + return 0; + +free_user_buff: + vfree(ts_buff->user_buff_address); +free_mem: + kfree(ts_buff); + return -ENOMEM; +} + +static struct hl_mmap_mem_buf_behavior hl_ts_behavior = { + .topic = "TS", + .mem_id = HL_MMAP_TYPE_TS_BUFF, + .mmap = hl_ts_mmap, + .alloc = hl_ts_alloc_buf, + .release = ts_buff_release, +}; + +/** + * allocate_timestamps_buffers() - allocate timestamps buffers + * This function will allocate ts buffer that will later on be mapped to the user + * in order to be able to read the timestamp. + * in additon it'll allocate an extra buffer for registration management. + * since we cannot fail during registration for out-of-memory situation, so + * we'll prepare a pool which will be used as user interrupt nodes and instead + * of dynamically allocating nodes while registration we'll pick the node from + * this pool. in addtion it'll add node to the mapping hash which will be used + * to map user ts buffer to the internal kernel ts buffer. + * @hpriv: pointer to the private data of the fd + * @args: ioctl input + * @handle: user timestamp buffer handle as an output + */ +static int allocate_timestamps_buffers(struct hl_fpriv *hpriv, struct hl_mem_in *args, u64 *handle) +{ + struct hl_mem_mgr *mmg = &hpriv->mem_mgr; + struct hl_mmap_mem_buf *buf; + + if (args->num_of_elements > TS_MAX_ELEMENTS_NUM) { + dev_err(mmg->dev, "Num of elements exceeds Max allowed number (0x%x > 0x%x)\n", + args->num_of_elements, TS_MAX_ELEMENTS_NUM); + return -EINVAL; + } + + buf = hl_mmap_mem_buf_alloc(mmg, &hl_ts_behavior, GFP_KERNEL, &args->num_of_elements); + if (!buf) + return -ENOMEM; + + *handle = buf->handle; + + return 0; +} + +int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data) +{ + enum hl_device_status status; + union hl_mem_args *args = data; + struct hl_device *hdev = hpriv->hdev; + struct hl_ctx *ctx = hpriv->ctx; + u64 block_handle, device_addr = 0; + u32 handle = 0, block_size; + int rc, dmabuf_fd = -EBADF; + + if (!hl_device_operational(hdev, &status)) { + dev_warn_ratelimited(hdev->dev, + "Device is %s. 
Can't execute MEMORY IOCTL\n", + hdev->status[status]); + return -EBUSY; + } + + if (!hdev->mmu_enable) + return mem_ioctl_no_mmu(hpriv, args); + + switch (args->in.op) { + case HL_MEM_OP_ALLOC: + if (args->in.alloc.mem_size == 0) { + dev_err(hdev->dev, + "alloc size must be larger than 0\n"); + rc = -EINVAL; + goto out; + } + + /* If DRAM does not support virtual memory the driver won't + * handle the allocation/freeing of that memory. However, for + * system administration/monitoring purposes, the driver will + * keep track of the amount of DRAM memory that is allocated + * and freed by the user. Because this code totally relies on + * the user's input, the driver can't ensure the validity + * of this accounting. + */ + if (!hdev->asic_prop.dram_supports_virtual_memory) { + atomic64_add(args->in.alloc.mem_size, + &ctx->dram_phys_mem); + atomic64_add(args->in.alloc.mem_size, + &hdev->dram_used_mem); + + dev_dbg(hdev->dev, "DRAM alloc is not supported\n"); + rc = 0; + + memset(args, 0, sizeof(*args)); + args->out.handle = 0; + goto out; + } + + rc = alloc_device_memory(ctx, &args->in, &handle); + + memset(args, 0, sizeof(*args)); + args->out.handle = (__u64) handle; + break; + + case HL_MEM_OP_FREE: + /* If DRAM does not support virtual memory the driver won't + * handle the allocation/freeing of that memory. However, for + * system administration/monitoring purposes, the driver will + * keep track of the amount of DRAM memory that is allocated + * and freed by the user. Because this code totally relies on + * the user's input, the driver can't ensure the validity + * of this accounting. + */ + if (!hdev->asic_prop.dram_supports_virtual_memory) { + atomic64_sub(args->in.alloc.mem_size, + &ctx->dram_phys_mem); + atomic64_sub(args->in.alloc.mem_size, + &hdev->dram_used_mem); + + dev_dbg(hdev->dev, "DRAM alloc is not supported\n"); + rc = 0; + + goto out; + } + + rc = free_device_memory(ctx, &args->in); + break; + + case HL_MEM_OP_MAP: + rc = map_device_va(ctx, &args->in, &device_addr); + + memset(args, 0, sizeof(*args)); + args->out.device_virt_addr = device_addr; + break; + + case HL_MEM_OP_UNMAP: + rc = unmap_device_va(ctx, &args->in, false); + break; + + case HL_MEM_OP_MAP_BLOCK: + rc = map_block(hdev, args->in.map_block.block_addr, + &block_handle, &block_size); + args->out.block_handle = block_handle; + args->out.block_size = block_size; + break; + + case HL_MEM_OP_EXPORT_DMABUF_FD: + if (hdev->asic_prop.dram_supports_virtual_memory) + rc = export_dmabuf_from_handle(ctx, + args->in.export_dmabuf_fd.handle, + args->in.flags, + &dmabuf_fd); + else + rc = export_dmabuf_from_addr(ctx, + args->in.export_dmabuf_fd.handle, + args->in.export_dmabuf_fd.mem_size, + args->in.flags, + &dmabuf_fd); + memset(args, 0, sizeof(*args)); + args->out.fd = dmabuf_fd; + break; + + case HL_MEM_OP_TS_ALLOC: + rc = allocate_timestamps_buffers(hpriv, &args->in, &args->out.handle); + break; + default: + dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n"); + rc = -EINVAL; + break; + } + +out: + return rc; +} + +static int get_user_memory(struct hl_device *hdev, u64 addr, u64 size, + u32 npages, u64 start, u32 offset, + struct hl_userptr *userptr) +{ + int rc; + + if (!access_ok((void __user *) (uintptr_t) addr, size)) { + dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr); + return -EFAULT; + } + + userptr->pages = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); + if (!userptr->pages) + return -ENOMEM; + + rc = pin_user_pages_fast(start, npages, + FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM, 
+ userptr->pages); + + if (rc != npages) { + dev_err(hdev->dev, + "Failed (%d) to pin host memory with user ptr 0x%llx, size 0x%llx, npages %d\n", + rc, addr, size, npages); + if (rc < 0) + goto destroy_pages; + npages = rc; + rc = -EFAULT; + goto put_pages; + } + userptr->npages = npages; + + rc = sg_alloc_table_from_pages(userptr->sgt, + userptr->pages, + npages, offset, size, GFP_KERNEL); + if (rc < 0) { + dev_err(hdev->dev, "failed to create SG table from pages\n"); + goto put_pages; + } + + return 0; + +put_pages: + unpin_user_pages(userptr->pages, npages); +destroy_pages: + kvfree(userptr->pages); + return rc; +} + +/** + * hl_pin_host_memory() - pins a chunk of host memory. + * @hdev: pointer to the habanalabs device structure. + * @addr: the host virtual address of the memory area. + * @size: the size of the memory area. + * @userptr: pointer to hl_userptr structure. + * + * This function does the following: + * - Pins the physical pages. + * - Create an SG list from those pages. + */ +int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size, + struct hl_userptr *userptr) +{ + u64 start, end; + u32 npages, offset; + int rc; + + if (!size) { + dev_err(hdev->dev, "size to pin is invalid - %llu\n", size); + return -EINVAL; + } + + /* + * If the combination of the address and size requested for this memory + * region causes an integer overflow, return error. + */ + if (((addr + size) < addr) || + PAGE_ALIGN(addr + size) < (addr + size)) { + dev_err(hdev->dev, + "user pointer 0x%llx + %llu causes integer overflow\n", + addr, size); + return -EINVAL; + } + + userptr->pid = current->pid; + userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_KERNEL); + if (!userptr->sgt) + return -ENOMEM; + + start = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; + end = PAGE_ALIGN(addr + size); + npages = (end - start) >> PAGE_SHIFT; + + userptr->size = size; + userptr->addr = addr; + userptr->dma_mapped = false; + INIT_LIST_HEAD(&userptr->job_node); + + rc = get_user_memory(hdev, addr, size, npages, start, offset, + userptr); + if (rc) { + dev_err(hdev->dev, + "failed to get user memory for address 0x%llx\n", + addr); + goto free_sgt; + } + + hl_debugfs_add_userptr(hdev, userptr); + + return 0; + +free_sgt: + kfree(userptr->sgt); + return rc; +} + +/* + * hl_unpin_host_memory - unpins a chunk of host memory. + * @hdev: pointer to the habanalabs device structure + * @userptr: pointer to hl_userptr structure + * + * This function does the following: + * - Unpins the physical pages related to the host memory + * - Free the SG list + */ +void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr) +{ + hl_debugfs_remove_userptr(hdev, userptr); + + if (userptr->dma_mapped) + hdev->asic_funcs->hl_dma_unmap_sgtable(hdev, userptr->sgt, userptr->dir); + + unpin_user_pages_dirty_lock(userptr->pages, userptr->npages, true); + kvfree(userptr->pages); + + list_del(&userptr->job_node); + + sg_free_table(userptr->sgt); + kfree(userptr->sgt); +} + +/** + * hl_userptr_delete_list() - clear userptr list. + * @hdev: pointer to the habanalabs device structure. + * @userptr_list: pointer to the list to clear. + * + * This function does the following: + * - Iterates over the list and unpins the host memory and frees the userptr + * structure. 
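The pinning path computes the page-aligned window around the user range before calling pin_user_pages_fast(): the start is rounded down, the end rounded up, and the in-page offset is kept for building the SG table. A small stand-alone example of that arithmetic, assuming 4 KB pages for illustration:

#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SHIFT 12ULL			/* example: 4 KB pages */
#define EX_PAGE_SIZE  (1ULL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK  (~(EX_PAGE_SIZE - 1))

int main(void)
{
	uint64_t addr = 0x7f12345678ULL, size = 0x3000ULL;
	uint64_t start = addr & EX_PAGE_MASK;		/* round down to a page boundary */
	uint32_t offset = addr & ~EX_PAGE_MASK;		/* offset inside the first page */
	uint64_t end = (addr + size + EX_PAGE_SIZE - 1) & EX_PAGE_MASK;	/* round up */
	uint64_t npages = (end - start) >> EX_PAGE_SHIFT;

	/* 0x3000 bytes starting mid-page span 4 pages here, not 3 */
	printf("start=%#llx offset=%#x npages=%llu\n",
	       (unsigned long long)start, offset, (unsigned long long)npages);
	return 0;
}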
+ */ +void hl_userptr_delete_list(struct hl_device *hdev, + struct list_head *userptr_list) +{ + struct hl_userptr *userptr, *tmp; + + list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) { + hl_unpin_host_memory(hdev, userptr); + kfree(userptr); + } + + INIT_LIST_HEAD(userptr_list); +} + +/** + * hl_userptr_is_pinned() - returns whether the given userptr is pinned. + * @hdev: pointer to the habanalabs device structure. + * @addr: user address to check. + * @size: user block size to check. + * @userptr_list: pointer to the list to clear. + * @userptr: pointer to userptr to check. + * + * This function does the following: + * - Iterates over the list and checks if the given userptr is in it, means is + * pinned. If so, returns true, otherwise returns false. + */ +bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, + u32 size, struct list_head *userptr_list, + struct hl_userptr **userptr) +{ + list_for_each_entry((*userptr), userptr_list, job_node) { + if ((addr == (*userptr)->addr) && (size == (*userptr)->size)) + return true; + } + + return false; +} + +/** + * va_range_init() - initialize virtual addresses range. + * @hdev: pointer to the habanalabs device structure. + * @va_ranges: pointer to va_ranges array. + * @range_type: virtual address range type. + * @start: range start address, inclusive. + * @end: range end address, inclusive. + * @page_size: page size for this va_range. + * + * This function does the following: + * - Initializes the virtual addresses list of the given range with the given + * addresses. + */ +static int va_range_init(struct hl_device *hdev, struct hl_va_range **va_ranges, + enum hl_va_range_type range_type, u64 start, + u64 end, u32 page_size) +{ + struct hl_va_range *va_range = va_ranges[range_type]; + int rc; + + INIT_LIST_HEAD(&va_range->list); + + /* + * PAGE_SIZE alignment + * it is the callers responsibility to align the addresses if the + * page size is not a power of 2 + */ + + if (is_power_of_2(page_size)) { + if (start & (PAGE_SIZE - 1)) { + start &= PAGE_MASK; + start += PAGE_SIZE; + } + + /* + * The end of the range is inclusive, hence we need to align it + * to the end of the last full page in the range. For example if + * end = 0x3ff5 with page size 0x1000, we need to align it to + * 0x2fff. The remainig 0xff5 bytes do not form a full page. + */ + if ((end + 1) & (PAGE_SIZE - 1)) + end = ((end + 1) & PAGE_MASK) - 1; + } + + if (start >= end) { + dev_err(hdev->dev, "too small vm range for va list\n"); + return -EFAULT; + } + + rc = add_va_block(hdev, va_range, start, end); + + if (rc) { + dev_err(hdev->dev, "Failed to init host va list\n"); + return rc; + } + + va_range->start_addr = start; + va_range->end_addr = end; + va_range->page_size = page_size; + + return 0; +} + +/** + * va_range_fini() - clear a virtual addresses range. + * @hdev: pointer to the habanalabs structure. + * @va_range: pointer to virtual addresses range. + * + * This function does the following: + * - Frees the virtual addresses block list and its lock. + */ +static void va_range_fini(struct hl_device *hdev, struct hl_va_range *va_range) +{ + mutex_lock(&va_range->lock); + clear_va_list_locked(hdev, &va_range->list); + mutex_unlock(&va_range->lock); + + mutex_destroy(&va_range->lock); + kfree(va_range); +} + +/** + * vm_ctx_init_with_ranges() - initialize virtual memory for context. + * @ctx: pointer to the habanalabs context structure. + * @host_range_start: host virtual addresses range start. + * @host_range_end: host virtual addresses range end. 
+ * @host_page_size: host page size. + * @host_huge_range_start: host virtual addresses range start for memory + * allocated with huge pages. + * @host_huge_range_end: host virtual addresses range end for memory allocated + * with huge pages. + * @host_huge_page_size: host huge page size. + * @dram_range_start: dram virtual addresses range start. + * @dram_range_end: dram virtual addresses range end. + * @dram_page_size: dram page size. + * + * This function initializes the following: + * - MMU for context. + * - Virtual address to area descriptor hashtable. + * - Virtual block list of available virtual memory. + */ +static int vm_ctx_init_with_ranges(struct hl_ctx *ctx, + u64 host_range_start, + u64 host_range_end, + u32 host_page_size, + u64 host_huge_range_start, + u64 host_huge_range_end, + u32 host_huge_page_size, + u64 dram_range_start, + u64 dram_range_end, + u32 dram_page_size) +{ + struct hl_device *hdev = ctx->hdev; + int i, rc; + + for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++) { + ctx->va_range[i] = + kzalloc(sizeof(struct hl_va_range), GFP_KERNEL); + if (!ctx->va_range[i]) { + rc = -ENOMEM; + goto free_va_range; + } + } + + rc = hl_mmu_ctx_init(ctx); + if (rc) { + dev_err(hdev->dev, "failed to init context %d\n", ctx->asid); + goto free_va_range; + } + + mutex_init(&ctx->mem_hash_lock); + hash_init(ctx->mem_hash); + + mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock); + + rc = va_range_init(hdev, ctx->va_range, HL_VA_RANGE_TYPE_HOST, + host_range_start, host_range_end, host_page_size); + if (rc) { + dev_err(hdev->dev, "failed to init host vm range\n"); + goto mmu_ctx_fini; + } + + if (hdev->pmmu_huge_range) { + mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock); + + rc = va_range_init(hdev, + ctx->va_range, HL_VA_RANGE_TYPE_HOST_HUGE, + host_huge_range_start, host_huge_range_end, + host_huge_page_size); + if (rc) { + dev_err(hdev->dev, + "failed to init host huge vm range\n"); + goto clear_host_va_range; + } + } else { + kfree(ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]); + ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE] = + ctx->va_range[HL_VA_RANGE_TYPE_HOST]; + } + + mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock); + + rc = va_range_init(hdev, ctx->va_range, HL_VA_RANGE_TYPE_DRAM, + dram_range_start, dram_range_end, dram_page_size); + if (rc) { + dev_err(hdev->dev, "failed to init dram vm range\n"); + goto clear_host_huge_va_range; + } + + hl_debugfs_add_ctx_mem_hash(hdev, ctx); + + return 0; + +clear_host_huge_va_range: + mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock); + + if (hdev->pmmu_huge_range) { + mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock); + clear_va_list_locked(hdev, + &ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->list); + mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock); + } +clear_host_va_range: + if (hdev->pmmu_huge_range) + mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock); + mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock); + clear_va_list_locked(hdev, &ctx->va_range[HL_VA_RANGE_TYPE_HOST]->list); + mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock); +mmu_ctx_fini: + mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock); + mutex_destroy(&ctx->mem_hash_lock); + hl_mmu_ctx_fini(ctx); +free_va_range: + for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++) + kfree(ctx->va_range[i]); + + return rc; +} + +int hl_vm_ctx_init(struct hl_ctx *ctx) +{ + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + u64 host_range_start, host_range_end, host_huge_range_start, + 
host_huge_range_end, dram_range_start, dram_range_end; + u32 host_page_size, host_huge_page_size, dram_page_size; + + atomic64_set(&ctx->dram_phys_mem, 0); + + /* + * - If MMU is enabled, init the ranges as usual. + * - If MMU is disabled, in case of host mapping, the returned address + * is the given one. + * In case of DRAM mapping, the returned address is the physical + * address of the memory related to the given handle. + */ + if (!ctx->hdev->mmu_enable) + return 0; + + dram_range_start = prop->dmmu.start_addr; + dram_range_end = prop->dmmu.end_addr - 1; + dram_page_size = prop->dram_page_size ? + prop->dram_page_size : prop->dmmu.page_size; + host_range_start = prop->pmmu.start_addr; + host_range_end = prop->pmmu.end_addr - 1; + host_page_size = prop->pmmu.page_size; + host_huge_range_start = prop->pmmu_huge.start_addr; + host_huge_range_end = prop->pmmu_huge.end_addr - 1; + host_huge_page_size = prop->pmmu_huge.page_size; + + return vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end, + host_page_size, host_huge_range_start, + host_huge_range_end, host_huge_page_size, + dram_range_start, dram_range_end, dram_page_size); +} + +/** + * hl_vm_ctx_fini() - virtual memory teardown of context. + * @ctx: pointer to the habanalabs context structure. + * + * This function perform teardown the following: + * - Virtual block list of available virtual memory. + * - Virtual address to area descriptor hashtable. + * - MMU for context. + * + * In addition this function does the following: + * - Unmaps the existing hashtable nodes if the hashtable is not empty. The + * hashtable should be empty as no valid mappings should exist at this + * point. + * - Frees any existing physical page list from the idr which relates to the + * current context asid. + * - This function checks the virtual block list for correctness. At this point + * the list should contain one element which describes the whole virtual + * memory range of the context. Otherwise, a warning is printed. 
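Note the inclusive-end convention: hl_vm_ctx_init() above passes end_addr - 1, and va_range_init() then trims that inclusive end down to the last byte of the last full page (the 0x3ff5 example in its comment becomes 0x2fff). A tiny sketch reproducing that alignment, assuming a power-of-two page size:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t page_size = 0x1000, start = 0x0800, end = 0x3ff5;

	/* round the start up to the next full page */
	if (start & (page_size - 1))
		start = (start & ~(page_size - 1)) + page_size;

	/* trim the inclusive end down to the last byte of the last full page */
	if ((end + 1) & (page_size - 1))
		end = ((end + 1) & ~(page_size - 1)) - 1;

	/* prints start=0x1000 end=0x2fff */
	printf("start=%#llx end=%#llx\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}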
+ */ +void hl_vm_ctx_fini(struct hl_ctx *ctx) +{ + struct hl_vm_phys_pg_pack *phys_pg_list, *tmp_phys_node; + struct hl_device *hdev = ctx->hdev; + struct hl_vm_hash_node *hnode; + struct hl_vm *vm = &hdev->vm; + struct hlist_node *tmp_node; + struct list_head free_list; + struct hl_mem_in args; + int i; + + if (!hdev->mmu_enable) + return; + + hl_debugfs_remove_ctx_mem_hash(hdev, ctx); + + /* + * Clearly something went wrong on hard reset so no point in printing + * another side effect error + */ + if (!hdev->reset_info.hard_reset_pending && !hash_empty(ctx->mem_hash)) + dev_dbg(hdev->dev, + "user released device without removing its memory mappings\n"); + + hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) { + dev_dbg(hdev->dev, + "hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n", + hnode->vaddr, ctx->asid); + args.unmap.device_virt_addr = hnode->vaddr; + unmap_device_va(ctx, &args, true); + } + + mutex_lock(&hdev->mmu_lock); + + /* invalidate the cache once after the unmapping loop */ + hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); + hl_mmu_invalidate_cache(hdev, true, MMU_OP_PHYS_PACK); + + mutex_unlock(&hdev->mmu_lock); + + INIT_LIST_HEAD(&free_list); + + spin_lock(&vm->idr_lock); + idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i) + if (phys_pg_list->asid == ctx->asid) { + dev_dbg(hdev->dev, + "page list 0x%px of asid %d is still alive\n", + phys_pg_list, ctx->asid); + + atomic64_sub(phys_pg_list->total_size, &hdev->dram_used_mem); + idr_remove(&vm->phys_pg_pack_handles, i); + list_add(&phys_pg_list->node, &free_list); + } + spin_unlock(&vm->idr_lock); + + list_for_each_entry_safe(phys_pg_list, tmp_phys_node, &free_list, node) + free_phys_pg_pack(hdev, phys_pg_list); + + va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_DRAM]); + va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST]); + + if (hdev->pmmu_huge_range) + va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]); + + mutex_destroy(&ctx->mem_hash_lock); + hl_mmu_ctx_fini(ctx); + + /* In this case we need to clear the global accounting of DRAM usage + * because the user notifies us on allocations. If the user is no more, + * all DRAM is available + */ + if (ctx->asid != HL_KERNEL_ASID_ID && + !hdev->asic_prop.dram_supports_virtual_memory) + atomic64_set(&hdev->dram_used_mem, 0); +} + +/** + * hl_vm_init() - initialize virtual memory module. + * @hdev: pointer to the habanalabs device structure. + * + * This function initializes the following: + * - MMU module. + * - DRAM physical pages pool of 2MB. + * - Idr for device memory allocation handles. 
+ */ +int hl_vm_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_vm *vm = &hdev->vm; + int rc; + + if (is_power_of_2(prop->dram_page_size)) + vm->dram_pg_pool = + gen_pool_create(__ffs(prop->dram_page_size), -1); + else + vm->dram_pg_pool = + gen_pool_create(__ffs(DRAM_POOL_PAGE_SIZE), -1); + + if (!vm->dram_pg_pool) { + dev_err(hdev->dev, "Failed to create dram page pool\n"); + return -ENOMEM; + } + + kref_init(&vm->dram_pg_pool_refcount); + + rc = gen_pool_add(vm->dram_pg_pool, prop->dram_user_base_address, + prop->dram_end_address - prop->dram_user_base_address, + -1); + + if (rc) { + dev_err(hdev->dev, + "Failed to add memory to dram page pool %d\n", rc); + goto pool_add_err; + } + + spin_lock_init(&vm->idr_lock); + idr_init(&vm->phys_pg_pack_handles); + + atomic64_set(&hdev->dram_used_mem, 0); + + vm->init_done = true; + + return 0; + +pool_add_err: + gen_pool_destroy(vm->dram_pg_pool); + + return rc; +} + +/** + * hl_vm_fini() - virtual memory module teardown. + * @hdev: pointer to the habanalabs device structure. + * + * This function perform teardown to the following: + * - Idr for device memory allocation handles. + * - DRAM physical pages pool of 2MB. + * - MMU module. + */ +void hl_vm_fini(struct hl_device *hdev) +{ + struct hl_vm *vm = &hdev->vm; + + if (!vm->init_done) + return; + + /* + * At this point all the contexts should be freed and hence no DRAM + * memory should be in use. Hence the DRAM pool should be freed here. + */ + if (kref_put(&vm->dram_pg_pool_refcount, dram_pg_pool_do_release) != 1) + dev_warn(hdev->dev, "dram_pg_pool was not destroyed on %s\n", + __func__); + + vm->init_done = false; +} + +/** + * hl_hw_block_mem_init() - HW block memory initialization. + * @ctx: pointer to the habanalabs context structure. + * + * This function initializes the HW block virtual mapped addresses list and + * it's lock. + */ +void hl_hw_block_mem_init(struct hl_ctx *ctx) +{ + mutex_init(&ctx->hw_block_list_lock); + INIT_LIST_HEAD(&ctx->hw_block_mem_list); +} + +/** + * hl_hw_block_mem_fini() - HW block memory teardown. + * @ctx: pointer to the habanalabs context structure. + * + * This function clears the HW block virtual mapped addresses list and destroys + * it's lock. + */ +void hl_hw_block_mem_fini(struct hl_ctx *ctx) +{ + struct hl_vm_hw_block_list_node *lnode, *tmp; + + if (!list_empty(&ctx->hw_block_mem_list)) + dev_crit(ctx->hdev->dev, "HW block mem list isn't empty\n"); + + list_for_each_entry_safe(lnode, tmp, &ctx->hw_block_mem_list, node) { + list_del(&lnode->node); + kfree(lnode); + } + + mutex_destroy(&ctx->hw_block_list_lock); +} diff --git a/drivers/misc/habanalabs/common/memory_mgr.c b/drivers/misc/habanalabs/common/memory_mgr.c new file mode 100644 index 000000000..1936d6536 --- /dev/null +++ b/drivers/misc/habanalabs/common/memory_mgr.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2022 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +/** + * hl_mmap_mem_buf_get - increase the buffer refcount and return a pointer to + * the buffer descriptor. + * + * @mmg: parent unified memory manager + * @handle: requested buffer handle + * + * Find the buffer in the store and return a pointer to its descriptor. + * Increase buffer refcount. If not found - return NULL. 
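The handle mentioned above doubles as an mmap offset: the IDR id (or'ed with a HL_MMAP_TYPE_* bit in mem_id) is shifted left by PAGE_SHIFT when the buffer is created, and every lookup shifts it back down and truncates to 32 bits. A stand-alone sketch of that encoding; the page shift and the type bit position are assumptions for illustration, not the driver's real constants.

#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SHIFT 12ULL			/* example: 4 KB pages */
#define EX_TYPE_BITS  (1ULL << 47)		/* hypothetical HL_MMAP_TYPE_* bit */

/* Mirrors buf->handle = ((u64)id | mem_id) << PAGE_SHIFT from the alloc path. */
static uint64_t encode_handle(uint32_t idr_id, uint64_t type_bits)
{
	return ((uint64_t)idr_id | type_bits) << EX_PAGE_SHIFT;
}

/* Mirrors lower_32_bits(handle >> PAGE_SHIFT) used by the lookups. */
static uint32_t decode_idr_key(uint64_t handle)
{
	return (uint32_t)(handle >> EX_PAGE_SHIFT);
}

int main(void)
{
	uint64_t handle = encode_handle(7, EX_TYPE_BITS);

	/* the type bit is assumed to sit above bit 31 of the shifted value,
	 * so truncating to 32 bits recovers the bare IDR id (7 here)
	 */
	printf("handle=%#llx idr_key=%u\n",
	       (unsigned long long)handle, decode_idr_key(handle));
	return 0;
}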
+ */ +struct hl_mmap_mem_buf *hl_mmap_mem_buf_get(struct hl_mem_mgr *mmg, u64 handle) +{ + struct hl_mmap_mem_buf *buf; + + spin_lock(&mmg->lock); + buf = idr_find(&mmg->handles, lower_32_bits(handle >> PAGE_SHIFT)); + if (!buf) { + spin_unlock(&mmg->lock); + dev_warn(mmg->dev, + "Buff get failed, no match to handle %#llx\n", handle); + return NULL; + } + kref_get(&buf->refcount); + spin_unlock(&mmg->lock); + return buf; +} + +/** + * hl_mmap_mem_buf_destroy - destroy the unused buffer + * + * @buf: memory manager buffer descriptor + * + * Internal function, used as a final step of buffer release. Shall be invoked + * only when the buffer is no longer in use (removed from idr). Will call the + * release callback (if applicable), and free the memory. + */ +static void hl_mmap_mem_buf_destroy(struct hl_mmap_mem_buf *buf) +{ + if (buf->behavior->release) + buf->behavior->release(buf); + + kfree(buf); +} + +/** + * hl_mmap_mem_buf_release - release buffer + * + * @kref: kref that reached 0. + * + * Internal function, used as a kref release callback, when the last user of + * the buffer is released. Shall be called from an interrupt context. + */ +static void hl_mmap_mem_buf_release(struct kref *kref) +{ + struct hl_mmap_mem_buf *buf = + container_of(kref, struct hl_mmap_mem_buf, refcount); + + spin_lock(&buf->mmg->lock); + idr_remove(&buf->mmg->handles, lower_32_bits(buf->handle >> PAGE_SHIFT)); + spin_unlock(&buf->mmg->lock); + + hl_mmap_mem_buf_destroy(buf); +} + +/** + * hl_mmap_mem_buf_remove_idr_locked - remove handle from idr + * + * @kref: kref that reached 0. + * + * Internal function, used for kref put by handle. Assumes mmg lock is taken. + * Will remove the buffer from idr, without destroying it. + */ +static void hl_mmap_mem_buf_remove_idr_locked(struct kref *kref) +{ + struct hl_mmap_mem_buf *buf = + container_of(kref, struct hl_mmap_mem_buf, refcount); + + idr_remove(&buf->mmg->handles, lower_32_bits(buf->handle >> PAGE_SHIFT)); +} + +/** + * hl_mmap_mem_buf_put - decrease the reference to the buffer + * + * @buf: memory manager buffer descriptor + * + * Decrease the reference to the buffer, and release it if it was the last one. + * Shall be called from an interrupt context. + */ +int hl_mmap_mem_buf_put(struct hl_mmap_mem_buf *buf) +{ + return kref_put(&buf->refcount, hl_mmap_mem_buf_release); +} + +/** + * hl_mmap_mem_buf_put_handle - decrease the reference to the buffer with the + * given handle. + * + * @mmg: parent unified memory manager + * @handle: requested buffer handle + * + * Decrease the reference to the buffer, and release it if it was the last one. + * Shall not be called from an interrupt context. Return -EINVAL if handle was + * not found, else return the put outcome (0 or 1). 
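For callers that hold a buffer pointer rather than a handle, the get/put pair above is the whole contract: every successful lookup must be balanced by a put. A minimal consumer sketch, where mmg, my_handle and the work done on the buffer are placeholders:

/* Hypothetical consumer; assumes a valid struct hl_mem_mgr *mmg and handle. */
static int use_buffer_once(struct hl_mem_mgr *mmg, u64 my_handle)
{
	struct hl_mmap_mem_buf *buf;

	buf = hl_mmap_mem_buf_get(mmg, my_handle);	/* takes a reference */
	if (!buf)
		return -EINVAL;

	/* ... use buf->private / buf->mappable_size here ... */

	hl_mmap_mem_buf_put(buf);			/* drops the reference */
	return 0;
}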
+ */ +int hl_mmap_mem_buf_put_handle(struct hl_mem_mgr *mmg, u64 handle) +{ + struct hl_mmap_mem_buf *buf; + + spin_lock(&mmg->lock); + buf = idr_find(&mmg->handles, lower_32_bits(handle >> PAGE_SHIFT)); + if (!buf) { + spin_unlock(&mmg->lock); + dev_dbg(mmg->dev, + "Buff put failed, no match to handle %#llx\n", handle); + return -EINVAL; + } + + if (kref_put(&buf->refcount, hl_mmap_mem_buf_remove_idr_locked)) { + spin_unlock(&mmg->lock); + hl_mmap_mem_buf_destroy(buf); + return 1; + } + + spin_unlock(&mmg->lock); + return 0; +} + +/** + * hl_mmap_mem_buf_alloc - allocate a new mappable buffer + * + * @mmg: parent unified memory manager + * @behavior: behavior object describing this buffer polymorphic behavior + * @gfp: gfp flags to use for the memory allocations + * @args: additional args passed to behavior->alloc + * + * Allocate and register a new memory buffer inside the give memory manager. + * Return the pointer to the new buffer on success or NULL on failure. + */ +struct hl_mmap_mem_buf * +hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg, + struct hl_mmap_mem_buf_behavior *behavior, gfp_t gfp, + void *args) +{ + struct hl_mmap_mem_buf *buf; + int rc; + + buf = kzalloc(sizeof(*buf), gfp); + if (!buf) + return NULL; + + spin_lock(&mmg->lock); + rc = idr_alloc(&mmg->handles, buf, 1, 0, GFP_ATOMIC); + spin_unlock(&mmg->lock); + if (rc < 0) { + dev_err(mmg->dev, + "%s: Failed to allocate IDR for a new buffer, rc=%d\n", + behavior->topic, rc); + goto free_buf; + } + + buf->mmg = mmg; + buf->behavior = behavior; + buf->handle = (((u64)rc | buf->behavior->mem_id) << PAGE_SHIFT); + kref_init(&buf->refcount); + + rc = buf->behavior->alloc(buf, gfp, args); + if (rc) { + dev_err(mmg->dev, "%s: Failure in buffer alloc callback %d\n", + behavior->topic, rc); + goto remove_idr; + } + + return buf; + +remove_idr: + spin_lock(&mmg->lock); + idr_remove(&mmg->handles, lower_32_bits(buf->handle >> PAGE_SHIFT)); + spin_unlock(&mmg->lock); +free_buf: + kfree(buf); + return NULL; +} + +/** + * hl_mmap_mem_buf_vm_close - handle mmap close + * + * @vma: the vma object for which mmap was closed. + * + * Put the memory buffer if it is no longer mapped. + */ +static void hl_mmap_mem_buf_vm_close(struct vm_area_struct *vma) +{ + struct hl_mmap_mem_buf *buf = + (struct hl_mmap_mem_buf *)vma->vm_private_data; + long new_mmap_size; + + new_mmap_size = buf->real_mapped_size - (vma->vm_end - vma->vm_start); + + if (new_mmap_size > 0) { + buf->real_mapped_size = new_mmap_size; + return; + } + + atomic_set(&buf->mmap, 0); + hl_mmap_mem_buf_put(buf); + vma->vm_private_data = NULL; +} + +static const struct vm_operations_struct hl_mmap_mem_buf_vm_ops = { + .close = hl_mmap_mem_buf_vm_close +}; + +/** + * hl_mem_mgr_mmap - map the given buffer to the user + * + * @mmg: unified memory manager + * @vma: the vma object for which mmap was closed. + * @args: additional args passed to behavior->mmap + * + * Map the buffer specified by the vma->vm_pgoff to the given vma. 
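Everything mmap-able through this manager is described by a behavior object like the timestamp-buffer one shown earlier; a new consumer only supplies alloc/mmap/release callbacks. A hedged sketch of such a behavior is shown below; the names, the size argument and the reuse of HL_MMAP_TYPE_TS_BUFF as the mem_id are illustrative only, a real user would need its own type id.

/* Hypothetical behavior: a single vmalloc'ed, user-mappable buffer. */
static int my_buf_alloc(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args)
{
	size_t size = *(size_t *)args;

	buf->private = vmalloc_user(size);
	if (!buf->private)
		return -ENOMEM;

	buf->mappable_size = size;
	return 0;
}

static int my_buf_mmap(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, void *args)
{
	return remap_vmalloc_range(vma, buf->private, 0);
}

static void my_buf_release(struct hl_mmap_mem_buf *buf)
{
	vfree(buf->private);
}

static struct hl_mmap_mem_buf_behavior my_behavior = {
	.topic = "EXAMPLE",
	.mem_id = HL_MMAP_TYPE_TS_BUFF,	/* placeholder id for the sketch */
	.alloc = my_buf_alloc,
	.mmap = my_buf_mmap,
	.release = my_buf_release,
};

Such a behavior would then be instantiated with hl_mmap_mem_buf_alloc(mmg, &my_behavior, GFP_KERNEL, &size), exactly as allocate_timestamps_buffers() does for the timestamp buffers.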
+ */ +int hl_mem_mgr_mmap(struct hl_mem_mgr *mmg, struct vm_area_struct *vma, + void *args) +{ + struct hl_mmap_mem_buf *buf; + u64 user_mem_size; + u64 handle; + int rc; + + /* We use the page offset to hold the idr and thus we need to clear + * it before doing the mmap itself + */ + handle = vma->vm_pgoff << PAGE_SHIFT; + vma->vm_pgoff = 0; + + /* Reference was taken here */ + buf = hl_mmap_mem_buf_get(mmg, handle); + if (!buf) { + dev_err(mmg->dev, + "Memory mmap failed, no match to handle %#llx\n", handle); + return -EINVAL; + } + + /* Validation check */ + user_mem_size = vma->vm_end - vma->vm_start; + if (user_mem_size != ALIGN(buf->mappable_size, PAGE_SIZE)) { + dev_err(mmg->dev, + "%s: Memory mmap failed, mmap VM size 0x%llx != 0x%llx allocated physical mem size\n", + buf->behavior->topic, user_mem_size, buf->mappable_size); + rc = -EINVAL; + goto put_mem; + } + +#ifdef _HAS_TYPE_ARG_IN_ACCESS_OK + if (!access_ok(VERIFY_WRITE, (void __user *)(uintptr_t)vma->vm_start, + user_mem_size)) { +#else + if (!access_ok((void __user *)(uintptr_t)vma->vm_start, + user_mem_size)) { +#endif + dev_err(mmg->dev, "%s: User pointer is invalid - 0x%lx\n", + buf->behavior->topic, vma->vm_start); + + rc = -EINVAL; + goto put_mem; + } + + if (atomic_cmpxchg(&buf->mmap, 0, 1)) { + dev_err(mmg->dev, + "%s, Memory mmap failed, already mmaped to user\n", + buf->behavior->topic); + rc = -EINVAL; + goto put_mem; + } + + vma->vm_ops = &hl_mmap_mem_buf_vm_ops; + + /* Note: We're transferring the memory reference to vma->vm_private_data here. */ + + vma->vm_private_data = buf; + + rc = buf->behavior->mmap(buf, vma, args); + if (rc) { + atomic_set(&buf->mmap, 0); + goto put_mem; + } + + buf->real_mapped_size = buf->mappable_size; + vma->vm_pgoff = handle >> PAGE_SHIFT; + + return 0; + +put_mem: + hl_mmap_mem_buf_put(buf); + return rc; +} + +/** + * hl_mem_mgr_init - initialize unified memory manager + * + * @dev: owner device pointer + * @mmg: structure to initialize + * + * Initialize an instance of unified memory manager + */ +void hl_mem_mgr_init(struct device *dev, struct hl_mem_mgr *mmg) +{ + mmg->dev = dev; + spin_lock_init(&mmg->lock); + idr_init(&mmg->handles); +} + +/** + * hl_mem_mgr_fini - release unified memory manager + * + * @mmg: parent unified memory manager + * + * Release the unified memory manager. Shall be called from an interrupt context. + */ +void hl_mem_mgr_fini(struct hl_mem_mgr *mmg) +{ + struct hl_mmap_mem_buf *buf; + struct idr *idp; + const char *topic; + u32 id; + + idp = &mmg->handles; + + idr_for_each_entry(idp, buf, id) { + topic = buf->behavior->topic; + if (hl_mmap_mem_buf_put(buf) != 1) + dev_err(mmg->dev, + "%s: Buff handle %u for CTX is still alive\n", + topic, id); + } + + /* TODO: can it happen that some buffer is still in use at this point? */ + + idr_destroy(&mmg->handles); +} diff --git a/drivers/misc/habanalabs/common/mmu/Makefile b/drivers/misc/habanalabs/common/mmu/Makefile new file mode 100644 index 000000000..1806c524e --- /dev/null +++ b/drivers/misc/habanalabs/common/mmu/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +HL_COMMON_MMU_FILES := common/mmu/mmu.o common/mmu/mmu_v1.o \ + common/mmu/mmu_v2_hr.o diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c new file mode 100644 index 000000000..cf8946266 --- /dev/null +++ b/drivers/misc/habanalabs/common/mmu/mmu.c @@ -0,0 +1,1246 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2022 HabanaLabs, Ltd. + * All Rights Reserved. 
+ */ + +#include <linux/slab.h> + +#include "../habanalabs.h" + +#include <trace/events/habanalabs.h> + +/** + * hl_mmu_get_funcs() - get MMU functions structure + * @hdev: habanalabs device structure. + * @pgt_residency: page table residency. + * @is_dram_addr: true if we need HMMU functions + * + * @return appropriate MMU functions structure + */ +static struct hl_mmu_funcs *hl_mmu_get_funcs(struct hl_device *hdev, int pgt_residency, + bool is_dram_addr) +{ + return &hdev->mmu_func[pgt_residency]; +} + +bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + + return hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size, + prop->dmmu.start_addr, + prop->dmmu.end_addr); +} + +/** + * hl_mmu_init() - initialize the MMU module. + * @hdev: habanalabs device structure. + * + * Return: 0 for success, non-zero for failure. + */ +int hl_mmu_init(struct hl_device *hdev) +{ + int rc = -EOPNOTSUPP; + + if (!hdev->mmu_enable) + return 0; + + mutex_init(&hdev->mmu_lock); + + if (hdev->mmu_func[MMU_DR_PGT].init != NULL) { + rc = hdev->mmu_func[MMU_DR_PGT].init(hdev); + if (rc) + return rc; + } + + if (hdev->mmu_func[MMU_HR_PGT].init != NULL) { + rc = hdev->mmu_func[MMU_HR_PGT].init(hdev); + if (rc) + goto fini_dr_mmu; + } + + return 0; + +fini_dr_mmu: + if (hdev->mmu_func[MMU_DR_PGT].fini != NULL) + hdev->mmu_func[MMU_DR_PGT].fini(hdev); + + return rc; +} + +/** + * hl_mmu_fini() - release the MMU module. + * @hdev: habanalabs device structure. + * + * This function does the following: + * - Disable MMU in H/W. + * - Free the pgt_infos pool. + * + * All contexts should be freed before calling this function. + */ +void hl_mmu_fini(struct hl_device *hdev) +{ + if (!hdev->mmu_enable) + return; + + if (hdev->mmu_func[MMU_DR_PGT].fini != NULL) + hdev->mmu_func[MMU_DR_PGT].fini(hdev); + + if (hdev->mmu_func[MMU_HR_PGT].fini != NULL) + hdev->mmu_func[MMU_HR_PGT].fini(hdev); + + mutex_destroy(&hdev->mmu_lock); +} + +/** + * hl_mmu_ctx_init() - initialize a context for using the MMU module. + * @ctx: pointer to the context structure to initialize. + * + * Initialize a mutex to protect the concurrent mapping flow, a hash to hold all + * page tables hops related to this context. + * Return: 0 on success, non-zero otherwise. 
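Throughout this file the driver keeps two sets of page-table callbacks and dispatches on residency: device-resident (DR) tables for ASICs whose MMU walks device memory, host-resident (HR) tables otherwise. The map/unmap helpers further down repeat the same selection; a condensed sketch of it, using only fields that appear in this file:

/* Sketch of the residency dispatch used by hl_mmu_map_page()/hl_mmu_unmap_page(). */
static struct hl_mmu_funcs *pick_mmu_funcs(struct hl_device *hdev,
					   struct hl_mmu_properties *mmu_prop)
{
	int pgt_residency = mmu_prop->host_resident ? MMU_HR_PGT : MMU_DR_PGT;

	/* hl_mmu_get_funcs() above boils down to this array lookup */
	return &hdev->mmu_func[pgt_residency];
}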
+ */ +int hl_mmu_ctx_init(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + int rc = -EOPNOTSUPP; + + if (!hdev->mmu_enable) + return 0; + + if (hdev->mmu_func[MMU_DR_PGT].ctx_init != NULL) { + rc = hdev->mmu_func[MMU_DR_PGT].ctx_init(ctx); + if (rc) + return rc; + } + + if (hdev->mmu_func[MMU_HR_PGT].ctx_init != NULL) { + rc = hdev->mmu_func[MMU_HR_PGT].ctx_init(ctx); + if (rc) + goto fini_dr_ctx; + } + + return 0; + +fini_dr_ctx: + if (hdev->mmu_func[MMU_DR_PGT].fini != NULL) + hdev->mmu_func[MMU_DR_PGT].fini(hdev); + + return rc; +} + +/* + * hl_mmu_ctx_fini - disable a ctx from using the mmu module + * + * @ctx: pointer to the context structure + * + * This function does the following: + * - Free any pgts which were not freed yet + * - Free the mutex + * - Free DRAM default page mapping hops + */ +void hl_mmu_ctx_fini(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + + if (!hdev->mmu_enable) + return; + + if (hdev->mmu_func[MMU_DR_PGT].ctx_fini != NULL) + hdev->mmu_func[MMU_DR_PGT].ctx_fini(ctx); + + if (hdev->mmu_func[MMU_HR_PGT].ctx_fini != NULL) + hdev->mmu_func[MMU_HR_PGT].ctx_fini(ctx); +} + +/* + * hl_mmu_get_real_page_size - get real page size to use in map/unmap operation + * + * @hdev: pointer to device data. + * @mmu_prop: MMU properties. + * @page_size: page size + * @real_page_size: set here the actual page size to use for the operation + * @is_dram_addr: true if DRAM address, otherwise false. + * + * @return 0 on success, otherwise non 0 error code + * + * note that this is general implementation that can fit most MMU arch. but as this is used as an + * MMU function: + * 1. it shall not be called directly- only from mmu_func structure instance + * 2. each MMU may modify the implementation internally + */ +int hl_mmu_get_real_page_size(struct hl_device *hdev, struct hl_mmu_properties *mmu_prop, + u32 page_size, u32 *real_page_size, bool is_dram_addr) +{ + /* + * The H/W handles mapping of specific page sizes. Hence if the page + * size is bigger, we break it to sub-pages and map them separately. + */ + if ((page_size % mmu_prop->page_size) == 0) { + *real_page_size = mmu_prop->page_size; + return 0; + } + + dev_err(hdev->dev, "page size of %u is not %uKB aligned, can't map\n", + page_size, mmu_prop->page_size >> 10); + + return -EFAULT; +} + +static struct hl_mmu_properties *hl_mmu_get_prop(struct hl_device *hdev, u32 page_size, + bool is_dram_addr) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + + if (is_dram_addr) + return &prop->dmmu; + else if ((page_size % prop->pmmu_huge.page_size) == 0) + return &prop->pmmu_huge; + + return &prop->pmmu; +} + +/* + * hl_mmu_unmap_page - unmaps a virtual addr + * + * @ctx: pointer to the context structure + * @virt_addr: virt addr to map from + * @page_size: size of the page to unmap + * @flush_pte: whether to do a PCI flush + * + * This function does the following: + * - Check that the virt addr is mapped + * - Unmap the virt addr and frees pgts if possible + * - Returns 0 on success, -EINVAL if the given addr is not mapped + * + * Because this function changes the page tables in the device and because it + * changes the MMU hash, it must be protected by a lock. + * However, because it maps only a single page, the lock should be implemented + * in a higher level in order to protect the entire mapping of the memory area + * + * For optimization reasons PCI flush may be requested once after unmapping of + * large area. 
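Before any map or unmap, hl_mmu_get_real_page_size() reduces the requested page size to what the H/W supports: an exact multiple is split into that many sub-pages, anything else is rejected. A small worked example of that split, with illustrative sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t mmu_page_size = 0x1000;	/* 4 KB H/W page */
	uint32_t page_size = 0x200000;		/* caller asks to map 2 MB at once */

	if (page_size % mmu_page_size) {
		printf("not %u KB aligned, can't map\n", mmu_page_size >> 10);
		return 1;
	}

	/* the 2 MB request becomes 512 consecutive 4 KB map operations */
	printf("npages=%u\n", page_size / mmu_page_size);
	return 0;
}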
+ */ +int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, bool flush_pte) +{ + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + struct hl_mmu_funcs *mmu_funcs; + int i, pgt_residency, rc = 0; + u32 real_page_size, npages; + u64 real_virt_addr; + bool is_dram_addr; + + if (!hdev->mmu_enable) + return 0; + + is_dram_addr = hl_is_dram_va(hdev, virt_addr); + mmu_prop = hl_mmu_get_prop(hdev, page_size, is_dram_addr); + + pgt_residency = mmu_prop->host_resident ? MMU_HR_PGT : MMU_DR_PGT; + mmu_funcs = hl_mmu_get_funcs(hdev, pgt_residency, is_dram_addr); + + rc = hdev->asic_funcs->mmu_get_real_page_size(hdev, mmu_prop, page_size, &real_page_size, + is_dram_addr); + if (rc) + return rc; + + npages = page_size / real_page_size; + real_virt_addr = virt_addr; + + for (i = 0 ; i < npages ; i++) { + rc = mmu_funcs->unmap(ctx, real_virt_addr, is_dram_addr); + if (rc) + break; + + real_virt_addr += real_page_size; + } + + if (flush_pte) + mmu_funcs->flush(ctx); + + if (trace_habanalabs_mmu_unmap_enabled() && !rc) + trace_habanalabs_mmu_unmap(hdev->dev, virt_addr, 0, page_size, flush_pte); + + return rc; +} + +/* + * hl_mmu_map_page - maps a virtual addr to physical addr + * + * @ctx: pointer to the context structure + * @virt_addr: virt addr to map from + * @phys_addr: phys addr to map to + * @page_size: physical page size + * @flush_pte: whether to do a PCI flush + * + * This function does the following: + * - Check that the virt addr is not mapped + * - Allocate pgts as necessary in order to map the virt addr to the phys + * - Returns 0 on success, -EINVAL if addr is already mapped, or -ENOMEM. + * + * Because this function changes the page tables in the device and because it + * changes the MMU hash, it must be protected by a lock. + * However, because it maps only a single page, the lock should be implemented + * in a higher level in order to protect the entire mapping of the memory area + * + * For optimization reasons PCI flush may be requested once after mapping of + * large area. + */ +int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size, + bool flush_pte) +{ + int i, rc, pgt_residency, mapped_cnt = 0; + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + u64 real_virt_addr, real_phys_addr; + struct hl_mmu_funcs *mmu_funcs; + u32 real_page_size, npages; + bool is_dram_addr; + + + if (!hdev->mmu_enable) + return 0; + + is_dram_addr = hl_is_dram_va(hdev, virt_addr); + mmu_prop = hl_mmu_get_prop(hdev, page_size, is_dram_addr); + + pgt_residency = mmu_prop->host_resident ? MMU_HR_PGT : MMU_DR_PGT; + mmu_funcs = hl_mmu_get_funcs(hdev, pgt_residency, is_dram_addr); + + rc = hdev->asic_funcs->mmu_get_real_page_size(hdev, mmu_prop, page_size, &real_page_size, + is_dram_addr); + if (rc) + return rc; + + /* + * Verify that the phys and virt addresses are aligned with the + * MMU page size (in dram this means checking the address and MMU + * after scrambling) + */ + if ((is_dram_addr && + ((hdev->asic_funcs->scramble_addr(hdev, phys_addr) & + (mmu_prop->page_size - 1)) || + (hdev->asic_funcs->scramble_addr(hdev, virt_addr) & + (mmu_prop->page_size - 1)))) || + (!is_dram_addr && ((phys_addr & (real_page_size - 1)) || + (virt_addr & (real_page_size - 1))))) + dev_crit(hdev->dev, + "Mapping address 0x%llx with virtual address 0x%llx and page size of 0x%x is erroneous! 
Addresses must be divisible by page size", + phys_addr, virt_addr, real_page_size); + + npages = page_size / real_page_size; + real_virt_addr = virt_addr; + real_phys_addr = phys_addr; + + for (i = 0 ; i < npages ; i++) { + rc = mmu_funcs->map(ctx, real_virt_addr, real_phys_addr, real_page_size, + is_dram_addr); + if (rc) + goto err; + + real_virt_addr += real_page_size; + real_phys_addr += real_page_size; + mapped_cnt++; + } + + if (flush_pte) + mmu_funcs->flush(ctx); + + trace_habanalabs_mmu_map(hdev->dev, virt_addr, phys_addr, page_size, flush_pte); + + return 0; + +err: + real_virt_addr = virt_addr; + for (i = 0 ; i < mapped_cnt ; i++) { + if (mmu_funcs->unmap(ctx, real_virt_addr, is_dram_addr)) + dev_warn_ratelimited(hdev->dev, + "failed to unmap va: 0x%llx\n", real_virt_addr); + + real_virt_addr += real_page_size; + } + + mmu_funcs->flush(ctx); + + return rc; +} + +/* + * hl_mmu_map_contiguous - implements a wrapper for hl_mmu_map_page + * for mapping contiguous physical memory + * + * @ctx: pointer to the context structure + * @virt_addr: virt addr to map from + * @phys_addr: phys addr to map to + * @size: size to map + * + */ +int hl_mmu_map_contiguous(struct hl_ctx *ctx, u64 virt_addr, + u64 phys_addr, u32 size) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 curr_va, curr_pa; + u32 page_size; + bool flush_pte; + int rc = 0, off; + + if (hl_mem_area_inside_range(virt_addr, size, + prop->dmmu.start_addr, prop->dmmu.end_addr)) + page_size = prop->dmmu.page_size; + else if (hl_mem_area_inside_range(virt_addr, size, + prop->pmmu.start_addr, prop->pmmu.end_addr)) + page_size = prop->pmmu.page_size; + else if (hl_mem_area_inside_range(virt_addr, size, + prop->pmmu_huge.start_addr, prop->pmmu_huge.end_addr)) + page_size = prop->pmmu_huge.page_size; + else + return -EINVAL; + + for (off = 0 ; off < size ; off += page_size) { + curr_va = virt_addr + off; + curr_pa = phys_addr + off; + flush_pte = (off + page_size) >= size; + rc = hl_mmu_map_page(ctx, curr_va, curr_pa, page_size, + flush_pte); + if (rc) { + dev_err(hdev->dev, + "Map failed for va 0x%llx to pa 0x%llx\n", + curr_va, curr_pa); + /* last mapping failed so don't try to unmap it - reduce off by page_size */ + off -= page_size; + goto unmap; + } + } + + return rc; + +unmap: + for (; off >= 0 ; off -= page_size) { + curr_va = virt_addr + off; + flush_pte = (off - (s32) page_size) < 0; + if (hl_mmu_unmap_page(ctx, curr_va, page_size, flush_pte)) + dev_warn_ratelimited(hdev->dev, + "failed to unmap va 0x%llx\n", curr_va); + } + + return rc; +} + +/* + * hl_mmu_unmap_contiguous - implements a wrapper for hl_mmu_unmap_page + * for unmapping contiguous physical memory + * + * @ctx: pointer to the context structure + * @virt_addr: virt addr to unmap + * @size: size to unmap + * + */ +int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 curr_va; + u32 page_size; + bool flush_pte; + int rc = 0, off; + + if (hl_mem_area_inside_range(virt_addr, size, + prop->dmmu.start_addr, prop->dmmu.end_addr)) + page_size = prop->dmmu.page_size; + else if (hl_mem_area_inside_range(virt_addr, size, + prop->pmmu.start_addr, prop->pmmu.end_addr)) + page_size = prop->pmmu.page_size; + else if (hl_mem_area_inside_range(virt_addr, size, + prop->pmmu_huge.start_addr, prop->pmmu_huge.end_addr)) + page_size = prop->pmmu_huge.page_size; + else + return -EINVAL; + + for (off = 0 ; off < 
size ; off += page_size) { + curr_va = virt_addr + off; + flush_pte = (off + page_size) >= size; + rc = hl_mmu_unmap_page(ctx, curr_va, page_size, flush_pte); + if (rc) + dev_warn_ratelimited(hdev->dev, + "Unmap failed for va 0x%llx\n", curr_va); + } + + return rc; +} + +/* + * hl_mmu_swap_out - marks all mapping of the given ctx as swapped out + * + * @ctx: pointer to the context structure + * + */ +void hl_mmu_swap_out(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + + if (!hdev->mmu_enable) + return; + + if (hdev->mmu_func[MMU_DR_PGT].swap_out != NULL) + hdev->mmu_func[MMU_DR_PGT].swap_out(ctx); + + if (hdev->mmu_func[MMU_HR_PGT].swap_out != NULL) + hdev->mmu_func[MMU_HR_PGT].swap_out(ctx); +} + +/* + * hl_mmu_swap_in - marks all mapping of the given ctx as swapped in + * + * @ctx: pointer to the context structure + * + */ +void hl_mmu_swap_in(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + + if (!hdev->mmu_enable) + return; + + if (hdev->mmu_func[MMU_DR_PGT].swap_in != NULL) + hdev->mmu_func[MMU_DR_PGT].swap_in(ctx); + + if (hdev->mmu_func[MMU_HR_PGT].swap_in != NULL) + hdev->mmu_func[MMU_HR_PGT].swap_in(ctx); +} + +static void hl_mmu_pa_page_with_offset(struct hl_ctx *ctx, u64 virt_addr, + struct hl_mmu_hop_info *hops, + u64 *phys_addr) +{ + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + u64 offset_mask, addr_mask, hop_shift, tmp_phys_addr; + struct hl_mmu_properties *mmu_prop; + + /* last hop holds the phys address and flags */ + if (hops->unscrambled_paddr) + tmp_phys_addr = hops->unscrambled_paddr; + else + tmp_phys_addr = hops->hop_info[hops->used_hops - 1].hop_pte_val; + + if (hops->range_type == HL_VA_RANGE_TYPE_HOST_HUGE) + mmu_prop = &prop->pmmu_huge; + else if (hops->range_type == HL_VA_RANGE_TYPE_HOST) + mmu_prop = &prop->pmmu; + else /* HL_VA_RANGE_TYPE_DRAM */ + mmu_prop = &prop->dmmu; + + if ((hops->range_type == HL_VA_RANGE_TYPE_DRAM) && + !is_power_of_2(prop->dram_page_size)) { + u64 dram_page_size, dram_base, abs_phys_addr, abs_virt_addr, + page_id, page_start; + u32 page_off; + + /* + * Bit arithmetics cannot be used for non power of two page + * sizes. In addition, since bit arithmetics is not used, + * we cannot ignore dram base. All that shall be considered. + */ + + dram_page_size = prop->dram_page_size; + dram_base = prop->dram_base_address; + abs_phys_addr = tmp_phys_addr - dram_base; + abs_virt_addr = virt_addr - dram_base; + page_id = DIV_ROUND_DOWN_ULL(abs_phys_addr, dram_page_size); + page_start = page_id * dram_page_size; + div_u64_rem(abs_virt_addr, dram_page_size, &page_off); + + *phys_addr = page_start + page_off + dram_base; + } else { + /* + * find the correct hop shift field in hl_mmu_properties + * structure in order to determine the right masks + * for the page offset. 
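For a DRAM page size that is not a power of two, the translation in hl_mmu_pa_page_with_offset() cannot use a mask, so it falls back to division and remainder relative to the DRAM base, as shown above. A worked example with an illustrative 3 MB page size and made-up addresses:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t dram_base = 0x20000000ULL;
	uint64_t dram_page_size = 0x300000ULL;		/* 3 MB: not a power of two */
	uint64_t pte_paddr = 0x20C00000ULL;		/* physical page from the last hop */
	uint64_t virt_addr = 0x20A12345ULL;		/* VA being translated */

	uint64_t abs_phys = pte_paddr - dram_base;	/* 0xC00000 */
	uint64_t abs_virt = virt_addr - dram_base;	/* 0xA12345 */
	uint64_t page_id = abs_phys / dram_page_size;	/* 4 */
	uint64_t page_start = page_id * dram_page_size;	/* 0xC00000 */
	uint64_t page_off = abs_virt % dram_page_size;	/* 0x112345 */

	/* prints pa=0x20d12345 */
	printf("pa=%#llx\n",
	       (unsigned long long)(page_start + page_off + dram_base));
	return 0;
}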
+ */ + hop_shift = mmu_prop->hop_shifts[hops->used_hops - 1]; + offset_mask = (1ull << hop_shift) - 1; + addr_mask = ~(offset_mask); + *phys_addr = (tmp_phys_addr & addr_mask) | + (virt_addr & offset_mask); + } +} + +int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr) +{ + struct hl_mmu_hop_info hops; + int rc; + + memset(&hops, 0, sizeof(hops)); + + rc = hl_mmu_get_tlb_info(ctx, virt_addr, &hops); + if (rc) + return rc; + + hl_mmu_pa_page_with_offset(ctx, virt_addr, &hops, phys_addr); + + return 0; +} + +int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, + struct hl_mmu_hop_info *hops) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop; + struct hl_mmu_properties *mmu_prop; + struct hl_mmu_funcs *mmu_funcs; + int pgt_residency, rc; + bool is_dram_addr; + + if (!hdev->mmu_enable) + return -EOPNOTSUPP; + + prop = &hdev->asic_prop; + hops->scrambled_vaddr = virt_addr; /* assume no scrambling */ + + is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size, + prop->dmmu.start_addr, + prop->dmmu.end_addr); + + /* host-residency is the same in PMMU and PMMU huge, no need to distinguish here */ + mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu; + pgt_residency = mmu_prop->host_resident ? MMU_HR_PGT : MMU_DR_PGT; + mmu_funcs = hl_mmu_get_funcs(hdev, pgt_residency, is_dram_addr); + + mutex_lock(&hdev->mmu_lock); + rc = mmu_funcs->get_tlb_info(ctx, virt_addr, hops); + mutex_unlock(&hdev->mmu_lock); + + if (rc) + return rc; + + /* add page offset to physical address */ + if (hops->unscrambled_paddr) + hl_mmu_pa_page_with_offset(ctx, virt_addr, hops, &hops->unscrambled_paddr); + + return 0; +} + +int hl_mmu_if_set_funcs(struct hl_device *hdev) +{ + if (!hdev->mmu_enable) + return 0; + + switch (hdev->asic_type) { + case ASIC_GOYA: + case ASIC_GAUDI: + case ASIC_GAUDI_SEC: + hl_mmu_v1_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]); + break; + case ASIC_GAUDI2: + case ASIC_GAUDI2_SEC: + /* MMUs in Gaudi2 are always host resident */ + hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); + break; + default: + dev_err(hdev->dev, "Unrecognized ASIC type %d\n", + hdev->asic_type); + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * hl_mmu_scramble_addr() - The generic mmu address scrambling routine. + * @hdev: pointer to device data. + * @addr: The address to scramble. + * + * Return: The scrambled address. + */ +u64 hl_mmu_scramble_addr(struct hl_device *hdev, u64 addr) +{ + return addr; +} + +/** + * hl_mmu_descramble_addr() - The generic mmu address descrambling + * routine. + * @hdev: pointer to device data. + * @addr: The address to descramble. + * + * Return: The un-scrambled address. 
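In the common power-of-two case, the arithmetic at the top of hl_mmu_pa_page_with_offset() is pure bit masking: the shift of the last used hop selects how many low VA bits form the page offset, and everything above comes from the PTE. A compact example, assuming a 21-bit shift (2 MB pages):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t hop_shift = 21;			/* last hop covers 2 MB pages */
	uint64_t pte_val = 0x0000004560000000ULL;	/* physical page address from the PTE */
	uint64_t virt_addr = 0x00007f00001ABCDEULL;

	uint64_t offset_mask = (1ULL << hop_shift) - 1;	/* low 21 bits come from the VA */
	uint64_t addr_mask = ~offset_mask;		/* the rest comes from the PTE */
	uint64_t pa = (pte_val & addr_mask) | (virt_addr & offset_mask);

	/* prints pa=0x45601abcde: high bits from the PTE, low 21 bits from the VA */
	printf("pa=%#llx\n", (unsigned long long)pa);
	return 0;
}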
+ */ +u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr) +{ + return addr; +} + +int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags) +{ + int rc; + + rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, is_hard, flags); + if (rc) + dev_err_ratelimited(hdev->dev, "MMU cache invalidation failed\n"); + + return rc; +} + +int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard, + u32 flags, u32 asid, u64 va, u64 size) +{ + int rc; + + rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, is_hard, flags, + asid, va, size); + if (rc) + dev_err_ratelimited(hdev->dev, "MMU cache range invalidation failed\n"); + + return rc; +} + +static void hl_mmu_prefetch_work_function(struct work_struct *work) +{ + struct hl_prefetch_work *pfw = container_of(work, struct hl_prefetch_work, pf_work); + struct hl_ctx *ctx = pfw->ctx; + struct hl_device *hdev = ctx->hdev; + + if (!hl_device_operational(hdev, NULL)) + goto put_ctx; + + mutex_lock(&hdev->mmu_lock); + + hdev->asic_funcs->mmu_prefetch_cache_range(ctx, pfw->flags, pfw->asid, pfw->va, pfw->size); + + mutex_unlock(&hdev->mmu_lock); + +put_ctx: + /* + * context was taken in the common mmu prefetch function- see comment there about + * context handling. + */ + hl_ctx_put(ctx); + kfree(pfw); +} + +int hl_mmu_prefetch_cache_range(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size) +{ + struct hl_prefetch_work *handle_pf_work; + + handle_pf_work = kmalloc(sizeof(*handle_pf_work), GFP_KERNEL); + if (!handle_pf_work) + return -ENOMEM; + + INIT_WORK(&handle_pf_work->pf_work, hl_mmu_prefetch_work_function); + handle_pf_work->ctx = ctx; + handle_pf_work->va = va; + handle_pf_work->size = size; + handle_pf_work->flags = flags; + handle_pf_work->asid = asid; + + /* + * as actual prefetch is done in a WQ we must get the context (and put it + * at the end of the work function) + */ + hl_ctx_get(ctx); + queue_work(ctx->hdev->pf_wq, &handle_pf_work->pf_work); + + return 0; +} + +u64 hl_mmu_get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte) +{ + return (curr_pte & PAGE_PRESENT_MASK) ? (curr_pte & HOP_PHYS_ADDR_MASK) : ULLONG_MAX; +} + +/** + * hl_mmu_get_hop_pte_phys_addr() - extract PTE address from HOP + * @ctx: pointer to the context structure to initialize. + * @mmu_prop: MMU properties. + * @hop_idx: HOP index. + * @hop_addr: HOP address. + * @virt_addr: virtual address fro the translation. + * + * @return the matching PTE value on success, otherwise U64_MAX. + */ +u64 hl_mmu_get_hop_pte_phys_addr(struct hl_ctx *ctx, struct hl_mmu_properties *mmu_prop, + u8 hop_idx, u64 hop_addr, u64 virt_addr) +{ + u64 mask, shift; + + if (hop_idx >= mmu_prop->num_hops) { + dev_err_ratelimited(ctx->hdev->dev, "Invalid hop index %d\n", hop_idx); + return U64_MAX; + } + + shift = mmu_prop->hop_shifts[hop_idx]; + mask = mmu_prop->hop_masks[hop_idx]; + + return hop_addr + ctx->hdev->asic_prop.mmu_pte_size * ((virt_addr & mask) >> shift); +} + +static void mmu_dma_mem_free_from_chunk(struct gen_pool *pool, + struct gen_pool_chunk *chunk, + void *data) +{ + struct hl_device *hdev = (struct hl_device *)data; + + hl_asic_dma_free_coherent(hdev, (chunk->end_addr - chunk->start_addr) + 1, + (void *)chunk->start_addr, chunk->phys_addr); +} + +void hl_mmu_hr_flush(struct hl_ctx *ctx) +{ + /* a flush operation requires memory barrier */ + mb(); +} + +/** + * hl_mmu_hr_pool_destroy() - destroy genpool + * @hdev: habanalabs device structure. + * @hr_priv: MMU HR private data. + * @hop_table_size: HOP table size. 
+ * + * This function does the following: + * - free entries allocated for shadow HOP0 + * - free pool chunks + * - free pool + */ +static void hl_mmu_hr_pool_destroy(struct hl_device *hdev, struct hl_mmu_hr_priv *hr_priv, + u32 hop_table_size) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct gen_pool **pool = &hr_priv->mmu_pgt_pool; + struct pgt_info *hop0_pgt; + int asid; + + if (ZERO_OR_NULL_PTR(*pool)) + return; + + /* Free the Fixed allocation of HOPs0 */ + if (hr_priv->mmu_asid_hop0) { + for (asid = 0 ; asid < prop->max_asid ; asid++) { + hop0_pgt = &hr_priv->mmu_asid_hop0[asid]; + if (ZERO_OR_NULL_PTR(hop0_pgt->virt_addr)) + continue; + + gen_pool_free(*pool, (uintptr_t) hop0_pgt->virt_addr, hop_table_size); + } + } + + gen_pool_for_each_chunk(*pool, mmu_dma_mem_free_from_chunk, hdev); + gen_pool_destroy(*pool); + + /* Make sure that if we arrive here again without init was called we + * won't cause kernel panic. This can happen for example if we fail + * during hard reset code at certain points + */ + *pool = NULL; +} + +/** + * hl_mmu_hr_init() - initialize the MMU module. + * @hdev: habanalabs device structure. + * @hr_priv: MMU HR private data. + * @hop_table_size: HOP table size. + * @pgt_size: memory size allocated for the page table + * + * @return 0 on success otherwise non-zero error code + * + * This function does the following: + * - Create a pool of pages for pgt_infos. + * - Create a shadow table for pgt + */ +int hl_mmu_hr_init(struct hl_device *hdev, struct hl_mmu_hr_priv *hr_priv, u32 hop_table_size, + u64 pgt_size) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + size_t pool_chunk_size = SZ_4M; + struct pgt_info *hop0_pgt; + dma_addr_t dma_addr; + u64 virt_addr; + int i, rc; + + /* + * we set alloc size as PAGE_SIZE (sine dma_alloc_coherent allocation order/size is + * PAGE_SHIFT/PAGE_SIZE) in order to be able to control the allocations alignment. 
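+ * (For example, with 4 KiB kernel pages the gen_pool_create(PAGE_SHIFT, -1)
+ * call below hands out pool memory in 4 KiB granules, while
+ * gen_pool_dma_zalloc_align() can still request hop_table_size alignment on
+ * top of that; example values only.)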
+ * This way we can call "DMA alloc align" according to dma_alloc granularity and supply + * allocations with higher-order alignment restrictions + */ + hr_priv->mmu_pgt_pool = gen_pool_create(PAGE_SHIFT, -1); + if (ZERO_OR_NULL_PTR(hr_priv->mmu_pgt_pool)) { + dev_err(hdev->dev, "Failed to create hr page pool\n"); + return -ENOMEM; + } + + hr_priv->mmu_asid_hop0 = kvcalloc(prop->max_asid, sizeof(struct pgt_info), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hr_priv->mmu_asid_hop0)) { + dev_err(hdev->dev, "Failed to allocate hr-mmu hop0 table\n"); + rc = -ENOMEM; + goto destroy_mmu_pgt_pool; + } + + for (i = 0 ; i < pgt_size ; i += pool_chunk_size) { + virt_addr = (uintptr_t) hl_asic_dma_alloc_coherent(hdev, pool_chunk_size, + &dma_addr, + GFP_KERNEL | __GFP_ZERO); + if (ZERO_OR_NULL_PTR(virt_addr)) { + dev_err(hdev->dev, + "Failed to allocate memory for host-resident page pool\n"); + rc = -ENOMEM; + goto destroy_mmu_pgt_pool; + } + + rc = gen_pool_add_virt(hr_priv->mmu_pgt_pool, virt_addr, (phys_addr_t) dma_addr, + pool_chunk_size, -1); + if (rc) { + dev_err(hdev->dev, "Failed to fill host-resident page pool\n"); + goto destroy_mmu_pgt_pool; + } + } + + for (i = 0 ; i < prop->max_asid ; i++) { + hop0_pgt = &hr_priv->mmu_asid_hop0[i]; + hop0_pgt->virt_addr = (uintptr_t) + gen_pool_dma_zalloc_align(hr_priv->mmu_pgt_pool, + hop_table_size, + (dma_addr_t *) &hop0_pgt->phys_addr, + hop_table_size); + if (!hop0_pgt->virt_addr) { + dev_err(hdev->dev, "Failed to allocate HOP from pgt pool\n"); + rc = -ENOMEM; + goto destroy_mmu_pgt_pool; + } + } + + /* MMU H/W init will be done in device hw_init() */ + + return 0; + +destroy_mmu_pgt_pool: + hl_mmu_hr_pool_destroy(hdev, hr_priv, hop_table_size); + if (!ZERO_OR_NULL_PTR(hr_priv->mmu_asid_hop0)) + kvfree(hr_priv->mmu_asid_hop0); + + return rc; +} + +/** + * hl_mmu_hr_fini() - release the MMU module. + * @hdev: habanalabs device structure. + * @hr_priv: MMU host resident private info. + * @hop_table_size: HOP table size + * + * This function does the following: + * - Disable MMU in H/W. + * - Free the pgt_infos pool. + * + * All contexts should be freed before calling this function. + */ +void hl_mmu_hr_fini(struct hl_device *hdev, struct hl_mmu_hr_priv *hr_priv, u32 hop_table_size) +{ + /* MMU H/W fini was already done in device hw_fini() */ + + hl_mmu_hr_pool_destroy(hdev, hr_priv, hop_table_size); + + if (!ZERO_OR_NULL_PTR(hr_priv->mmu_asid_hop0)) { + kvfree(hr_priv->mmu_asid_hop0); + + /* Make sure that if we arrive here again without init was + * called we won't cause kernel panic. This can happen for + * example if we fail during hard reset code at certain points + */ + hr_priv->mmu_asid_hop0 = NULL; + } +} + +/** + * hl_mmu_hr_free_hop_remove_pgt() - free HOP and remove PGT from hash + * @pgt_info: page table info structure. + * @hr_priv: MMU HR private data. + * @hop_table_size: HOP table size. + */ +void hl_mmu_hr_free_hop_remove_pgt(struct pgt_info *pgt_info, struct hl_mmu_hr_priv *hr_priv, + u32 hop_table_size) +{ + gen_pool_free(hr_priv->mmu_pgt_pool, pgt_info->virt_addr, hop_table_size); + hash_del(&pgt_info->node); + kfree(pgt_info); +} + +/** + * hl_mmu_hr_pte_phys_to_virt() - translate PTE phys addr to virt addr + * @ctx: pointer to the context structure + * @pgt: pgt_info for the HOP hosting the PTE + * @phys_pte_addr: phys address of the PTE + * @hop_table_size: HOP table size + * + * @return PTE virtual address + * + * The function use the pgt_info to get HOP base virt addr and obtain the PTE's virt addr + * by adding the PTE offset. 
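+ *
+ * Worked example with assumed values: for a hop_table_size of SZ_4K and a
+ * phys_pte_addr of 0x20000528, the offset is
+ *   phys_pte_addr & (SZ_4K - 1) = 0x528,
+ * so the returned address is pgt->virt_addr + 0x528.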
+ */ +u64 hl_mmu_hr_pte_phys_to_virt(struct hl_ctx *ctx, struct pgt_info *pgt, + u64 phys_pte_addr, u32 hop_table_size) +{ + u64 page_mask = (hop_table_size - 1); + u64 pte_offset = phys_pte_addr & page_mask; + + return pgt->virt_addr + pte_offset; +} + +/** + * hl_mmu_hr_write_pte() - write HR PTE + * @ctx: pointer to the context structure + * @pgt_info: HOP's page table info structure + * @phys_pte_addr: phys PTE address + * @val: raw PTE data + * @hop_table_size: HOP table size + */ +void hl_mmu_hr_write_pte(struct hl_ctx *ctx, struct pgt_info *pgt_info, u64 phys_pte_addr, + u64 val, u32 hop_table_size) +{ + /* + * The value to write is the phys address of the next hop + + * flags at the 12 LSBs. + */ + u64 virt_addr = hl_mmu_hr_pte_phys_to_virt(ctx, pgt_info, phys_pte_addr, hop_table_size); + + *((u64 *) (uintptr_t) virt_addr) = val; +} + +/** + * hl_mmu_hr_clear_pte() - clear HR PTE + * @ctx: pointer to the context structure + * @pgt_info: HOP's page table info structure + * @phys_pte_addr: phys PTE address + * @hop_table_size: HOP table size + */ +void hl_mmu_hr_clear_pte(struct hl_ctx *ctx, struct pgt_info *pgt_info, u64 phys_pte_addr, + u32 hop_table_size) +{ + /* no need to transform the value to physical address */ + hl_mmu_hr_write_pte(ctx, pgt_info, phys_pte_addr, 0, hop_table_size); +} + +/** + * hl_mmu_hr_put_pte() - put HR PTE and remove it if necessary (no more PTEs) + * @ctx: pointer to the context structure + * @pgt_info: HOP's page table info structure + * @hr_priv: HR MMU private info + * @hop_table_size: HOP table size + * + * @return number of PTEs still in the HOP + */ +int hl_mmu_hr_put_pte(struct hl_ctx *ctx, struct pgt_info *pgt_info, + struct hl_mmu_hr_priv *hr_priv, + u32 hop_table_size) +{ + int num_of_ptes_left; + + pgt_info->num_of_ptes--; + + /* + * Need to save the number of ptes left because free_hop might free + * the pgt_info + */ + num_of_ptes_left = pgt_info->num_of_ptes; + if (!num_of_ptes_left) + hl_mmu_hr_free_hop_remove_pgt(pgt_info, hr_priv, hop_table_size); + + return num_of_ptes_left; +} + +/** + * hl_mmu_hr_get_pte() - increase PGT PTE count + * @ctx: pointer to the context structure + * @hr_func: host resident functions + * @phys_hop_addr: HOP phys address + */ +void hl_mmu_hr_get_pte(struct hl_ctx *ctx, struct hl_hr_mmu_funcs *hr_func, u64 phys_hop_addr) +{ + hr_func->get_pgt_info(ctx, phys_hop_addr)->num_of_ptes++; +} + +/** + * hl_mmu_hr_get_next_hop_pgt_info() - get pgt_info structure for the next HOP + * @ctx: pointer to the context structure. + * @hr_func: host resident functions. + * @curr_pte: current PTE value. + * + * @return pgt_info structure on success, otherwise NULL. + */ +struct pgt_info *hl_mmu_hr_get_next_hop_pgt_info(struct hl_ctx *ctx, + struct hl_hr_mmu_funcs *hr_func, + u64 curr_pte) +{ + u64 next_hop_phys_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte); + + if (next_hop_phys_addr == ULLONG_MAX) + return NULL; + + return hr_func->get_pgt_info(ctx, next_hop_phys_addr); +} + +/** + * hl_mmu_hr_alloc_hop() - allocate HOP + * @ctx: pointer to the context structure. + * @hr_priv: host resident private info structure. + * @hr_func: host resident functions. + * @mmu_prop: MMU properties. + * + * @return pgt_info structure associated with the allocated HOP on success, otherwise NULL. 
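+ *
+ * If the page-table pool is exhausted, the routine grows it by a 2 MB
+ * DMA-coherent chunk and retries the allocation once before failing.
+ * Illustrative call site inside a hop walk:
+ *
+ *   pgt_info = hl_mmu_hr_alloc_hop(ctx, hr_priv, hr_func, mmu_prop);
+ *   if (!pgt_info)
+ *       return -ENOMEM;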
+ */ +struct pgt_info *hl_mmu_hr_alloc_hop(struct hl_ctx *ctx, struct hl_mmu_hr_priv *hr_priv, + struct hl_hr_mmu_funcs *hr_func, + struct hl_mmu_properties *mmu_prop) +{ + struct hl_device *hdev = ctx->hdev; + struct pgt_info *pgt_info; + dma_addr_t phys_addr; + void *virt_addr; + int i, retry = 1; + + pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL); + if (!pgt_info) + return NULL; + + for (i = 0; i <= retry; i++) { + virt_addr = gen_pool_dma_zalloc_align(hr_priv->mmu_pgt_pool, + mmu_prop->hop_table_size, + &phys_addr, + mmu_prop->hop_table_size); + if (virt_addr) + break; + + /* No memory in pool - get some and try again */ + virt_addr = hl_asic_dma_alloc_coherent(hdev, SZ_2M, &phys_addr, + GFP_KERNEL | __GFP_ZERO); + if (ZERO_OR_NULL_PTR(virt_addr)) + break; + + if (gen_pool_add_virt(hr_priv->mmu_pgt_pool, (unsigned long)virt_addr, + phys_addr, SZ_2M, -1)) { + hl_asic_dma_free_coherent(hdev, SZ_2M, virt_addr, phys_addr); + virt_addr = NULL; + break; + } + } + + if (ZERO_OR_NULL_PTR(virt_addr)) { + dev_err(hdev->dev, "failed to allocate page\n"); + goto pool_alloc_err; + } + + pgt_info->phys_addr = phys_addr; + pgt_info->shadow_addr = (unsigned long) NULL; + pgt_info->virt_addr = (unsigned long)virt_addr; + pgt_info->ctx = ctx; + pgt_info->num_of_ptes = 0; + hr_func->add_pgt_info(ctx, pgt_info, phys_addr); + + return pgt_info; + +pool_alloc_err: + kfree(pgt_info); + + return NULL; +} + +/** + * hl_mmu_hr_get_alloc_next_hop() - get the next HOP, allocate it if it does not exist + * @ctx: pointer to the context structure. + * @hr_priv: host resident private info structure. + * @hr_func: host resident functions. + * @mmu_prop: MMU properties. + * @curr_pte: current PTE value. + * @is_new_hop: set to true if HOP is new (caller responsibility to set it to false). + * + * @return pgt_info structure associated with the allocated HOP on success, otherwise NULL. + */ +struct pgt_info *hl_mmu_hr_get_alloc_next_hop(struct hl_ctx *ctx, + struct hl_mmu_hr_priv *hr_priv, + struct hl_hr_mmu_funcs *hr_func, + struct hl_mmu_properties *mmu_prop, + u64 curr_pte, bool *is_new_hop) +{ + u64 hop_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte); + + if (hop_addr != ULLONG_MAX) + return hr_func->get_pgt_info(ctx, hop_addr); + + *is_new_hop = true; + return hl_mmu_hr_alloc_hop(ctx, hr_priv, hr_func, mmu_prop); +} + +/** + * hl_mmu_hr_get_tlb_info() - get the TLB info (info for a specific mapping) + * @ctx: pointer to the context structure. + * @virt_addr: the virt address for which to get info. + * @hops: HOPs info structure. + * @hr_func: host resident functions. + * + * @return 0 on success, otherwise non 0 error code.. 
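+ *
+ * Illustrative caller sketch (assumes the ASIC code filled hr_func):
+ *
+ *   struct hl_mmu_hop_info hops = {0};
+ *   u64 last_pte;
+ *   int rc;
+ *
+ *   rc = hl_mmu_hr_get_tlb_info(ctx, virt_addr, &hops, hr_func);
+ *   if (!rc)
+ *       last_pte = hops.hop_info[hops.used_hops - 1].hop_pte_val;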
+ */ +int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops, + struct hl_hr_mmu_funcs *hr_func) +{ + /* using 6 HOPs as this is the maximum number of HOPs */ + struct pgt_info *hops_pgt_info[MMU_ARCH_6_HOPS] = { NULL }; + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + int rc, i, used_hops; + bool is_huge; + + rc = hr_func->get_tlb_mapping_params(hdev, &mmu_prop, hops, virt_addr, &is_huge); + if (rc) + return rc; + + used_hops = mmu_prop->num_hops; + + /* huge pages use one less hop */ + if (is_huge) + used_hops--; + + hops->scrambled_vaddr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); + + for (i = 0 ; i < used_hops ; i++) { + if (i == 0) + hops_pgt_info[i] = hr_func->get_hop0_pgt_info(ctx); + else + hops_pgt_info[i] = hl_mmu_hr_get_next_hop_pgt_info(ctx, hr_func, + hops->hop_info[i - 1].hop_pte_val); + + if (!hops_pgt_info[i]) + return -EFAULT; + + hops->hop_info[i].hop_addr = hops_pgt_info[i]->phys_addr; + hops->hop_info[i].hop_pte_addr = + hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hops->hop_info[i].hop_addr, + hops->scrambled_vaddr); + hops->hop_info[i].hop_pte_val = *(u64 *) (uintptr_t) + hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i], + hops->hop_info[i].hop_pte_addr, + mmu_prop->hop_table_size); + + if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK)) + return -EFAULT; + + if (hops->hop_info[i].hop_pte_val & mmu_prop->last_mask) + break; + } + + /* if passed over all hops then no last hop was found */ + if (i == mmu_prop->num_hops) + return -EFAULT; + + if (hops->scrambled_vaddr != virt_addr) + hops->unscrambled_paddr = hdev->asic_funcs->descramble_addr + (hdev, hops->hop_info[i].hop_pte_val); + else + hops->unscrambled_paddr = hops->hop_info[i].hop_pte_val; + + hops->used_hops = i + 1; + + return 0; +} + diff --git a/drivers/misc/habanalabs/common/mmu/mmu_v1.c b/drivers/misc/habanalabs/common/mmu/mmu_v1.c new file mode 100644 index 000000000..8a40de4a4 --- /dev/null +++ b/drivers/misc/habanalabs/common/mmu/mmu_v1.c @@ -0,0 +1,815 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. 
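+ *
+ * MMU v1: page tables reside in device memory and every HOP has a host-side
+ * shadow copy, so PTE reads during walks are served from host memory while
+ * writes go to both copies (see write_pte()/write_final_pte() below).
+ * Selected for Goya/Gaudi in hl_mmu_if_set_funcs().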
+ */ + +#include "../habanalabs.h" +#include "../../include/hw_ip/mmu/mmu_general.h" + +#include <linux/slab.h> + +#define MMU_V1_MAX_HOPS (MMU_HOP4 + 1) + +static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr); + +static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = NULL; + + hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node, + (unsigned long) hop_addr) + if (hop_addr == pgt_info->shadow_addr) + break; + + return pgt_info; +} + +static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info) +{ + struct hl_device *hdev = ctx->hdev; + + gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr, + hdev->asic_prop.mmu_hop_table_size); + hash_del(&pgt_info->node); + kfree((u64 *) (uintptr_t) pgt_info->shadow_addr); + kfree(pgt_info); +} + +static void free_hop(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr); + + _free_hop(ctx, pgt_info); +} + +static u64 alloc_hop(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct pgt_info *pgt_info; + u64 phys_addr, shadow_addr; + + pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL); + if (!pgt_info) + return ULLONG_MAX; + + phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool, + prop->mmu_hop_table_size); + if (!phys_addr) { + dev_err(hdev->dev, "failed to allocate page\n"); + goto pool_add_err; + } + + shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size, + GFP_KERNEL); + if (!shadow_addr) + goto shadow_err; + + pgt_info->phys_addr = phys_addr; + pgt_info->shadow_addr = shadow_addr; + pgt_info->ctx = ctx; + pgt_info->num_of_ptes = 0; + hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr); + + return shadow_addr; + +shadow_err: + gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, phys_addr, + prop->mmu_hop_table_size); +pool_add_err: + kfree(pgt_info); + + return ULLONG_MAX; +} + +static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx) +{ + return ctx->hdev->asic_prop.mmu_pgt_addr + + (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); +} + +static inline u64 get_hop0_addr(struct hl_ctx *ctx) +{ + return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 + + (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); +} + +static void flush(struct hl_ctx *ctx) +{ + /* flush all writes from all cores to reach PCI */ + mb(); + ctx->hdev->asic_funcs->read_pte(ctx->hdev, get_phys_hop0_addr(ctx)); +} + +/* transform the value to physical address when writing to H/W */ +static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val) +{ + /* + * The value to write is actually the address of the next shadow hop + + * flags at the 12 LSBs. + * Hence in order to get the value to write to the physical PTE, we + * clear the 12 LSBs and translate the shadow hop to its associated + * physical hop, and add back the original 12 LSBs. 
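+ *
+ * Illustrative example with made-up addresses: if val holds the shadow
+ * address of the next hop with flag bits 0x3 in the 12 LSBs, and that shadow
+ * hop's physical counterpart is 0x80001000, then 0x80001003 is written to
+ * the H/W PTE while the shadow PTE keeps val, so later walks can read PTEs
+ * from host memory.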
+ */ + u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) | + (val & FLAGS_MASK); + + ctx->hdev->asic_funcs->write_pte(ctx->hdev, + get_phys_addr(ctx, shadow_pte_addr), + phys_val); + + *(u64 *) (uintptr_t) shadow_pte_addr = val; +} + +/* do not transform the value to physical address when writing to H/W */ +static inline void write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, + u64 val) +{ + ctx->hdev->asic_funcs->write_pte(ctx->hdev, + get_phys_addr(ctx, shadow_pte_addr), + val); + *(u64 *) (uintptr_t) shadow_pte_addr = val; +} + +/* clear the last and present bits */ +static inline void clear_pte(struct hl_ctx *ctx, u64 pte_addr) +{ + /* no need to transform the value to physical address */ + write_final_pte(ctx, pte_addr, 0); +} + +static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr) +{ + get_pgt_info(ctx, hop_addr)->num_of_ptes++; +} + +/* + * put_pte - decrement the num of ptes and free the hop if possible + * + * @ctx: pointer to the context structure + * @hop_addr: addr of the hop + * + * This function returns the number of ptes left on this hop. If the number is + * 0, it means the pte was freed. + */ +static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr); + int num_of_ptes_left; + + pgt_info->num_of_ptes--; + + /* + * Need to save the number of ptes left because free_hop might free + * the pgt_info + */ + num_of_ptes_left = pgt_info->num_of_ptes; + if (!num_of_ptes_left) + _free_hop(ctx, pgt_info); + + return num_of_ptes_left; +} + +static inline u64 get_hop_pte_addr(struct hl_ctx *ctx, struct hl_mmu_properties *mmu_prop, + u64 *hop_addr_arr, u64 virt_addr, enum mmu_hop_num hop_idx) +{ + u64 mask, shift; + + mask = mmu_prop->hop_masks[hop_idx]; + shift = mmu_prop->hop_shifts[hop_idx]; + return hop_addr_arr[hop_idx] + + ctx->hdev->asic_prop.mmu_pte_size * ((virt_addr & mask) >> shift); +} + +static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, + bool *is_new_hop) +{ + u64 hop_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte); + + if (hop_addr == ULLONG_MAX) { + hop_addr = alloc_hop(ctx); + *is_new_hop = (hop_addr != ULLONG_MAX); + } + + return hop_addr; +} + +/* translates shadow address inside hop to a physical address */ +static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr) +{ + u64 page_mask = (ctx->hdev->asic_prop.mmu_hop_table_size - 1); + u64 shadow_hop_addr = shadow_addr & ~page_mask; + u64 pte_offset = shadow_addr & page_mask; + u64 phys_hop_addr; + + if (shadow_hop_addr != get_hop0_addr(ctx)) + phys_hop_addr = get_pgt_info(ctx, shadow_hop_addr)->phys_addr; + else + phys_hop_addr = get_phys_hop0_addr(ctx); + + return phys_hop_addr + pte_offset; +} + +static int dram_default_mapping_init(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr, + hop2_pte_addr, hop3_pte_addr, pte_val; + int rc, i, j, hop3_allocated = 0; + + if ((!prop->dram_supports_virtual_memory) || + (!hdev->dram_default_page_mapping) || + (ctx->asid == HL_KERNEL_ASID_ID)) + return 0; + + num_of_hop3 = prop->dram_size_for_default_page_mapping; + do_div(num_of_hop3, prop->dram_page_size); + do_div(num_of_hop3, HOP_PTE_ENTRIES_512); + + /* add hop1 and hop2 */ + total_hops = num_of_hop3 + 2; + + ctx->dram_default_hops = kzalloc(HL_PTE_SIZE * total_hops, GFP_KERNEL); + if (!ctx->dram_default_hops) + return -ENOMEM; + + hop0_addr = get_hop0_addr(ctx); + + 
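+ /*
+ * Build the default DRAM mapping: one hop1, one hop2 and num_of_hop3 hop3
+ * tables are allocated, and every hop3 PTE is pointed at the default DRAM
+ * page (mmu_dram_default_page_addr).
+ */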
hop1_addr = alloc_hop(ctx); + if (hop1_addr == ULLONG_MAX) { + dev_err(hdev->dev, "failed to alloc hop 1\n"); + rc = -ENOMEM; + goto hop1_err; + } + + ctx->dram_default_hops[total_hops - 1] = hop1_addr; + + hop2_addr = alloc_hop(ctx); + if (hop2_addr == ULLONG_MAX) { + dev_err(hdev->dev, "failed to alloc hop 2\n"); + rc = -ENOMEM; + goto hop2_err; + } + + ctx->dram_default_hops[total_hops - 2] = hop2_addr; + + for (i = 0 ; i < num_of_hop3 ; i++) { + ctx->dram_default_hops[i] = alloc_hop(ctx); + if (ctx->dram_default_hops[i] == ULLONG_MAX) { + dev_err(hdev->dev, "failed to alloc hop 3, i: %d\n", i); + rc = -ENOMEM; + goto hop3_err; + } + hop3_allocated++; + } + + /* need only pte 0 in hops 0 and 1 */ + pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; + write_pte(ctx, hop0_addr, pte_val); + + pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; + write_pte(ctx, hop1_addr, pte_val); + get_pte(ctx, hop1_addr); + + hop2_pte_addr = hop2_addr; + for (i = 0 ; i < num_of_hop3 ; i++) { + pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) | + PAGE_PRESENT_MASK; + write_pte(ctx, hop2_pte_addr, pte_val); + get_pte(ctx, hop2_addr); + hop2_pte_addr += HL_PTE_SIZE; + } + + pte_val = (prop->mmu_dram_default_page_addr & HOP_PHYS_ADDR_MASK) | + LAST_MASK | PAGE_PRESENT_MASK; + + for (i = 0 ; i < num_of_hop3 ; i++) { + hop3_pte_addr = ctx->dram_default_hops[i]; + for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) { + write_final_pte(ctx, hop3_pte_addr, pte_val); + get_pte(ctx, ctx->dram_default_hops[i]); + hop3_pte_addr += HL_PTE_SIZE; + } + } + + flush(ctx); + + return 0; + +hop3_err: + for (i = 0 ; i < hop3_allocated ; i++) + free_hop(ctx, ctx->dram_default_hops[i]); + + free_hop(ctx, hop2_addr); +hop2_err: + free_hop(ctx, hop1_addr); +hop1_err: + kfree(ctx->dram_default_hops); + + return rc; +} + +static void dram_default_mapping_fini(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr, + hop2_pte_addr, hop3_pte_addr; + int i, j; + + if ((!prop->dram_supports_virtual_memory) || + (!hdev->dram_default_page_mapping) || + (ctx->asid == HL_KERNEL_ASID_ID)) + return; + + num_of_hop3 = prop->dram_size_for_default_page_mapping; + do_div(num_of_hop3, prop->dram_page_size); + do_div(num_of_hop3, HOP_PTE_ENTRIES_512); + + hop0_addr = get_hop0_addr(ctx); + /* add hop1 and hop2 */ + total_hops = num_of_hop3 + 2; + hop1_addr = ctx->dram_default_hops[total_hops - 1]; + hop2_addr = ctx->dram_default_hops[total_hops - 2]; + + for (i = 0 ; i < num_of_hop3 ; i++) { + hop3_pte_addr = ctx->dram_default_hops[i]; + for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) { + clear_pte(ctx, hop3_pte_addr); + put_pte(ctx, ctx->dram_default_hops[i]); + hop3_pte_addr += HL_PTE_SIZE; + } + } + + hop2_pte_addr = hop2_addr; + hop2_pte_addr = hop2_addr; + for (i = 0 ; i < num_of_hop3 ; i++) { + clear_pte(ctx, hop2_pte_addr); + put_pte(ctx, hop2_addr); + hop2_pte_addr += HL_PTE_SIZE; + } + + clear_pte(ctx, hop1_addr); + put_pte(ctx, hop1_addr); + clear_pte(ctx, hop0_addr); + + kfree(ctx->dram_default_hops); + + flush(ctx); +} + +/** + * hl_mmu_v1_init() - initialize the MMU module. + * @hdev: habanalabs device structure. + * + * This function does the following: + * - Create a pool of pages for pgt_infos. + * - Create a shadow table for pgt + * + * Return: 0 for success, non-zero for failure. 
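+ *
+ * Note: the device area holding the per-ASID HOP0 tables is excluded from
+ * the allocation pool (see the gen_pool_add() call below); only the rest of
+ * the page-table region is handed out dynamically, and HOP0 gets a host-side
+ * shadow allocated per ASID.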
+ */ +static int hl_mmu_v1_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + hdev->mmu_priv.dr.mmu_pgt_pool = + gen_pool_create(__ffs(prop->mmu_hop_table_size), -1); + + if (!hdev->mmu_priv.dr.mmu_pgt_pool) { + dev_err(hdev->dev, "Failed to create page gen pool\n"); + return -ENOMEM; + } + + rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr + + prop->mmu_hop0_tables_total_size, + prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size, + -1); + if (rc) { + dev_err(hdev->dev, "Failed to add memory to page gen pool\n"); + goto err_pool_add; + } + + hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid, prop->mmu_hop_table_size, + GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { + rc = -ENOMEM; + goto err_pool_add; + } + + /* MMU H/W init will be done in device hw_init() */ + + return 0; + +err_pool_add: + gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); + + return rc; +} + +/** + * hl_mmu_v1_fini() - release the MMU module. + * @hdev: habanalabs device structure. + * + * This function does the following: + * - Disable MMU in H/W. + * - Free the pgt_infos pool. + * + * All contexts should be freed before calling this function. + */ +static void hl_mmu_v1_fini(struct hl_device *hdev) +{ + /* MMU H/W fini was already done in device hw_fini() */ + + if (!ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { + kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0); + gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); + + /* Make sure that if we arrive here again without init was + * called we won't cause kernel panic. This can happen for + * example if we fail during hard reset code at certain points + */ + hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL; + } +} + +/** + * hl_mmu_v1_ctx_init() - initialize a context for using the MMU module. + * @ctx: pointer to the context structure to initialize. + * + * Initialize a mutex to protect the concurrent mapping flow, a hash to hold all + * page tables hops related to this context. + * Return: 0 on success, non-zero otherwise. + */ +static int hl_mmu_v1_ctx_init(struct hl_ctx *ctx) +{ + hash_init(ctx->mmu_shadow_hash); + return dram_default_mapping_init(ctx); +} + +/* + * hl_mmu_ctx_fini - disable a ctx from using the mmu module + * + * @ctx: pointer to the context structure + * + * This function does the following: + * - Free any pgts which were not freed yet + * - Free the mutex + * - Free DRAM default page mapping hops + */ +static void hl_mmu_v1_ctx_fini(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct pgt_info *pgt_info; + struct hlist_node *tmp; + int i; + + dram_default_mapping_fini(ctx); + + if (!hash_empty(ctx->mmu_shadow_hash)) + dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n", + ctx->asid); + + hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) { + dev_err_ratelimited(hdev->dev, + "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", + pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes); + _free_hop(ctx, pgt_info); + } +} + +static int hl_mmu_v1_unmap(struct hl_ctx *ctx, + u64 virt_addr, bool is_dram_addr) +{ + u64 hop_addr[MMU_V1_MAX_HOPS] = {0}, hop_pte_addr[MMU_V1_MAX_HOPS] = {0}, curr_pte = 0; + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_mmu_properties *mmu_prop; + bool is_huge, clear_hop3 = true; + int hop_idx; + + /* shifts and masks are the same in PMMU and HPMMU, use one of them */ + mmu_prop = is_dram_addr ? 
&prop->dmmu : &prop->pmmu; + + for (hop_idx = MMU_HOP0; hop_idx < MMU_HOP4; hop_idx++) { + if (hop_idx == MMU_HOP0) { + hop_addr[hop_idx] = get_hop0_addr(ctx); + } else { + hop_addr[hop_idx] = hl_mmu_get_next_hop_addr(ctx, curr_pte); + if (hop_addr[hop_idx] == ULLONG_MAX) + goto not_mapped; + } + + hop_pte_addr[hop_idx] = + get_hop_pte_addr(ctx, mmu_prop, hop_addr, virt_addr, hop_idx); + + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[hop_idx]; + } + + is_huge = curr_pte & mmu_prop->last_mask; + + if (is_dram_addr && !is_huge) { + dev_err(hdev->dev, "DRAM unmapping should use huge pages only\n"); + return -EFAULT; + } + + if (!is_huge) { + hop_idx = MMU_HOP4; + hop_addr[hop_idx] = hl_mmu_get_next_hop_addr(ctx, curr_pte); + if (hop_addr[hop_idx] == ULLONG_MAX) + goto not_mapped; + + hop_pte_addr[hop_idx] = + get_hop_pte_addr(ctx, mmu_prop, hop_addr, virt_addr, hop_idx); + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[hop_idx]; + clear_hop3 = false; + } + + if (hdev->dram_default_page_mapping && is_dram_addr) { + u64 default_pte = (prop->mmu_dram_default_page_addr & + HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask | + PAGE_PRESENT_MASK; + if (curr_pte == default_pte) { + dev_err(hdev->dev, + "DRAM: hop3 PTE points to zero page, can't unmap, va: 0x%llx\n", + virt_addr); + goto not_mapped; + } + + if (!(curr_pte & PAGE_PRESENT_MASK)) { + dev_err(hdev->dev, + "DRAM: hop3 PTE is cleared! can't unmap, va: 0x%llx\n", + virt_addr); + goto not_mapped; + } + + hop_idx = MMU_HOP3; + write_final_pte(ctx, hop_pte_addr[hop_idx], default_pte); + put_pte(ctx, hop_addr[hop_idx]); + } else { + if (!(curr_pte & PAGE_PRESENT_MASK)) + goto not_mapped; + + if (hop_addr[MMU_HOP4]) + clear_pte(ctx, hop_pte_addr[MMU_HOP4]); + else + clear_pte(ctx, hop_pte_addr[MMU_HOP3]); + + if (hop_addr[MMU_HOP4] && !put_pte(ctx, hop_addr[MMU_HOP4])) + clear_hop3 = true; + + if (!clear_hop3) + goto mapped; + + for (hop_idx = MMU_HOP3; hop_idx >= 0; hop_idx--) { + clear_pte(ctx, hop_pte_addr[hop_idx]); + + if (hop_idx == MMU_HOP0) + break; + + if (put_pte(ctx, hop_addr[hop_idx])) + goto mapped; + } + } + +mapped: + return 0; + +not_mapped: + dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n", + virt_addr); + + return -EINVAL; +} + +static int hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, + u32 page_size, bool is_dram_addr) +{ + u64 hop_addr[MMU_V1_MAX_HOPS] = {0}, hop_pte_addr[MMU_V1_MAX_HOPS] = {0}, curr_pte = 0; + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_mmu_properties *mmu_prop; + bool is_huge, hop_new[MMU_V1_MAX_HOPS] = {false}; + int num_hops, hop_idx, prev_hop, rc = -ENOMEM; + + /* + * This mapping function can map a page or a huge page. For huge page + * there are only 3 hops rather than 4. Currently the DRAM allocation + * uses huge pages only but user memory could have been allocated with + * one of the two page sizes. Since this is a common code for all the + * three cases, we need this hugs page check. + */ + if (is_dram_addr) { + mmu_prop = &prop->dmmu; + is_huge = true; + } else if (page_size == prop->pmmu_huge.page_size) { + mmu_prop = &prop->pmmu_huge; + is_huge = true; + } else { + mmu_prop = &prop->pmmu; + is_huge = false; + } + + num_hops = is_huge ? 
(MMU_V1_MAX_HOPS - 1) : MMU_V1_MAX_HOPS; + + for (hop_idx = MMU_HOP0; hop_idx < num_hops; hop_idx++) { + if (hop_idx == MMU_HOP0) { + hop_addr[hop_idx] = get_hop0_addr(ctx); + } else { + hop_addr[hop_idx] = + get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[hop_idx]); + if (hop_addr[hop_idx] == ULLONG_MAX) + goto err; + } + + hop_pte_addr[hop_idx] = + get_hop_pte_addr(ctx, mmu_prop, hop_addr, virt_addr, hop_idx); + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[hop_idx]; + } + + if (hdev->dram_default_page_mapping && is_dram_addr) { + u64 default_pte = (prop->mmu_dram_default_page_addr & + HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask | + PAGE_PRESENT_MASK; + + if (curr_pte != default_pte) { + dev_err(hdev->dev, + "DRAM: mapping already exists for virt_addr 0x%llx\n", + virt_addr); + rc = -EINVAL; + goto err; + } + + for (hop_idx = MMU_HOP1; hop_idx < num_hops; hop_idx++) { + if (hop_new[hop_idx]) { + dev_err(hdev->dev, "DRAM mapping should not allocate more hops\n"); + rc = -EFAULT; + goto err; + } + } + } else if (curr_pte & PAGE_PRESENT_MASK) { + dev_err(hdev->dev, + "mapping already exists for virt_addr 0x%llx\n", + virt_addr); + + for (hop_idx = MMU_HOP0; hop_idx < num_hops; hop_idx++) + dev_dbg(hdev->dev, "hop%d pte: 0x%llx (0x%llx)\n", hop_idx, + *(u64 *) (uintptr_t) hop_pte_addr[hop_idx], + hop_pte_addr[hop_idx]); + + rc = -EINVAL; + goto err; + } + + curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask + | PAGE_PRESENT_MASK; + + write_final_pte(ctx, hop_pte_addr[num_hops - 1], curr_pte); + + for (hop_idx = MMU_HOP1; hop_idx < num_hops; hop_idx++) { + prev_hop = hop_idx - 1; + + if (hop_new[hop_idx]) { + curr_pte = (hop_addr[hop_idx] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; + write_pte(ctx, hop_pte_addr[prev_hop], curr_pte); + if (hop_idx != MMU_HOP1) + get_pte(ctx, hop_addr[prev_hop]); + } + } + + get_pte(ctx, hop_addr[num_hops - 1]); + + return 0; + +err: + for (hop_idx = num_hops; hop_idx > MMU_HOP0; hop_idx--) { + if (hop_new[hop_idx]) + free_hop(ctx, hop_addr[hop_idx]); + } + + return rc; +} + +/* + * hl_mmu_v1_swap_out - marks all mapping of the given ctx as swapped out + * + * @ctx: pointer to the context structure + * + */ +static void hl_mmu_v1_swap_out(struct hl_ctx *ctx) +{ + +} + +/* + * hl_mmu_v1_swap_in - marks all mapping of the given ctx as swapped in + * + * @ctx: pointer to the context structure + * + */ +static void hl_mmu_v1_swap_in(struct hl_ctx *ctx) +{ + +} + +static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, + struct hl_mmu_hop_info *hops) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_mmu_properties *mmu_prop; + bool is_dram_addr, is_pmmu_addr, is_pmmu_h_addr, is_huge; + int i, used_hops; + + is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size, + prop->dmmu.start_addr, + prop->dmmu.end_addr); + is_pmmu_addr = hl_mem_area_inside_range(virt_addr, prop->pmmu.page_size, + prop->pmmu.start_addr, + prop->pmmu.end_addr); + is_pmmu_h_addr = hl_mem_area_inside_range(virt_addr, + prop->pmmu_huge.page_size, + prop->pmmu_huge.start_addr, + prop->pmmu_huge.end_addr); + if (is_dram_addr) { + mmu_prop = &prop->dmmu; + is_huge = true; + } else if (is_pmmu_addr) { + mmu_prop = &prop->pmmu; + is_huge = false; + } else if (is_pmmu_h_addr) { + mmu_prop = &prop->pmmu_huge; + is_huge = true; + } else { + return -EINVAL; + } + + used_hops = mmu_prop->num_hops; + + /* huge pages use lesser hops */ + if (is_huge) + used_hops--; + + hops->hop_info[0].hop_addr = 
get_phys_hop0_addr(ctx); + hops->hop_info[0].hop_pte_addr = + hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, + hops->hop_info[0].hop_addr, virt_addr); + hops->hop_info[0].hop_pte_val = + hdev->asic_funcs->read_pte(hdev, + hops->hop_info[0].hop_pte_addr); + + for (i = 1 ; i < used_hops ; i++) { + hops->hop_info[i].hop_addr = + hl_mmu_get_next_hop_addr(ctx, + hops->hop_info[i - 1].hop_pte_val); + if (hops->hop_info[i].hop_addr == ULLONG_MAX) + return -EFAULT; + + hops->hop_info[i].hop_pte_addr = + hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hops->hop_info[i].hop_addr, + virt_addr); + hops->hop_info[i].hop_pte_val = + hdev->asic_funcs->read_pte(hdev, + hops->hop_info[i].hop_pte_addr); + + if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK)) + return -EFAULT; + + if (hops->hop_info[i].hop_pte_val & mmu_prop->last_mask) + break; + } + + /* if passed over all hops then no last hop was found */ + if (i == mmu_prop->num_hops) + return -EFAULT; + + if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK)) + return -EFAULT; + + hops->used_hops = i + 1; + + return 0; +} + +/* + * hl_mmu_v1_prepare - prepare mmu for working with mmu v1 + * + * @hdev: pointer to the device structure + */ +void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu) +{ + mmu->init = hl_mmu_v1_init; + mmu->fini = hl_mmu_v1_fini; + mmu->ctx_init = hl_mmu_v1_ctx_init; + mmu->ctx_fini = hl_mmu_v1_ctx_fini; + mmu->map = hl_mmu_v1_map; + mmu->unmap = hl_mmu_v1_unmap; + mmu->flush = flush; + mmu->swap_out = hl_mmu_v1_swap_out; + mmu->swap_in = hl_mmu_v1_swap_in; + mmu->get_tlb_info = hl_mmu_v1_get_tlb_info; +} diff --git a/drivers/misc/habanalabs/common/mmu/mmu_v2_hr.c b/drivers/misc/habanalabs/common/mmu/mmu_v2_hr.c new file mode 100644 index 000000000..afe7ef964 --- /dev/null +++ b/drivers/misc/habanalabs/common/mmu/mmu_v2_hr.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020-2022 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "../habanalabs.h" +#include "../../include/hw_ip/mmu/mmu_general.h" + +#include <linux/slab.h> + +static struct pgt_info *hl_mmu_v2_hr_get_pgt_info(struct hl_ctx *ctx, u64 phys_hop_addr) +{ + struct pgt_info *pgt_info = NULL; + + hash_for_each_possible(ctx->hr_mmu_phys_hash, pgt_info, node, + (unsigned long) phys_hop_addr) + if (phys_hop_addr == pgt_info->phys_addr) + break; + + return pgt_info; +} + +static void hl_mmu_v2_hr_add_pgt_info(struct hl_ctx *ctx, struct pgt_info *pgt_info, + dma_addr_t phys_addr) +{ + hash_add(ctx->hr_mmu_phys_hash, &pgt_info->node, phys_addr); +} + +static struct pgt_info *hl_mmu_v2_hr_get_hop0_pgt_info(struct hl_ctx *ctx) +{ + return &ctx->hdev->mmu_priv.hr.mmu_asid_hop0[ctx->asid]; +} + +/** + * hl_mmu_v2_hr_init() - initialize the MMU module. + * @hdev: habanalabs device structure. + * + * This function does the following: + * - Create a pool of pages for pgt_infos. + * - Create a shadow table for pgt + * + * Return: 0 for success, non-zero for failure. + */ +static inline int hl_mmu_v2_hr_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + + return hl_mmu_hr_init(hdev, &hdev->mmu_priv.hr, prop->mmu_hop_table_size, + prop->mmu_pgt_size); +} + +/** + * hl_mmu_v2_hr_fini() - release the MMU module. + * @hdev: habanalabs device structure. + * + * This function does the following: + * - Disable MMU in H/W. + * - Free the pgt_infos pool. + * + * All contexts should be freed before calling this function. 
+ */ +static inline void hl_mmu_v2_hr_fini(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + + hl_mmu_hr_fini(hdev, &hdev->mmu_priv.hr, prop->mmu_hop_table_size); +} + +/** + * hl_mmu_v2_hr_ctx_init() - initialize a context for using the MMU module. + * @ctx: pointer to the context structure to initialize. + * + * Initialize a mutex to protect the concurrent mapping flow, a hash to hold all + * page tables hops related to this context. + * Return: 0 on success, non-zero otherwise. + */ +static int hl_mmu_v2_hr_ctx_init(struct hl_ctx *ctx) +{ + hash_init(ctx->hr_mmu_phys_hash); + return 0; +} + +/* + * hl_mmu_v2_hr_ctx_fini - disable a ctx from using the mmu module + * + * @ctx: pointer to the context structure + * + * This function does the following: + * - Free any pgts which were not freed yet + * - Free the mutex + * - Free DRAM default page mapping hops + */ +static void hl_mmu_v2_hr_ctx_fini(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct pgt_info *pgt_info; + struct hlist_node *tmp; + int i; + + if (!hash_empty(ctx->hr_mmu_phys_hash)) + dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n", + ctx->asid); + + hash_for_each_safe(ctx->hr_mmu_phys_hash, i, tmp, pgt_info, node) { + dev_err_ratelimited(hdev->dev, + "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", + pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes); + hl_mmu_hr_free_hop_remove_pgt(pgt_info, &ctx->hdev->mmu_priv.hr, + ctx->hdev->asic_prop.mmu_hop_table_size); + } +} + +static int _hl_mmu_v2_hr_unmap(struct hl_ctx *ctx, + u64 virt_addr, bool is_dram_addr) +{ + u64 curr_pte, scrambled_virt_addr, hop_pte_phys_addr[MMU_ARCH_6_HOPS] = { 0 }; + struct pgt_info *hops_pgt_info[MMU_ARCH_6_HOPS] = { NULL }; + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop; + struct hl_mmu_properties *mmu_prop; + bool is_huge = false; + int i, hop_last; + + prop = &hdev->asic_prop; + + /* shifts and masks are the same in PMMU and HMMU, use one of them */ + mmu_prop = is_dram_addr ? 
&prop->dmmu : &prop->pmmu; + hop_last = mmu_prop->num_hops - 1; + + scrambled_virt_addr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); + curr_pte = 0; + + for (i = 0 ; i < mmu_prop->num_hops ; i++) { + /* we get HOP0 differently, it doesn't need curr_pte */ + if (i == 0) + hops_pgt_info[i] = hl_mmu_v2_hr_get_hop0_pgt_info(ctx); + else + hops_pgt_info[i] = hl_mmu_hr_get_next_hop_pgt_info(ctx, + &ctx->hdev->mmu_func[MMU_HR_PGT].hr_funcs, curr_pte); + if (!hops_pgt_info[i]) + goto not_mapped; + + hop_pte_phys_addr[i] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hops_pgt_info[i]->phys_addr, + scrambled_virt_addr); + if (hop_pte_phys_addr[i] == U64_MAX) + return -EFAULT; + + curr_pte = *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i], + hop_pte_phys_addr[i], + ctx->hdev->asic_prop.mmu_hop_table_size); + + if ((i < hop_last) && (curr_pte & mmu_prop->last_mask)) { + hop_last = i; + is_huge = true; + break; + } + } + + if (is_dram_addr && !is_huge) { + dev_err(hdev->dev, "DRAM unmapping should use huge pages only\n"); + return -EFAULT; + } + + if (!(curr_pte & PAGE_PRESENT_MASK)) + goto not_mapped; + + for (i = hop_last ; i > 0 ; i--) { + hl_mmu_hr_clear_pte(ctx, hops_pgt_info[i], hop_pte_phys_addr[i], + ctx->hdev->asic_prop.mmu_hop_table_size); + + if (hl_mmu_hr_put_pte(ctx, hops_pgt_info[i], &ctx->hdev->mmu_priv.hr, + ctx->hdev->asic_prop.mmu_hop_table_size)) + goto mapped; + } + hl_mmu_hr_clear_pte(ctx, hops_pgt_info[0], hop_pte_phys_addr[0], + ctx->hdev->asic_prop.mmu_hop_table_size); + +mapped: + return 0; + +not_mapped: + dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n", virt_addr); + + return -EINVAL; +} + +static int hl_mmu_v2_get_last_hop(struct hl_mmu_properties *mmu_prop, u32 page_size) +{ + int hop; + + for (hop = (mmu_prop->num_hops - 1); hop; hop--) { + if (mmu_prop->hop_shifts[hop] == 0) + continue; + + if (page_size <= (1 << mmu_prop->hop_shifts[hop])) + break; + } + + return hop; +} + +static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx, + u64 virt_addr, u64 phys_addr, + u32 page_size, bool is_dram_addr) +{ + u64 hop_pte_phys_addr[MMU_ARCH_6_HOPS] = { 0 }, + curr_pte = 0, scrambled_virt_addr, scrambled_phys_addr; + struct pgt_info *hops_pgt_info[MMU_ARCH_6_HOPS] = { NULL }; + bool hop_new[MMU_ARCH_6_HOPS] = { false }; + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_mmu_properties *mmu_prop; + int i, hop_last, rc = -ENOMEM; + + /* + * This mapping function can map a page or a huge page. For huge page + * there are only 4 hops rather than 5. Currently the DRAM allocation + * uses huge pages only but user memory could have been allocated with + * one of the two page sizes. Since this is a common code for all the + * three cases, we need this hugs page check. 
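+ * The terminal hop is chosen by hl_mmu_v2_get_last_hop() as the deepest hop
+ * for which page_size <= (1 << hop_shifts[hop]). Illustrative example with
+ * assumed shift values: with a 12-bit shift on the deepest hop and a 21-bit
+ * shift one level above it, a 4 KiB page walks the full chain while a 2 MiB
+ * page terminates one hop earlier.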
+ */ + if (is_dram_addr) + mmu_prop = &prop->dmmu; + else if (page_size == prop->pmmu_huge.page_size) + mmu_prop = &prop->pmmu_huge; + else + mmu_prop = &prop->pmmu; + + hop_last = hl_mmu_v2_get_last_hop(mmu_prop, page_size); + if (hop_last <= 0) { + dev_err(ctx->hdev->dev, "Invalid last HOP %d\n", hop_last); + return -EFAULT; + } + + scrambled_virt_addr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); + scrambled_phys_addr = hdev->asic_funcs->scramble_addr(hdev, phys_addr); + + for (i = 0 ; i <= hop_last ; i++) { + + if (i == 0) + hops_pgt_info[i] = hl_mmu_v2_hr_get_hop0_pgt_info(ctx); + else + hops_pgt_info[i] = hl_mmu_hr_get_alloc_next_hop(ctx, + &ctx->hdev->mmu_priv.hr, + &ctx->hdev->mmu_func[MMU_HR_PGT].hr_funcs, + mmu_prop, curr_pte, &hop_new[i]); + if (!hops_pgt_info[i]) + goto err; + + hop_pte_phys_addr[i] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hops_pgt_info[i]->phys_addr, + scrambled_virt_addr); + curr_pte = *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i], + hop_pte_phys_addr[i], + ctx->hdev->asic_prop.mmu_hop_table_size); + } + + if (curr_pte & PAGE_PRESENT_MASK) { + dev_err(hdev->dev, "mapping already exists for virt_addr 0x%llx\n", + scrambled_virt_addr); + + for (i = 0 ; i <= hop_last ; i++) + dev_dbg(hdev->dev, "hop%d pte: 0x%llx (0x%llx)\n", + i, + *(u64 *) (uintptr_t) + hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i], + hop_pte_phys_addr[i], + ctx->hdev->asic_prop.mmu_hop_table_size), + hop_pte_phys_addr[i]); + rc = -EINVAL; + goto err; + } + + curr_pte = (scrambled_phys_addr & HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask + | PAGE_PRESENT_MASK; + + /* Write the PTEs */ + hl_mmu_hr_write_pte(ctx, hops_pgt_info[hop_last], hop_pte_phys_addr[hop_last], curr_pte, + ctx->hdev->asic_prop.mmu_hop_table_size); + + /* for each new hop, add its address to the table of previous-hop */ + for (i = 1 ; i <= hop_last ; i++) { + if (hop_new[i]) { + curr_pte = (hops_pgt_info[i]->phys_addr & HOP_PHYS_ADDR_MASK) | + PAGE_PRESENT_MASK; + hl_mmu_hr_write_pte(ctx, hops_pgt_info[i - 1], hop_pte_phys_addr[i - 1], + curr_pte, ctx->hdev->asic_prop.mmu_hop_table_size); + if (i - 1) + hl_mmu_hr_get_pte(ctx, &ctx->hdev->mmu_func[MMU_HR_PGT].hr_funcs, + hops_pgt_info[i - 1]->phys_addr); + } + } + + hl_mmu_hr_get_pte(ctx, &ctx->hdev->mmu_func[MMU_HR_PGT].hr_funcs, + hops_pgt_info[hop_last]->phys_addr); + + return 0; + +err: + for (i = 1 ; i <= hop_last ; i++) + if (hop_new[i] && hops_pgt_info[i]) + hl_mmu_hr_free_hop_remove_pgt(hops_pgt_info[i], &ctx->hdev->mmu_priv.hr, + ctx->hdev->asic_prop.mmu_hop_table_size); + + return rc; +} + +/* + * hl_mmu_v2_swap_out - marks all mapping of the given ctx as swapped out + * + * @ctx: pointer to the context structure + * + */ +static void hl_mmu_v2_hr_swap_out(struct hl_ctx *ctx) +{ + +} + +/* + * hl_mmu_v2_swap_in - marks all mapping of the given ctx as swapped in + * + * @ctx: pointer to the context structure + * + */ +static void hl_mmu_v2_hr_swap_in(struct hl_ctx *ctx) +{ + +} + +static int hl_mmu_v2_hr_get_tlb_mapping_params(struct hl_device *hdev, + struct hl_mmu_properties **mmu_prop, + struct hl_mmu_hop_info *hops, + u64 virt_addr, bool *is_huge) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + bool is_dram_addr, is_pmmu_addr, is_pmmu_h_addr; + + is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size, + prop->dmmu.start_addr, + prop->dmmu.end_addr); + is_pmmu_addr = hl_mem_area_inside_range(virt_addr, prop->pmmu.page_size, + prop->pmmu.start_addr, + prop->pmmu.end_addr); + is_pmmu_h_addr = 
hl_mem_area_inside_range(virt_addr, + prop->pmmu_huge.page_size, + prop->pmmu_huge.start_addr, + prop->pmmu_huge.end_addr); + if (is_dram_addr) { + *mmu_prop = &prop->dmmu; + *is_huge = true; + hops->range_type = HL_VA_RANGE_TYPE_DRAM; + } else if (is_pmmu_addr) { + *mmu_prop = &prop->pmmu; + *is_huge = false; + hops->range_type = HL_VA_RANGE_TYPE_HOST; + } else if (is_pmmu_h_addr) { + *mmu_prop = &prop->pmmu_huge; + *is_huge = true; + hops->range_type = HL_VA_RANGE_TYPE_HOST_HUGE; + } else { + return -EINVAL; + } + + return 0; +} + +static int hl_mmu_v2_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, + struct hl_mmu_hop_info *hops) +{ + return hl_mmu_hr_get_tlb_info(ctx, virt_addr, hops, + &ctx->hdev->mmu_func[MMU_HR_PGT].hr_funcs); +} + +/* + * hl_mmu_v2_prepare - prepare mmu_if for working with mmu v2 + * + * @hdev: pointer to the device structure + * @mmu_if: pointer to the mmu interface structure + */ +void hl_mmu_v2_hr_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu) +{ + mmu->init = hl_mmu_v2_hr_init; + mmu->fini = hl_mmu_v2_hr_fini; + mmu->ctx_init = hl_mmu_v2_hr_ctx_init; + mmu->ctx_fini = hl_mmu_v2_hr_ctx_fini; + mmu->map = _hl_mmu_v2_hr_map; + mmu->unmap = _hl_mmu_v2_hr_unmap; + mmu->flush = hl_mmu_hr_flush; + mmu->swap_out = hl_mmu_v2_hr_swap_out; + mmu->swap_in = hl_mmu_v2_hr_swap_in; + mmu->get_tlb_info = hl_mmu_v2_hr_get_tlb_info; + mmu->hr_funcs.get_hop0_pgt_info = hl_mmu_v2_hr_get_hop0_pgt_info; + mmu->hr_funcs.get_pgt_info = hl_mmu_v2_hr_get_pgt_info; + mmu->hr_funcs.add_pgt_info = hl_mmu_v2_hr_add_pgt_info; + mmu->hr_funcs.get_tlb_mapping_params = hl_mmu_v2_hr_get_tlb_mapping_params; +} diff --git a/drivers/misc/habanalabs/common/pci/Makefile b/drivers/misc/habanalabs/common/pci/Makefile new file mode 100644 index 000000000..dc922a686 --- /dev/null +++ b/drivers/misc/habanalabs/common/pci/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +HL_COMMON_PCI_FILES := common/pci/pci.o diff --git a/drivers/misc/habanalabs/common/pci/pci.c b/drivers/misc/habanalabs/common/pci/pci.c new file mode 100644 index 000000000..5fe3da5fb --- /dev/null +++ b/drivers/misc/habanalabs/common/pci/pci.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2019 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "../habanalabs.h" +#include "../../include/hw_ip/pci/pci_general.h" + +#include <linux/pci.h> + +#define HL_PLDM_PCI_ELBI_TIMEOUT_MSEC (HL_PCI_ELBI_TIMEOUT_MSEC * 100) + +#define IATU_REGION_CTRL_REGION_EN_MASK BIT(31) +#define IATU_REGION_CTRL_MATCH_MODE_MASK BIT(30) +#define IATU_REGION_CTRL_NUM_MATCH_EN_MASK BIT(19) +#define IATU_REGION_CTRL_BAR_NUM_MASK GENMASK(10, 8) + +/** + * hl_pci_bars_map() - Map PCI BARs. + * @hdev: Pointer to hl_device structure. + * @name: Array of BAR names. + * @is_wc: Array with flag per BAR whether a write-combined mapping is needed. + * + * Request PCI regions and map them to kernel virtual addresses. + * + * Return: 0 on success, non-zero for failure. + */ +int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3], + bool is_wc[3]) +{ + struct pci_dev *pdev = hdev->pdev; + int rc, i, bar; + + rc = pci_request_regions(pdev, HL_NAME); + if (rc) { + dev_err(hdev->dev, "Cannot obtain PCI resources\n"); + return rc; + } + + for (i = 0 ; i < 3 ; i++) { + bar = i * 2; /* 64-bit BARs */ + hdev->pcie_bar[bar] = is_wc[i] ? + pci_ioremap_wc_bar(pdev, bar) : + pci_ioremap_bar(pdev, bar); + if (!hdev->pcie_bar[bar]) { + dev_err(hdev->dev, "pci_ioremap%s_bar failed for %s\n", + is_wc[i] ? 
"_wc" : "", name[i]); + rc = -ENODEV; + goto err; + } + } + + return 0; + +err: + for (i = 2 ; i >= 0 ; i--) { + bar = i * 2; /* 64-bit BARs */ + if (hdev->pcie_bar[bar]) + iounmap(hdev->pcie_bar[bar]); + } + + pci_release_regions(pdev); + + return rc; +} + +/** + * hl_pci_bars_unmap() - Unmap PCI BARS. + * @hdev: Pointer to hl_device structure. + * + * Release all PCI BARs and unmap their virtual addresses. + */ +static void hl_pci_bars_unmap(struct hl_device *hdev) +{ + struct pci_dev *pdev = hdev->pdev; + int i, bar; + + for (i = 2 ; i >= 0 ; i--) { + bar = i * 2; /* 64-bit BARs */ + iounmap(hdev->pcie_bar[bar]); + } + + pci_release_regions(pdev); +} + +int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data) +{ + struct pci_dev *pdev = hdev->pdev; + ktime_t timeout; + u64 msec; + u32 val; + + if (hdev->pldm) + msec = HL_PLDM_PCI_ELBI_TIMEOUT_MSEC; + else + msec = HL_PCI_ELBI_TIMEOUT_MSEC; + + /* Clear previous status */ + pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, 0); + + pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_ADDR, (u32) addr); + pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_CTRL, 0); + + timeout = ktime_add_ms(ktime_get(), msec); + for (;;) { + pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, &val); + if (val & PCI_CONFIG_ELBI_STS_MASK) + break; + if (ktime_compare(ktime_get(), timeout) > 0) { + pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, + &val); + break; + } + + usleep_range(300, 500); + } + + if ((val & PCI_CONFIG_ELBI_STS_MASK) == PCI_CONFIG_ELBI_STS_DONE) { + pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_DATA, data); + + return 0; + } + + if (val & PCI_CONFIG_ELBI_STS_ERR) { + dev_err(hdev->dev, "Error reading from ELBI\n"); + return -EIO; + } + + if (!(val & PCI_CONFIG_ELBI_STS_MASK)) { + dev_err(hdev->dev, "ELBI read didn't finish in time\n"); + return -EIO; + } + + dev_err(hdev->dev, "ELBI read has undefined bits in status\n"); + return -EIO; +} + +/** + * hl_pci_elbi_write() - Write through the ELBI interface. + * @hdev: Pointer to hl_device structure. + * @addr: Address to write to + * @data: Data to write + * + * Return: 0 on success, negative value for failure. + */ +static int hl_pci_elbi_write(struct hl_device *hdev, u64 addr, u32 data) +{ + struct pci_dev *pdev = hdev->pdev; + ktime_t timeout; + u64 msec; + u32 val; + + if (hdev->pldm) + msec = HL_PLDM_PCI_ELBI_TIMEOUT_MSEC; + else + msec = HL_PCI_ELBI_TIMEOUT_MSEC; + + /* Clear previous status */ + pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, 0); + + pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_ADDR, (u32) addr); + pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_DATA, data); + pci_write_config_dword(pdev, mmPCI_CONFIG_ELBI_CTRL, + PCI_CONFIG_ELBI_CTRL_WRITE); + + timeout = ktime_add_ms(ktime_get(), msec); + for (;;) { + pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, &val); + if (val & PCI_CONFIG_ELBI_STS_MASK) + break; + if (ktime_compare(ktime_get(), timeout) > 0) { + pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_STS, + &val); + break; + } + + usleep_range(300, 500); + } + + if ((val & PCI_CONFIG_ELBI_STS_MASK) == PCI_CONFIG_ELBI_STS_DONE) + return 0; + + if (val & PCI_CONFIG_ELBI_STS_ERR) + return -EIO; + + if (!(val & PCI_CONFIG_ELBI_STS_MASK)) { + dev_err(hdev->dev, "ELBI write didn't finish in time\n"); + return -EIO; + } + + dev_err(hdev->dev, "ELBI write has undefined bits in status\n"); + return -EIO; +} + +/** + * hl_pci_iatu_write() - iatu write routine. + * @hdev: Pointer to hl_device structure. 
+ * @addr: Address to write to + * @data: Data to write + * + * Return: 0 on success, negative value for failure. + */ +int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 dbi_offset; + int rc; + + dbi_offset = addr & 0xFFF; + + /* Ignore result of writing to pcie_aux_dbi_reg_addr as it could fail + * in case the firmware security is enabled + */ + hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0x00300000); + + rc = hl_pci_elbi_write(hdev, prop->pcie_dbi_base_address + dbi_offset, + data); + + if (rc) + return -EIO; + + return 0; +} + +/** + * hl_pci_set_inbound_region() - Configure inbound region + * @hdev: Pointer to hl_device structure. + * @region: Inbound region number. + * @pci_region: Inbound region parameters. + * + * Configure the iATU inbound region. + * + * Return: 0 on success, negative value for failure. + */ +int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region, + struct hl_inbound_pci_region *pci_region) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 bar_phys_base, region_base, region_end_address; + u32 offset, ctrl_reg_val; + int rc = 0; + + /* region offset */ + offset = (0x200 * region) + 0x100; + + if (pci_region->mode == PCI_ADDRESS_MATCH_MODE) { + bar_phys_base = hdev->pcie_bar_phys[pci_region->bar]; + region_base = bar_phys_base + pci_region->offset_in_bar; + region_end_address = region_base + pci_region->size - 1; + + rc |= hl_pci_iatu_write(hdev, offset + 0x8, + lower_32_bits(region_base)); + rc |= hl_pci_iatu_write(hdev, offset + 0xC, + upper_32_bits(region_base)); + rc |= hl_pci_iatu_write(hdev, offset + 0x10, + lower_32_bits(region_end_address)); + } + + /* Point to the specified address */ + rc |= hl_pci_iatu_write(hdev, offset + 0x14, lower_32_bits(pci_region->addr)); + rc |= hl_pci_iatu_write(hdev, offset + 0x18, upper_32_bits(pci_region->addr)); + + /* Set bar type as memory */ + rc |= hl_pci_iatu_write(hdev, offset + 0x0, 0); + + /* Enable + bar/address match + match enable + bar number */ + ctrl_reg_val = FIELD_PREP(IATU_REGION_CTRL_REGION_EN_MASK, 1); + ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_MATCH_MODE_MASK, pci_region->mode); + ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_NUM_MATCH_EN_MASK, 1); + + if (pci_region->mode == PCI_BAR_MATCH_MODE) + ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_BAR_NUM_MASK, pci_region->bar); + + rc |= hl_pci_iatu_write(hdev, offset + 0x4, ctrl_reg_val); + + /* Return the DBI window to the default location + * Ignore result of writing to pcie_aux_dbi_reg_addr as it could fail + * in case the firmware security is enabled + */ + hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); + + if (rc) + dev_err(hdev->dev, "failed to map bar %u to 0x%08llx\n", + pci_region->bar, pci_region->addr); + + return rc; +} + +/** + * hl_pci_set_outbound_region() - Configure outbound region 0 + * @hdev: Pointer to hl_device structure. + * @pci_region: Outbound region parameters. + * + * Configure the iATU outbound region 0. + * + * Return: 0 on success, negative value for failure. 
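+ *
+ * Illustrative caller sketch (the addr and size values are placeholders):
+ *
+ *   struct hl_outbound_pci_region pci_region = {
+ *       .addr = host_phys_base,
+ *       .size = host_phys_size,
+ *   };
+ *
+ *   rc = hl_pci_set_outbound_region(hdev, &pci_region);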
+ */ +int hl_pci_set_outbound_region(struct hl_device *hdev, + struct hl_outbound_pci_region *pci_region) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + u64 outbound_region_end_address; + int rc = 0; + + /* Outbound Region 0 */ + outbound_region_end_address = + pci_region->addr + pci_region->size - 1; + rc |= hl_pci_iatu_write(hdev, 0x008, + lower_32_bits(pci_region->addr)); + rc |= hl_pci_iatu_write(hdev, 0x00C, + upper_32_bits(pci_region->addr)); + rc |= hl_pci_iatu_write(hdev, 0x010, + lower_32_bits(outbound_region_end_address)); + rc |= hl_pci_iatu_write(hdev, 0x014, 0); + + rc |= hl_pci_iatu_write(hdev, 0x018, 0); + + rc |= hl_pci_iatu_write(hdev, 0x020, + upper_32_bits(outbound_region_end_address)); + /* Increase region size */ + rc |= hl_pci_iatu_write(hdev, 0x000, 0x00002000); + /* Enable */ + rc |= hl_pci_iatu_write(hdev, 0x004, 0x80000000); + + /* Return the DBI window to the default location + * Ignore result of writing to pcie_aux_dbi_reg_addr as it could fail + * in case the firmware security is enabled + */ + hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); + + return rc; +} + +/** + * hl_get_pci_memory_region() - get PCI region for given address + * @hdev: Pointer to hl_device structure. + * @addr: device address + * + * @return region index on success, otherwise PCI_REGION_NUMBER (invalid + * region index) + */ +enum pci_region hl_get_pci_memory_region(struct hl_device *hdev, u64 addr) +{ + int i; + + for (i = 0 ; i < PCI_REGION_NUMBER ; i++) { + struct pci_mem_region *region = &hdev->pci_mem_region[i]; + + if (!region->used) + continue; + + if ((addr >= region->region_base) && + (addr < region->region_base + region->region_size)) + return i; + } + + return PCI_REGION_NUMBER; +} + +/** + * hl_pci_init() - PCI initialization code. + * @hdev: Pointer to hl_device structure. + * + * Set DMA masks, initialize the PCI controller and map the PCI BARs. + * + * Return: 0 on success, non-zero for failure. + */ +int hl_pci_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct pci_dev *pdev = hdev->pdev; + int rc; + + rc = pci_enable_device_mem(pdev); + if (rc) { + dev_err(hdev->dev, "can't enable PCI device\n"); + return rc; + } + + pci_set_master(pdev); + + rc = hdev->asic_funcs->pci_bars_map(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to map PCI BAR addresses\n"); + goto disable_device; + } + + rc = hdev->asic_funcs->init_iatu(hdev); + if (rc) { + dev_err(hdev->dev, "PCI controller was not initialized successfully\n"); + goto unmap_pci_bars; + } + + /* Driver must sleep in order for FW to finish the iATU configuration */ + if (hdev->asic_prop.iatu_done_by_fw) + usleep_range(2000, 3000); + + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(prop->dma_mask)); + if (rc) { + dev_err(hdev->dev, + "Failed to set dma mask to %d bits, error %d\n", + prop->dma_mask, rc); + goto unmap_pci_bars; + } + + dma_set_max_seg_size(&pdev->dev, U32_MAX); + + return 0; + +unmap_pci_bars: + hl_pci_bars_unmap(hdev); +disable_device: + pci_clear_master(pdev); + pci_disable_device(pdev); + + return rc; +} + +/** + * hl_pci_fini() - PCI finalization code. + * @hdev: Pointer to hl_device structure + * + * Unmap PCI bars and disable PCI device. 
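+ * This is the counterpart of hl_pci_init().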
+ */ +void hl_pci_fini(struct hl_device *hdev) +{ + hl_pci_bars_unmap(hdev); + + pci_clear_master(hdev->pdev); + pci_disable_device(hdev->pdev); +} diff --git a/drivers/misc/habanalabs/common/security.c b/drivers/misc/habanalabs/common/security.c new file mode 100644 index 000000000..6196c0487 --- /dev/null +++ b/drivers/misc/habanalabs/common/security.c @@ -0,0 +1,600 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2020 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +/** + * hl_get_pb_block - return the relevant block within the block array + * + * @hdev: pointer to hl_device structure + * @mm_reg_addr: register address in the desired block + * @pb_blocks: blocks array + * @array_size: blocks array size + * + */ +static int hl_get_pb_block(struct hl_device *hdev, u32 mm_reg_addr, + const u32 pb_blocks[], int array_size) +{ + int i; + u32 start_addr, end_addr; + + for (i = 0 ; i < array_size ; i++) { + start_addr = pb_blocks[i]; + end_addr = start_addr + HL_BLOCK_SIZE; + + if ((mm_reg_addr >= start_addr) && (mm_reg_addr < end_addr)) + return i; + } + + dev_err(hdev->dev, "No protection domain was found for 0x%x\n", + mm_reg_addr); + return -EDOM; +} + +/** + * hl_unset_pb_in_block - clear a specific protection bit in a block + * + * @hdev: pointer to hl_device structure + * @reg_offset: register offset will be converted to bit offset in pb block + * @sgs_entry: pb array + * + */ +static int hl_unset_pb_in_block(struct hl_device *hdev, u32 reg_offset, + struct hl_block_glbl_sec *sgs_entry) +{ + if ((reg_offset >= HL_BLOCK_SIZE) || (reg_offset & 0x3)) { + dev_err(hdev->dev, + "Register offset(%d) is out of range(%d) or invalid\n", + reg_offset, HL_BLOCK_SIZE); + return -EINVAL; + } + + UNSET_GLBL_SEC_BIT(sgs_entry->sec_array, + (reg_offset & (HL_BLOCK_SIZE - 1)) >> 2); + + return 0; +} + +/** + * hl_unsecure_register - locate the relevant block for this register and + * remove corresponding protection bit + * + * @hdev: pointer to hl_device structure + * @mm_reg_addr: register address to unsecure + * @offset: additional offset to the register address + * @pb_blocks: blocks array + * @sgs_array: pb array + * @array_size: blocks array size + * + */ +int hl_unsecure_register(struct hl_device *hdev, u32 mm_reg_addr, int offset, + const u32 pb_blocks[], struct hl_block_glbl_sec sgs_array[], + int array_size) +{ + u32 reg_offset; + int block_num; + + block_num = hl_get_pb_block(hdev, mm_reg_addr + offset, pb_blocks, + array_size); + if (block_num < 0) + return block_num; + + reg_offset = (mm_reg_addr + offset) - pb_blocks[block_num]; + + return hl_unset_pb_in_block(hdev, reg_offset, &sgs_array[block_num]); +} + +/** + * hl_unsecure_register_range - locate the relevant block for this register + * range and remove corresponding protection bit + * + * @hdev: pointer to hl_device structure + * @mm_reg_range: register address range to unsecure + * @offset: additional offset to the register address + * @pb_blocks: blocks array + * @sgs_array: pb array + * @array_size: blocks array size + * + */ +static int hl_unsecure_register_range(struct hl_device *hdev, + struct range mm_reg_range, int offset, const u32 pb_blocks[], + struct hl_block_glbl_sec sgs_array[], + int array_size) +{ + u32 reg_offset; + int i, block_num, rc = 0; + + block_num = hl_get_pb_block(hdev, + mm_reg_range.start + offset, pb_blocks, + array_size); + if (block_num < 0) + return block_num; + + for (i = mm_reg_range.start ; i <= mm_reg_range.end ; i += 4) { + reg_offset = (i + offset) - 
pb_blocks[block_num]; + rc |= hl_unset_pb_in_block(hdev, reg_offset, + &sgs_array[block_num]); + } + + return rc; +} + +/** + * hl_unsecure_registers - locate the relevant block for all registers and + * remove corresponding protection bit + * + * @hdev: pointer to hl_device structure + * @mm_reg_array: register address array to unsecure + * @mm_array_size: register array size + * @offset: additional offset to the register address + * @pb_blocks: blocks array + * @sgs_array: pb array + * @blocks_array_size: blocks array size + * + */ +int hl_unsecure_registers(struct hl_device *hdev, const u32 mm_reg_array[], + int mm_array_size, int offset, const u32 pb_blocks[], + struct hl_block_glbl_sec sgs_array[], int blocks_array_size) +{ + int i, rc = 0; + + for (i = 0 ; i < mm_array_size ; i++) { + rc = hl_unsecure_register(hdev, mm_reg_array[i], offset, + pb_blocks, sgs_array, blocks_array_size); + + if (rc) + return rc; + } + + return rc; +} + +/** + * hl_unsecure_registers_range - locate the relevant block for all register + * ranges and remove corresponding protection bit + * + * @hdev: pointer to hl_device structure + * @mm_reg_range_array: register address range array to unsecure + * @mm_array_size: register array size + * @offset: additional offset to the register address + * @pb_blocks: blocks array + * @sgs_array: pb array + * @blocks_array_size: blocks array size + * + */ +static int hl_unsecure_registers_range(struct hl_device *hdev, + const struct range mm_reg_range_array[], int mm_array_size, + int offset, const u32 pb_blocks[], + struct hl_block_glbl_sec sgs_array[], int blocks_array_size) +{ + int i, rc = 0; + + for (i = 0 ; i < mm_array_size ; i++) { + rc = hl_unsecure_register_range(hdev, mm_reg_range_array[i], + offset, pb_blocks, sgs_array, blocks_array_size); + + if (rc) + return rc; + } + + return rc; +} + +/** + * hl_ack_pb_security_violations - Ack security violation + * + * @hdev: pointer to hl_device structure + * @pb_blocks: blocks array + * @block_offset: additional offset to the block + * @array_size: blocks array size + * + */ +static void hl_ack_pb_security_violations(struct hl_device *hdev, + const u32 pb_blocks[], u32 block_offset, int array_size) +{ + int i; + u32 cause, addr, block_base; + + for (i = 0 ; i < array_size ; i++) { + block_base = pb_blocks[i] + block_offset; + cause = RREG32(block_base + HL_BLOCK_GLBL_ERR_CAUSE); + if (cause) { + addr = RREG32(block_base + HL_BLOCK_GLBL_ERR_ADDR); + hdev->asic_funcs->pb_print_security_errors(hdev, + block_base, cause, addr); + WREG32(block_base + HL_BLOCK_GLBL_ERR_CAUSE, cause); + } + } +} + +/** + * hl_config_glbl_sec - set pb in HW according to given pb array + * + * @hdev: pointer to hl_device structure + * @pb_blocks: blocks array + * @sgs_array: pb array + * @block_offset: additional offset to the block + * @array_size: blocks array size + * + */ +void hl_config_glbl_sec(struct hl_device *hdev, const u32 pb_blocks[], + struct hl_block_glbl_sec sgs_array[], u32 block_offset, + int array_size) +{ + int i, j; + u32 sgs_base; + + if (hdev->pldm) + usleep_range(100, 1000); + + for (i = 0 ; i < array_size ; i++) { + sgs_base = block_offset + pb_blocks[i] + + HL_BLOCK_GLBL_SEC_OFFS; + + for (j = 0 ; j < HL_BLOCK_GLBL_SEC_LEN ; j++) + WREG32(sgs_base + j * sizeof(u32), + sgs_array[i].sec_array[j]); + } +} + +/** + * hl_secure_block - locally memsets a block to 0 + * + * @hdev: pointer to hl_device structure + * @sgs_array: pb array to clear + * @array_size: blocks array size + * + */ +void hl_secure_block(struct 
hl_device *hdev, + struct hl_block_glbl_sec sgs_array[], int array_size) +{ + int i; + + for (i = 0 ; i < array_size ; i++) + memset((char *)(sgs_array[i].sec_array), 0, + HL_BLOCK_GLBL_SEC_SIZE); +} + +/** + * hl_init_pb_with_mask - set selected pb instances with mask in HW according + * to given configuration + * + * @hdev: pointer to hl_device structure + * @num_dcores: number of dcores to apply configuration to + * set to HL_PB_SHARED if it needs to be applied only once + * @dcore_offset: offset between dcores + * @num_instances: number of instances to apply configuration to + * @instance_offset: offset between instances + * @pb_blocks: blocks array + * @blocks_array_size: blocks array size + * @regs_array: register array + * @regs_array_size: register array size + * @mask: enabled instances mask: 1- enabled, 0- disabled + */ +int hl_init_pb_with_mask(struct hl_device *hdev, u32 num_dcores, + u32 dcore_offset, u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const u32 *regs_array, u32 regs_array_size, u64 mask) +{ + int i, j; + struct hl_block_glbl_sec *glbl_sec; + + glbl_sec = kcalloc(blocks_array_size, + sizeof(struct hl_block_glbl_sec), + GFP_KERNEL); + if (!glbl_sec) + return -ENOMEM; + + hl_secure_block(hdev, glbl_sec, blocks_array_size); + hl_unsecure_registers(hdev, regs_array, regs_array_size, 0, pb_blocks, + glbl_sec, blocks_array_size); + + /* Fill all blocks with the same configuration */ + for (i = 0 ; i < num_dcores ; i++) { + for (j = 0 ; j < num_instances ; j++) { + int seq = i * num_instances + j; + + if (!(mask & BIT_ULL(seq))) + continue; + + hl_config_glbl_sec(hdev, pb_blocks, glbl_sec, + i * dcore_offset + j * instance_offset, + blocks_array_size); + } + } + + kfree(glbl_sec); + + return 0; +} + +/** + * hl_init_pb - set pb in HW according to given configuration + * + * @hdev: pointer to hl_device structure + * @num_dcores: number of dcores to apply configuration to + * set to HL_PB_SHARED if it needs to be applied only once + * @dcore_offset: offset between dcores + * @num_instances: number of instances to apply configuration to + * @instance_offset: offset between instances + * @pb_blocks: blocks array + * @blocks_array_size: blocks array size + * @regs_array: register array + * @regs_array_size: register array size + * + */ +int hl_init_pb(struct hl_device *hdev, u32 num_dcores, u32 dcore_offset, + u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const u32 *regs_array, u32 regs_array_size) +{ + return hl_init_pb_with_mask(hdev, num_dcores, dcore_offset, + num_instances, instance_offset, pb_blocks, + blocks_array_size, regs_array, regs_array_size, + ULLONG_MAX); +} + +/** + * hl_init_pb_ranges_with_mask - set pb instances using mask in HW according to + * given configuration unsecuring registers + * ranges instead of specific registers + * + * @hdev: pointer to hl_device structure + * @num_dcores: number of dcores to apply configuration to + * set to HL_PB_SHARED if it needs to be applied only once + * @dcore_offset: offset between dcores + * @num_instances: number of instances to apply configuration to + * @instance_offset: offset between instances + * @pb_blocks: blocks array + * @blocks_array_size: blocks array size + * @regs_range_array: register range array + * @regs_range_array_size: register range array size + * @mask: enabled instances mask: 1- enabled, 0- disabled + */ +int hl_init_pb_ranges_with_mask(struct hl_device *hdev, u32 num_dcores, + u32 dcore_offset, u32 num_instances, u32
instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const struct range *regs_range_array, u32 regs_range_array_size, + u64 mask) +{ + int i, j, rc = 0; + struct hl_block_glbl_sec *glbl_sec; + + glbl_sec = kcalloc(blocks_array_size, + sizeof(struct hl_block_glbl_sec), + GFP_KERNEL); + if (!glbl_sec) + return -ENOMEM; + + hl_secure_block(hdev, glbl_sec, blocks_array_size); + rc = hl_unsecure_registers_range(hdev, regs_range_array, + regs_range_array_size, 0, pb_blocks, glbl_sec, + blocks_array_size); + if (rc) + goto free_glbl_sec; + + /* Fill all blocks with the same configuration */ + for (i = 0 ; i < num_dcores ; i++) { + for (j = 0 ; j < num_instances ; j++) { + int seq = i * num_instances + j; + + if (!(mask & BIT_ULL(seq))) + continue; + + hl_config_glbl_sec(hdev, pb_blocks, glbl_sec, + i * dcore_offset + j * instance_offset, + blocks_array_size); + } + } + +free_glbl_sec: + kfree(glbl_sec); + + return rc; +} + +/** + * hl_init_pb_ranges - set pb in HW according to given configuration unsecuring + * registers ranges instead of specific registers + * + * @hdev: pointer to hl_device structure + * @num_dcores: number of dcores to apply configuration to + * set to HL_PB_SHARED if it needs to be applied only once + * @dcore_offset: offset between dcores + * @num_instances: number of instances to apply configuration to + * @instance_offset: offset between instances + * @pb_blocks: blocks array + * @blocks_array_size: blocks array size + * @regs_range_array: register range array + * @regs_range_array_size: register range array size + * + */ +int hl_init_pb_ranges(struct hl_device *hdev, u32 num_dcores, + u32 dcore_offset, u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const struct range *regs_range_array, u32 regs_range_array_size) +{ + return hl_init_pb_ranges_with_mask(hdev, num_dcores, dcore_offset, + num_instances, instance_offset, pb_blocks, + blocks_array_size, regs_range_array, + regs_range_array_size, ULLONG_MAX); +} + +/** + * hl_init_pb_single_dcore - set pb for a single dcore in HW + * according to given configuration + * + * @hdev: pointer to hl_device structure + * @dcore_offset: offset from the dcore0 + * @num_instances: number of instances to apply configuration to + * @instance_offset: offset between instances + * @pb_blocks: blocks array + * @blocks_array_size: blocks array size + * @regs_array: register array + * @regs_array_size: register array size + * + */ +int hl_init_pb_single_dcore(struct hl_device *hdev, u32 dcore_offset, + u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const u32 *regs_array, u32 regs_array_size) +{ + int i, rc = 0; + struct hl_block_glbl_sec *glbl_sec; + + glbl_sec = kcalloc(blocks_array_size, + sizeof(struct hl_block_glbl_sec), + GFP_KERNEL); + if (!glbl_sec) + return -ENOMEM; + + hl_secure_block(hdev, glbl_sec, blocks_array_size); + rc = hl_unsecure_registers(hdev, regs_array, regs_array_size, 0, + pb_blocks, glbl_sec, blocks_array_size); + if (rc) + goto free_glbl_sec; + + /* Fill all blocks with the same configuration */ + for (i = 0 ; i < num_instances ; i++) + hl_config_glbl_sec(hdev, pb_blocks, glbl_sec, + dcore_offset + i * instance_offset, + blocks_array_size); + +free_glbl_sec: + kfree(glbl_sec); + + return rc; +} + +/** + * hl_init_pb_ranges_single_dcore - set pb for a single dcore in HW according + * to given configuration unsecuring + * registers ranges instead of specific + * registers + * + * @hdev: pointer to hl_device structure + *
@dcore_offset: offset from the dcore0 + * @num_instances: number of instances to apply configuration to + * @instance_offset: offset between instances + * @pb_blocks: blocks array + * @blocks_array_size: blocks array size + * @regs_range_array: register range array + * @regs_range_array_size: register range array size + * + */ +int hl_init_pb_ranges_single_dcore(struct hl_device *hdev, u32 dcore_offset, + u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, + const struct range *regs_range_array, u32 regs_range_array_size) +{ + int i; + struct hl_block_glbl_sec *glbl_sec; + + glbl_sec = kcalloc(blocks_array_size, + sizeof(struct hl_block_glbl_sec), + GFP_KERNEL); + if (!glbl_sec) + return -ENOMEM; + + hl_secure_block(hdev, glbl_sec, blocks_array_size); + hl_unsecure_registers_range(hdev, regs_range_array, + regs_range_array_size, 0, pb_blocks, glbl_sec, + blocks_array_size); + + /* Fill all blocks with the same configuration */ + for (i = 0 ; i < num_instances ; i++) + hl_config_glbl_sec(hdev, pb_blocks, glbl_sec, + dcore_offset + i * instance_offset, + blocks_array_size); + + kfree(glbl_sec); + + return 0; +} + +/** + * hl_ack_pb_with_mask - ack pb with mask in HW according to given configuration + * + * @hdev: pointer to hl_device structure + * @num_dcores: number of dcores to apply configuration to + * set to HL_PB_SHARED if it needs to be applied only once + * @dcore_offset: offset between dcores + * @num_instances: number of instances to apply configuration to + * @instance_offset: offset between instances + * @pb_blocks: blocks array + * @blocks_array_size: blocks array size + * @mask: enabled instances mask: 1- enabled, 0- disabled + * + */ +void hl_ack_pb_with_mask(struct hl_device *hdev, u32 num_dcores, + u32 dcore_offset, u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size, u64 mask) +{ + int i, j; + + /* ack all blocks */ + for (i = 0 ; i < num_dcores ; i++) { + for (j = 0 ; j < num_instances ; j++) { + int seq = i * num_instances + j; + + if (!(mask & BIT_ULL(seq))) + continue; + + hl_ack_pb_security_violations(hdev, pb_blocks, + i * dcore_offset + j * instance_offset, + blocks_array_size); + } + } +} + +/** + * hl_ack_pb - ack pb in HW according to given configuration + * + * @hdev: pointer to hl_device structure + * @num_dcores: number of dcores to apply configuration to + * set to HL_PB_SHARED if it needs to be applied only once + * @dcore_offset: offset between dcores + * @num_instances: number of instances to apply configuration to + * @instance_offset: offset between instances + * @pb_blocks: blocks array + * @blocks_array_size: blocks array size + * + */ +void hl_ack_pb(struct hl_device *hdev, u32 num_dcores, u32 dcore_offset, + u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size) +{ + hl_ack_pb_with_mask(hdev, num_dcores, dcore_offset, num_instances, + instance_offset, pb_blocks, blocks_array_size, + ULLONG_MAX); +} + +/** + * hl_ack_pb_single_dcore - ack pb for single dcore in HW + * according to given configuration + * + * @hdev: pointer to hl_device structure + * @dcore_offset: offset from dcore0 + * @num_instances: number of instances to apply configuration to + * @instance_offset: offset between instances + * @pb_blocks: blocks array + * @blocks_array_size: blocks array size + * + */ +void hl_ack_pb_single_dcore(struct hl_device *hdev, u32 dcore_offset, + u32 num_instances, u32 instance_offset, + const u32 pb_blocks[], u32 blocks_array_size) +{ + int i; + + /* ack all blocks */
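+ /* Read, report and clear any recorded violation cause in each instance of this dcore */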
+ for (i = 0 ; i < num_instances ; i++) + hl_ack_pb_security_violations(hdev, pb_blocks, + dcore_offset + i * instance_offset, + blocks_array_size); + +} diff --git a/drivers/misc/habanalabs/common/state_dump.c b/drivers/misc/habanalabs/common/state_dump.c new file mode 100644 index 000000000..74726907c --- /dev/null +++ b/drivers/misc/habanalabs/common/state_dump.c @@ -0,0 +1,718 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2021 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include <linux/vmalloc.h> +#include <uapi/misc/habanalabs.h> +#include "habanalabs.h" + +/** + * hl_format_as_binary - helper function, format an integer as binary + * using supplied scratch buffer + * @buf: the buffer to use + * @buf_len: buffer capacity + * @n: number to format + * + * Returns pointer to buffer + */ +char *hl_format_as_binary(char *buf, size_t buf_len, u32 n) +{ + int i; + u32 bit; + bool leading0 = true; + char *wrptr = buf; + + if (buf_len > 0 && buf_len < 3) { + *wrptr = '\0'; + return buf; + } + + wrptr[0] = '0'; + wrptr[1] = 'b'; + wrptr += 2; + /* Remove 3 characters from length for '0b' and '\0' termination */ + buf_len -= 3; + + for (i = 0; i < sizeof(n) * BITS_PER_BYTE && buf_len; ++i, n <<= 1) { + /* Writing bit calculation in one line would cause a false + * positive static code analysis error, so splitting. + */ + bit = n & (1 << (sizeof(n) * BITS_PER_BYTE - 1)); + bit = !!bit; + leading0 &= !bit; + if (!leading0) { + *wrptr = '0' + bit; + ++wrptr; + } + } + + *wrptr = '\0'; + + return buf; +} + +/** + * resize_to_fit - helper function, resize buffer to fit given amount of data + * @buf: destination buffer double pointer + * @size: pointer to the size container + * @desired_size: size the buffer must contain + * + * Returns 0 if the buffer already fits desired_size, 1 if it had to be + * resized, or a negative error code on failure. + * On success, the size of buffer is at least desired_size. Buffer is allocated + * via vmalloc and must be freed with vfree. + */ +static int resize_to_fit(char **buf, size_t *size, size_t desired_size) +{ + char *resized_buf; + size_t new_size; + + if (*size >= desired_size) + return 0; + + /* Not enough space to print all, have to resize */ + new_size = max_t(size_t, PAGE_SIZE, round_up(desired_size, PAGE_SIZE)); + resized_buf = vmalloc(new_size); + if (!resized_buf) + return -ENOMEM; + memcpy(resized_buf, *buf, *size); + vfree(*buf); + *buf = resized_buf; + *size = new_size; + + return 1; +} + +/** + * hl_snprintf_resize() - print formatted data to buffer, resize as needed + * @buf: buffer double pointer, to be written to and resized, must be either + * NULL or allocated with vmalloc. + * @size: current size of the buffer + * @offset: current offset to write to + * @format: format of the data + * + * This function will write formatted data into the buffer. If buffer is not + * large enough, it will be resized using vmalloc. Size may be modified if the + * buffer was resized; offset will be advanced by the number of bytes written, + * not including the terminating character. + * + * Returns 0 on success or error code on failure + * + * Note that the buffer has to be manually released using vfree. + */ +int hl_snprintf_resize(char **buf, size_t *size, size_t *offset, + const char *format, ...)
+{ + va_list args; + size_t length; + int rc; + + if (*buf == NULL && (*size != 0 || *offset != 0)) + return -EINVAL; + + va_start(args, format); + length = vsnprintf(*buf + *offset, *size - *offset, format, args); + va_end(args); + + rc = resize_to_fit(buf, size, *offset + length + 1); + if (rc < 0) + return rc; + else if (rc > 0) { + /* Resize was needed, write again */ + va_start(args, format); + length = vsnprintf(*buf + *offset, *size - *offset, format, + args); + va_end(args); + } + + *offset += length; + + return 0; +} + +/** + * hl_sync_engine_to_string - convert engine type enum to string literal + * @engine_type: engine type (TPC/MME/DMA) + * + * Return the resolved string literal + */ +const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type) +{ + switch (engine_type) { + case ENGINE_DMA: + return "DMA"; + case ENGINE_MME: + return "MME"; + case ENGINE_TPC: + return "TPC"; + } + return "Invalid Engine Type"; +} + +/** + * hl_print_resize_sync_engine - helper function, format engine name and ID + * using hl_snprintf_resize + * @buf: destination buffer double pointer to be used with hl_snprintf_resize + * @size: pointer to the size container + * @offset: pointer to the offset container + * @engine_type: engine type (TPC/MME/DMA) + * @engine_id: engine numerical id + * + * Returns 0 on success or error code on failure + */ +static int hl_print_resize_sync_engine(char **buf, size_t *size, size_t *offset, + enum hl_sync_engine_type engine_type, + u32 engine_id) +{ + return hl_snprintf_resize(buf, size, offset, "%s%u", + hl_sync_engine_to_string(engine_type), engine_id); +} + +/** + * hl_state_dump_get_sync_name - transform sync object id to name if available + * @hdev: pointer to the device + * @sync_id: sync object id + * + * Returns a name literal or NULL if not resolved. + * Note: returning NULL shall not be considered as a failure, as not all + * sync objects are named. + */ +const char *hl_state_dump_get_sync_name(struct hl_device *hdev, u32 sync_id) +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + struct hl_hw_obj_name_entry *entry; + + hash_for_each_possible(sds->so_id_to_str_tb, entry, + node, sync_id) + if (sync_id == entry->id) + return entry->name; + + return NULL; +} + +/** + * hl_state_dump_get_monitor_name - transform monitor object dump to monitor + * name if available + * @hdev: pointer to the device + * @mon: monitor state dump + * + * Returns a name literal or NULL if not resolved. + * Note: returning NULL shall not be considered as a failure, as not all + * monitors are named. + */ +const char *hl_state_dump_get_monitor_name(struct hl_device *hdev, + struct hl_mon_state_dump *mon) +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + struct hl_hw_obj_name_entry *entry; + + hash_for_each_possible(sds->monitor_id_to_str_tb, + entry, node, mon->id) + if (mon->id == entry->id) + return entry->name; + + return NULL; +} + +/** + * hl_state_dump_free_sync_to_engine_map - free sync object to engine map + * @map: sync object to engine map + * + * Note: generic free implementation, the allocation is implemented per ASIC. 
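+ * Callers such as hl_state_dump_print_syncs() invoke this once they are done with the map.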
+ */ +void hl_state_dump_free_sync_to_engine_map(struct hl_sync_to_engine_map *map) +{ + struct hl_sync_to_engine_map_entry *entry; + struct hlist_node *tmp_node; + int i; + + hash_for_each_safe(map->tb, i, tmp_node, entry, node) { + hash_del(&entry->node); + kfree(entry); + } +} + +/** + * hl_state_dump_get_sync_to_engine - transform sync_id to + * hl_sync_to_engine_map_entry if available for current id + * @map: sync object to engine map + * @sync_id: sync object id + * + * Returns the translation entry if found or NULL if not. + * Note: a returned NULL shall not be considered a failure, as the map does not + * cover all possible sync ids; it is a best-effort mapping. + */ +static struct hl_sync_to_engine_map_entry * +hl_state_dump_get_sync_to_engine(struct hl_sync_to_engine_map *map, u32 sync_id) +{ + struct hl_sync_to_engine_map_entry *entry; + + hash_for_each_possible(map->tb, entry, node, sync_id) + if (entry->sync_id == sync_id) + return entry; + return NULL; +} + +/** + * hl_state_dump_read_sync_objects - read sync objects array + * @hdev: pointer to the device + * @index: sync manager block index starting with E_N + * + * Returns array of size SP_SYNC_OBJ_AMOUNT on success or NULL on failure + */ +static u32 *hl_state_dump_read_sync_objects(struct hl_device *hdev, u32 index) +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + u32 *sync_objects; + s64 base_addr; /* Base addr can be negative */ + int i; + + base_addr = sds->props[SP_SYNC_OBJ_BASE_ADDR] + + sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index; + + sync_objects = vmalloc(sds->props[SP_SYNC_OBJ_AMOUNT] * sizeof(u32)); + if (!sync_objects) + return NULL; + + for (i = 0; i < sds->props[SP_SYNC_OBJ_AMOUNT]; ++i) + sync_objects[i] = RREG32(base_addr + i * sizeof(u32)); + + return sync_objects; +} + +/** + * hl_state_dump_free_sync_objects - free sync objects array allocated by + * hl_state_dump_read_sync_objects + * @sync_objects: sync objects array + */ +static void hl_state_dump_free_sync_objects(u32 *sync_objects) +{ + vfree(sync_objects); +} + + +/** + * hl_state_dump_print_syncs_single_block - print active sync objects on a + * single block + * @hdev: pointer to the device + * @index: sync manager block index starting with E_N + * @buf: destination buffer double pointer to be used with hl_snprintf_resize + * @size: pointer to the size container + * @offset: pointer to the offset container + * @map: sync engines names map + * + * Returns 0 on success or error code on failure + */ +static int +hl_state_dump_print_syncs_single_block(struct hl_device *hdev, u32 index, + char **buf, size_t *size, size_t *offset, + struct hl_sync_to_engine_map *map) +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + const char *sync_name; + u32 *sync_objects = NULL; + int rc = 0, i; + + if (sds->sync_namager_names) { + rc = hl_snprintf_resize( + buf, size, offset, "%s\n", + sds->sync_namager_names[index]); + if (rc) + goto out; + } + + sync_objects = hl_state_dump_read_sync_objects(hdev, index); + if (!sync_objects) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < sds->props[SP_SYNC_OBJ_AMOUNT]; ++i) { + struct hl_sync_to_engine_map_entry *entry; + u64 sync_object_addr; + + if (!sync_objects[i]) + continue; + + sync_object_addr = sds->props[SP_SYNC_OBJ_BASE_ADDR] + + sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index + + i * sizeof(u32); + + rc = hl_snprintf_resize(buf, size, offset, "sync id: %u", i); + if (rc) + goto free_sync_objects; + sync_name = hl_state_dump_get_sync_name(hdev, i); + if (sync_name) { + rc =
hl_snprintf_resize(buf, size, offset, " %s", + sync_name); + if (rc) + goto free_sync_objects; + } + rc = hl_snprintf_resize(buf, size, offset, ", value: %u", + sync_objects[i]); + if (rc) + goto free_sync_objects; + + /* Append engine string */ + entry = hl_state_dump_get_sync_to_engine(map, + (u32)sync_object_addr); + if (entry) { + rc = hl_snprintf_resize(buf, size, offset, + ", Engine: "); + if (rc) + goto free_sync_objects; + rc = hl_print_resize_sync_engine(buf, size, offset, + entry->engine_type, + entry->engine_id); + if (rc) + goto free_sync_objects; + } + + rc = hl_snprintf_resize(buf, size, offset, "\n"); + if (rc) + goto free_sync_objects; + } + +free_sync_objects: + hl_state_dump_free_sync_objects(sync_objects); +out: + return rc; +} + +/** + * hl_state_dump_print_syncs - print active sync objects + * @hdev: pointer to the device + * @buf: destination buffer double pointer to be used with hl_snprintf_resize + * @size: pointer to the size container + * @offset: pointer to the offset container + * + * Returns 0 on success or error code on failure + */ +static int hl_state_dump_print_syncs(struct hl_device *hdev, + char **buf, size_t *size, + size_t *offset) + +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + struct hl_sync_to_engine_map *map; + u32 index; + int rc = 0; + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + rc = sds->funcs.gen_sync_to_engine_map(hdev, map); + if (rc) + goto free_map_mem; + + rc = hl_snprintf_resize(buf, size, offset, "Non zero sync objects:\n"); + if (rc) + goto out; + + if (sds->sync_namager_names) { + for (index = 0; sds->sync_namager_names[index]; ++index) { + rc = hl_state_dump_print_syncs_single_block( + hdev, index, buf, size, offset, map); + if (rc) + goto out; + } + } else { + for (index = 0; index < sds->props[SP_NUM_CORES]; ++index) { + rc = hl_state_dump_print_syncs_single_block( + hdev, index, buf, size, offset, map); + if (rc) + goto out; + } + } + +out: + hl_state_dump_free_sync_to_engine_map(map); +free_map_mem: + kfree(map); + + return rc; +} + +/** + * hl_state_dump_alloc_read_sm_block_monitors - read monitors for a specific + * block + * @hdev: pointer to the device + * @index: sync manager block index starting with E_N + * + * Returns an array of monitor data of size SP_MONITORS_AMOUNT or NULL + * on error + */ +static struct hl_mon_state_dump * +hl_state_dump_alloc_read_sm_block_monitors(struct hl_device *hdev, u32 index) +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + struct hl_mon_state_dump *monitors; + s64 base_addr; /* Base addr can be negative */ + int i; + + monitors = vmalloc(sds->props[SP_MONITORS_AMOUNT] * + sizeof(struct hl_mon_state_dump)); + if (!monitors) + return NULL; + + base_addr = sds->props[SP_NEXT_SYNC_OBJ_ADDR] * index; + + for (i = 0; i < sds->props[SP_MONITORS_AMOUNT]; ++i) { + monitors[i].id = i; + monitors[i].wr_addr_low = + RREG32(base_addr + sds->props[SP_MON_OBJ_WR_ADDR_LOW] + + i * sizeof(u32)); + + monitors[i].wr_addr_high = + RREG32(base_addr + sds->props[SP_MON_OBJ_WR_ADDR_HIGH] + + i * sizeof(u32)); + + monitors[i].wr_data = + RREG32(base_addr + sds->props[SP_MON_OBJ_WR_DATA] + + i * sizeof(u32)); + + monitors[i].arm_data = + RREG32(base_addr + sds->props[SP_MON_OBJ_ARM_DATA] + + i * sizeof(u32)); + + monitors[i].status = + RREG32(base_addr + sds->props[SP_MON_OBJ_STATUS] + + i * sizeof(u32)); + } + + return monitors; +} + +/** + * hl_state_dump_free_monitors - free the monitors structure + * @monitors: monitors array created with + * 
hl_state_dump_alloc_read_sm_block_monitors + */ +static void hl_state_dump_free_monitors(struct hl_mon_state_dump *monitors) +{ + vfree(monitors); +} + +/** + * hl_state_dump_print_monitors_single_block - print active monitors on a + * single block + * @hdev: pointer to the device + * @index: sync manager block index starting with E_N + * @buf: destination buffer double pointer to be used with hl_snprintf_resize + * @size: pointer to the size container + * @offset: pointer to the offset container + * + * Returns 0 on success or error code on failure + */ +static int hl_state_dump_print_monitors_single_block(struct hl_device *hdev, + u32 index, + char **buf, size_t *size, + size_t *offset) +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + struct hl_mon_state_dump *monitors = NULL; + int rc = 0, i; + + if (sds->sync_namager_names) { + rc = hl_snprintf_resize( + buf, size, offset, "%s\n", + sds->sync_namager_names[index]); + if (rc) + goto out; + } + + monitors = hl_state_dump_alloc_read_sm_block_monitors(hdev, index); + if (!monitors) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < sds->props[SP_MONITORS_AMOUNT]; ++i) { + if (!(sds->funcs.monitor_valid(&monitors[i]))) + continue; + + /* Monitor is valid, dump it */ + rc = sds->funcs.print_single_monitor(buf, size, offset, hdev, + &monitors[i]); + if (rc) + goto free_monitors; + + hl_snprintf_resize(buf, size, offset, "\n"); + } + +free_monitors: + hl_state_dump_free_monitors(monitors); +out: + return rc; +} + +/** + * hl_state_dump_print_monitors - print active monitors + * @hdev: pointer to the device + * @buf: destination buffer double pointer to be used with hl_snprintf_resize + * @size: pointer to the size container + * @offset: pointer to the offset container + * + * Returns 0 on success or error code on failure + */ +static int hl_state_dump_print_monitors(struct hl_device *hdev, + char **buf, size_t *size, + size_t *offset) +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + u32 index; + int rc = 0; + + rc = hl_snprintf_resize(buf, size, offset, + "Valid (armed) monitor objects:\n"); + if (rc) + goto out; + + if (sds->sync_namager_names) { + for (index = 0; sds->sync_namager_names[index]; ++index) { + rc = hl_state_dump_print_monitors_single_block( + hdev, index, buf, size, offset); + if (rc) + goto out; + } + } else { + for (index = 0; index < sds->props[SP_NUM_CORES]; ++index) { + rc = hl_state_dump_print_monitors_single_block( + hdev, index, buf, size, offset); + if (rc) + goto out; + } + } + +out: + return rc; +} + +/** + * hl_state_dump_print_engine_fences - print active fences for a specific + * engine + * @hdev: pointer to the device + * @engine_type: engine type to use + * @buf: destination buffer double pointer to be used with hl_snprintf_resize + * @size: pointer to the size container + * @offset: pointer to the offset container + */ +static int +hl_state_dump_print_engine_fences(struct hl_device *hdev, + enum hl_sync_engine_type engine_type, + char **buf, size_t *size, size_t *offset) +{ + struct hl_state_dump_specs *sds = &hdev->state_dump_specs; + int rc = 0, i, n_fences; + u64 base_addr, next_fence; + + switch (engine_type) { + case ENGINE_TPC: + n_fences = sds->props[SP_NUM_OF_TPC_ENGINES]; + base_addr = sds->props[SP_TPC0_CMDQ]; + next_fence = sds->props[SP_NEXT_TPC]; + break; + case ENGINE_MME: + n_fences = sds->props[SP_NUM_OF_MME_ENGINES]; + base_addr = sds->props[SP_MME_CMDQ]; + next_fence = sds->props[SP_NEXT_MME]; + break; + case ENGINE_DMA: + n_fences = 
sds->props[SP_NUM_OF_DMA_ENGINES]; + base_addr = sds->props[SP_DMA_CMDQ]; + next_fence = sds->props[SP_DMA_QUEUES_OFFSET]; + break; + default: + return -EINVAL; + } + for (i = 0; i < n_fences; ++i) { + rc = sds->funcs.print_fences_single_engine( + hdev, + base_addr + next_fence * i + + sds->props[SP_FENCE0_CNT_OFFSET], + base_addr + next_fence * i + + sds->props[SP_CP_STS_OFFSET], + engine_type, i, buf, size, offset); + if (rc) + goto out; + } +out: + return rc; +} + +/** + * hl_state_dump_print_fences - print active fences + * @hdev: pointer to the device + * @buf: destination buffer double pointer to be used with hl_snprintf_resize + * @size: pointer to the size container + * @offset: pointer to the offset container + */ +static int hl_state_dump_print_fences(struct hl_device *hdev, char **buf, + size_t *size, size_t *offset) +{ + int rc = 0; + + rc = hl_snprintf_resize(buf, size, offset, "Valid (armed) fences:\n"); + if (rc) + goto out; + + rc = hl_state_dump_print_engine_fences(hdev, ENGINE_TPC, buf, size, offset); + if (rc) + goto out; + + rc = hl_state_dump_print_engine_fences(hdev, ENGINE_MME, buf, size, offset); + if (rc) + goto out; + + rc = hl_state_dump_print_engine_fences(hdev, ENGINE_DMA, buf, size, offset); + if (rc) + goto out; + +out: + return rc; +} + +/** + * hl_state_dump() - dump system state + * @hdev: pointer to device structure + */ +int hl_state_dump(struct hl_device *hdev) +{ + char *buf = NULL; + size_t offset = 0, size = 0; + int rc; + + rc = hl_snprintf_resize(&buf, &size, &offset, + "Timestamp taken on: %llu\n\n", + ktime_to_ns(ktime_get())); + if (rc) + goto err; + + rc = hl_state_dump_print_syncs(hdev, &buf, &size, &offset); + if (rc) + goto err; + + hl_snprintf_resize(&buf, &size, &offset, "\n"); + + rc = hl_state_dump_print_monitors(hdev, &buf, &size, &offset); + if (rc) + goto err; + + hl_snprintf_resize(&buf, &size, &offset, "\n"); + + rc = hl_state_dump_print_fences(hdev, &buf, &size, &offset); + if (rc) + goto err; + + hl_snprintf_resize(&buf, &size, &offset, "\n"); + + hl_debugfs_set_state_dump(hdev, buf, size); + + return 0; +err: + vfree(buf); + return rc; +} diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c new file mode 100644 index 000000000..36e981413 --- /dev/null +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -0,0 +1,514 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2022 HabanaLabs, Ltd. + * All Rights Reserved. 
+ */ + +#include "habanalabs.h" + +#include <linux/pci.h> + +static ssize_t clk_max_freq_mhz_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + long value; + + if (!hl_device_operational(hdev, NULL)) + return -ENODEV; + + value = hl_fw_get_frequency(hdev, hdev->asic_prop.clk_pll_index, false); + if (value < 0) + return value; + + hdev->asic_prop.max_freq_value = value; + + return sprintf(buf, "%lu\n", (value / 1000 / 1000)); +} + +static ssize_t clk_max_freq_mhz_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + int rc; + u64 value; + + if (!hl_device_operational(hdev, NULL)) { + count = -ENODEV; + goto fail; + } + + rc = kstrtoull(buf, 0, &value); + if (rc) { + count = -EINVAL; + goto fail; + } + + hdev->asic_prop.max_freq_value = value * 1000 * 1000; + + hl_fw_set_frequency(hdev, hdev->asic_prop.clk_pll_index, hdev->asic_prop.max_freq_value); + +fail: + return count; +} + +static ssize_t clk_cur_freq_mhz_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + long value; + + if (!hl_device_operational(hdev, NULL)) + return -ENODEV; + + value = hl_fw_get_frequency(hdev, hdev->asic_prop.clk_pll_index, true); + if (value < 0) + return value; + + return sprintf(buf, "%lu\n", (value / 1000 / 1000)); +} + +static DEVICE_ATTR_RW(clk_max_freq_mhz); +static DEVICE_ATTR_RO(clk_cur_freq_mhz); + +static struct attribute *hl_dev_clk_attrs[] = { + &dev_attr_clk_max_freq_mhz.attr, + &dev_attr_clk_cur_freq_mhz.attr, + NULL, +}; + +static ssize_t vrm_ver_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + struct cpucp_info *cpucp_info; + + cpucp_info = &hdev->asic_prop.cpucp_info; + + if (cpucp_info->infineon_second_stage_version) + return sprintf(buf, "%#04x %#04x\n", le32_to_cpu(cpucp_info->infineon_version), + le32_to_cpu(cpucp_info->infineon_second_stage_version)); + else + return sprintf(buf, "%#04x\n", le32_to_cpu(cpucp_info->infineon_version)); +} + +static DEVICE_ATTR_RO(vrm_ver); + +static struct attribute *hl_dev_vrm_attrs[] = { + &dev_attr_vrm_ver.attr, + NULL, +}; + +static ssize_t uboot_ver_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", hdev->asic_prop.uboot_ver); +} + +static ssize_t armcp_kernel_ver_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.kernel_version); +} + +static ssize_t armcp_ver_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.cpucp_version); +} + +static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "0x%08x\n", + le32_to_cpu(hdev->asic_prop.cpucp_info.cpld_version)); +} + +static ssize_t cpucp_kernel_ver_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.kernel_version); +} + +static ssize_t cpucp_ver_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct 
hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.cpucp_version); +} + +static ssize_t fuse_ver_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.fuse_version); +} + +static ssize_t thermal_ver_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.thermal_version); +} + +static ssize_t fw_os_ver_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.fw_os_version); +} + +static ssize_t preboot_btl_ver_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", hdev->asic_prop.preboot_ver); +} + +static ssize_t soft_reset_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + long value; + int rc; + + rc = kstrtoul(buf, 0, &value); + + if (rc) { + count = -EINVAL; + goto out; + } + + if (!hdev->asic_prop.allow_inference_soft_reset) { + dev_err(hdev->dev, "Device does not support inference soft-reset\n"); + goto out; + } + + dev_warn(hdev->dev, "Inference Soft-Reset requested through sysfs\n"); + + hl_device_reset(hdev, 0); + +out: + return count; +} + +static ssize_t hard_reset_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + long value; + int rc; + + rc = kstrtoul(buf, 0, &value); + + if (rc) { + count = -EINVAL; + goto out; + } + + dev_warn(hdev->dev, "Hard-Reset requested through sysfs\n"); + + hl_device_reset(hdev, HL_DRV_RESET_HARD); + +out: + return count; +} + +static ssize_t device_type_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + char *str; + + switch (hdev->asic_type) { + case ASIC_GOYA: + str = "GOYA"; + break; + case ASIC_GAUDI: + str = "GAUDI"; + break; + case ASIC_GAUDI_SEC: + str = "GAUDI SEC"; + break; + case ASIC_GAUDI2: + str = "GAUDI2"; + break; + case ASIC_GAUDI2_SEC: + str = "GAUDI2 SEC"; + break; + default: + dev_err(hdev->dev, "Unrecognized ASIC type %d\n", + hdev->asic_type); + return -EINVAL; + } + + return sprintf(buf, "%s\n", str); +} + +static ssize_t pci_addr_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%04x:%02x:%02x.%x\n", + pci_domain_nr(hdev->pdev->bus), + hdev->pdev->bus->number, + PCI_SLOT(hdev->pdev->devfn), + PCI_FUNC(hdev->pdev->devfn)); +} + +static ssize_t status_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + char str[HL_STR_MAX]; + + strscpy(str, hdev->status[hl_device_status(hdev)], HL_STR_MAX); + + /* use uppercase for backward compatibility */ + str[0] = 'A' + (str[0] - 'a'); + + return sprintf(buf, "%s\n", str); +} + +static ssize_t soft_reset_cnt_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", hdev->reset_info.compute_reset_cnt); +} + +static ssize_t hard_reset_cnt_show(struct device *dev, + struct 
device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", hdev->reset_info.hard_reset_cnt); +} + +static ssize_t max_power_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + long val; + + if (!hl_device_operational(hdev, NULL)) + return -ENODEV; + + val = hl_fw_get_max_power(hdev); + if (val < 0) + return val; + + return sprintf(buf, "%lu\n", val); +} + +static ssize_t max_power_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + unsigned long value; + int rc; + + if (!hl_device_operational(hdev, NULL)) { + count = -ENODEV; + goto out; + } + + rc = kstrtoul(buf, 0, &value); + + if (rc) { + count = -EINVAL; + goto out; + } + + hdev->max_power = value; + hl_fw_set_max_power(hdev); + +out: + return count; +} + +static ssize_t eeprom_read_handler(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t offset, + size_t max_size) +{ + struct device *dev = kobj_to_dev(kobj); + struct hl_device *hdev = dev_get_drvdata(dev); + char *data; + int rc; + + if (!hl_device_operational(hdev, NULL)) + return -ENODEV; + + if (!max_size) + return -EINVAL; + + data = kzalloc(max_size, GFP_KERNEL); + if (!data) + return -ENOMEM; + + rc = hdev->asic_funcs->get_eeprom_data(hdev, data, max_size); + if (rc) + goto out; + + memcpy(buf, data, max_size); + +out: + kfree(data); + + return max_size; +} + +static ssize_t security_enabled_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", hdev->asic_prop.fw_security_enabled); +} + +static DEVICE_ATTR_RO(armcp_kernel_ver); +static DEVICE_ATTR_RO(armcp_ver); +static DEVICE_ATTR_RO(cpld_ver); +static DEVICE_ATTR_RO(cpucp_kernel_ver); +static DEVICE_ATTR_RO(cpucp_ver); +static DEVICE_ATTR_RO(device_type); +static DEVICE_ATTR_RO(fuse_ver); +static DEVICE_ATTR_WO(hard_reset); +static DEVICE_ATTR_RO(hard_reset_cnt); +static DEVICE_ATTR_RW(max_power); +static DEVICE_ATTR_RO(pci_addr); +static DEVICE_ATTR_RO(preboot_btl_ver); +static DEVICE_ATTR_WO(soft_reset); +static DEVICE_ATTR_RO(soft_reset_cnt); +static DEVICE_ATTR_RO(status); +static DEVICE_ATTR_RO(thermal_ver); +static DEVICE_ATTR_RO(uboot_ver); +static DEVICE_ATTR_RO(fw_os_ver); +static DEVICE_ATTR_RO(security_enabled); + +static struct bin_attribute bin_attr_eeprom = { + .attr = {.name = "eeprom", .mode = (0444)}, + .size = PAGE_SIZE, + .read = eeprom_read_handler +}; + +static struct attribute *hl_dev_attrs[] = { + &dev_attr_armcp_kernel_ver.attr, + &dev_attr_armcp_ver.attr, + &dev_attr_cpld_ver.attr, + &dev_attr_cpucp_kernel_ver.attr, + &dev_attr_cpucp_ver.attr, + &dev_attr_device_type.attr, + &dev_attr_fuse_ver.attr, + &dev_attr_hard_reset.attr, + &dev_attr_hard_reset_cnt.attr, + &dev_attr_max_power.attr, + &dev_attr_pci_addr.attr, + &dev_attr_preboot_btl_ver.attr, + &dev_attr_status.attr, + &dev_attr_thermal_ver.attr, + &dev_attr_uboot_ver.attr, + &dev_attr_fw_os_ver.attr, + &dev_attr_security_enabled.attr, + NULL, +}; + +static struct bin_attribute *hl_dev_bin_attrs[] = { + &bin_attr_eeprom, + NULL +}; + +static struct attribute_group hl_dev_attr_group = { + .attrs = hl_dev_attrs, + .bin_attrs = hl_dev_bin_attrs, +}; + +static struct attribute_group hl_dev_clks_attr_group; +static struct attribute_group hl_dev_vrm_attr_group; + +static const struct 
attribute_group *hl_dev_attr_groups[] = { + &hl_dev_attr_group, + &hl_dev_clks_attr_group, + &hl_dev_vrm_attr_group, + NULL, +}; + +static struct attribute *hl_dev_inference_attrs[] = { + &dev_attr_soft_reset.attr, + &dev_attr_soft_reset_cnt.attr, + NULL, +}; + +static struct attribute_group hl_dev_inference_attr_group = { + .attrs = hl_dev_inference_attrs, +}; + +static const struct attribute_group *hl_dev_inference_attr_groups[] = { + &hl_dev_inference_attr_group, + NULL, +}; + +void hl_sysfs_add_dev_clk_attr(struct hl_device *hdev, struct attribute_group *dev_clk_attr_grp) +{ + dev_clk_attr_grp->attrs = hl_dev_clk_attrs; +} + +void hl_sysfs_add_dev_vrm_attr(struct hl_device *hdev, struct attribute_group *dev_vrm_attr_grp) +{ + dev_vrm_attr_grp->attrs = hl_dev_vrm_attrs; +} + +int hl_sysfs_init(struct hl_device *hdev) +{ + int rc; + + hdev->max_power = hdev->asic_prop.max_power_default; + + hdev->asic_funcs->add_device_attr(hdev, &hl_dev_clks_attr_group, &hl_dev_vrm_attr_group); + + rc = device_add_groups(hdev->dev, hl_dev_attr_groups); + if (rc) { + dev_err(hdev->dev, + "Failed to add groups to device, error %d\n", rc); + return rc; + } + + if (!hdev->asic_prop.allow_inference_soft_reset) + return 0; + + rc = device_add_groups(hdev->dev, hl_dev_inference_attr_groups); + if (rc) { + dev_err(hdev->dev, + "Failed to add groups to device, error %d\n", rc); + return rc; + } + + return 0; +} + +void hl_sysfs_fini(struct hl_device *hdev) +{ + device_remove_groups(hdev->dev, hl_dev_attr_groups); + + if (!hdev->asic_prop.allow_inference_soft_reset) + return; + + device_remove_groups(hdev->dev, hl_dev_inference_attr_groups); +} |