Diffstat (limited to 'drivers/infiniband/hw/mlx5/umr.c')
-rw-r--r--   drivers/infiniband/hw/mlx5/umr.c   762
1 file changed, 762 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
new file mode 100644
index 0000000000..234bf30db7
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/umr.c
@@ -0,0 +1,762 @@

// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */

#include <rdma/ib_umem_odp.h>
#include "mlx5_ib.h"
#include "umr.h"
#include "wr.h"

/*
 * We can't use an array for xlt_emergency_page because dma_map_single doesn't
 * work on kernel module memory.
 */
void *xlt_emergency_page;
static DEFINE_MUTEX(xlt_emergency_page_mutex);

static __be64 get_umr_enable_mr_mask(void)
{
	u64 result;

	result = MLX5_MKEY_MASK_KEY |
		 MLX5_MKEY_MASK_FREE;

	return cpu_to_be64(result);
}

static __be64 get_umr_disable_mr_mask(void)
{
	u64 result;

	result = MLX5_MKEY_MASK_FREE;

	return cpu_to_be64(result);
}

static __be64 get_umr_update_translation_mask(void)
{
	u64 result;

	result = MLX5_MKEY_MASK_LEN |
		 MLX5_MKEY_MASK_PAGE_SIZE |
		 MLX5_MKEY_MASK_START_ADDR;

	return cpu_to_be64(result);
}

static __be64 get_umr_update_access_mask(struct mlx5_ib_dev *dev)
{
	u64 result;

	result = MLX5_MKEY_MASK_LR |
		 MLX5_MKEY_MASK_LW |
		 MLX5_MKEY_MASK_RR |
		 MLX5_MKEY_MASK_RW;

	if (MLX5_CAP_GEN(dev->mdev, atomic))
		result |= MLX5_MKEY_MASK_A;

	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE;

	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ;

	return cpu_to_be64(result);
}

static __be64 get_umr_update_pd_mask(void)
{
	u64 result;

	result = MLX5_MKEY_MASK_PD;

	return cpu_to_be64(result);
}

static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask)
{
	if (mask & MLX5_MKEY_MASK_PAGE_SIZE &&
	    MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
		return -EPERM;

	if (mask & MLX5_MKEY_MASK_A &&
	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
		return -EPERM;

	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
		return -EPERM;

	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
		return -EPERM;

	return 0;
}
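
/*
 * Illustrative sketch only, not part of this file: the mask helpers above are
 * meant to be OR-ed into a UMR WQE's mkey_mask and then validated against the
 * device capabilities before the WQE is posted, e.g. for a PD + access-flags
 * update. The function name is hypothetical.
 */
static inline int example_build_and_check_mask(struct mlx5_ib_dev *dev,
					       struct mlx5r_umr_wqe *wqe)
{
	wqe->ctrl_seg.mkey_mask = get_umr_update_access_mask(dev);
	wqe->ctrl_seg.mkey_mask |= get_umr_update_pd_mask();

	/* Reject combinations this device cannot modify via UMR */
	return umr_check_mkey_mask(dev, be64_to_cpu(wqe->ctrl_seg.mkey_mask));
}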

enum {
	MAX_UMR_WR = 128,
};

static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp)
{
	struct ib_qp_attr attr = {};
	int ret;

	attr.qp_state = IB_QPS_INIT;
	attr.port_num = 1;
	ret = ib_modify_qp(qp, &attr,
			   IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT);
	if (ret) {
		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
		return ret;
	}

	memset(&attr, 0, sizeof(attr));
	attr.qp_state = IB_QPS_RTR;

	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
	if (ret) {
		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
		return ret;
	}

	memset(&attr, 0, sizeof(attr));
	attr.qp_state = IB_QPS_RTS;
	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
	if (ret) {
		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
		return ret;
	}

	return 0;
}

int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
{
	struct ib_qp_init_attr init_attr = {};
	struct ib_pd *pd;
	struct ib_cq *cq;
	struct ib_qp *qp;
	int ret;

	pd = ib_alloc_pd(&dev->ib_dev, 0);
	if (IS_ERR(pd)) {
		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
		return PTR_ERR(pd);
	}

	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
	if (IS_ERR(cq)) {
		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
		ret = PTR_ERR(cq);
		goto destroy_pd;
	}

	init_attr.send_cq = cq;
	init_attr.recv_cq = cq;
	init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
	init_attr.cap.max_send_wr = MAX_UMR_WR;
	init_attr.cap.max_send_sge = 1;
	init_attr.qp_type = MLX5_IB_QPT_REG_UMR;
	init_attr.port_num = 1;
	qp = ib_create_qp(pd, &init_attr);
	if (IS_ERR(qp)) {
		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
		ret = PTR_ERR(qp);
		goto destroy_cq;
	}

	ret = mlx5r_umr_qp_rst2rts(dev, qp);
	if (ret)
		goto destroy_qp;

	dev->umrc.qp = qp;
	dev->umrc.cq = cq;
	dev->umrc.pd = pd;

	sema_init(&dev->umrc.sem, MAX_UMR_WR);
	mutex_init(&dev->umrc.lock);
	dev->umrc.state = MLX5_UMR_STATE_ACTIVE;

	return 0;

destroy_qp:
	ib_destroy_qp(qp);
destroy_cq:
	ib_free_cq(cq);
destroy_pd:
	ib_dealloc_pd(pd);
	return ret;
}

void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
{
	if (dev->umrc.state == MLX5_UMR_STATE_UNINIT)
		return;
	ib_destroy_qp(dev->umrc.qp);
	ib_free_cq(dev->umrc.cq);
	ib_dealloc_pd(dev->umrc.pd);
}

static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
{
	struct umr_common *umrc = &dev->umrc;
	struct ib_qp_attr attr;
	int err;

	attr.qp_state = IB_QPS_RESET;
	err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
	if (err) {
		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
		goto err;
	}

	err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
	if (err)
		goto err;

	umrc->state = MLX5_UMR_STATE_ACTIVE;
	return 0;

err:
	umrc->state = MLX5_UMR_STATE_ERR;
	return err;
}

static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
			       struct mlx5r_umr_wqe *wqe, bool with_data)
{
	unsigned int wqe_size =
		with_data ? sizeof(struct mlx5r_umr_wqe) :
			    sizeof(struct mlx5r_umr_wqe) -
				    sizeof(struct mlx5_wqe_data_seg);
	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
	struct mlx5_core_dev *mdev = dev->mdev;
	struct mlx5_ib_qp *qp = to_mqp(ibqp);
	struct mlx5_wqe_ctrl_seg *ctrl;
	union {
		struct ib_cqe *ib_cqe;
		u64 wr_id;
	} id;
	void *cur_edge, *seg;
	unsigned long flags;
	unsigned int idx;
	int size, err;

	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
		return -EIO;

	spin_lock_irqsave(&qp->sq.lock, flags);

	err = mlx5r_begin_wqe(qp, &seg, &ctrl, &idx, &size, &cur_edge, 0,
			      cpu_to_be32(mkey), false, false);
	if (WARN_ON(err))
		goto out;

	qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;

	mlx5r_memcpy_send_wqe(&qp->sq, &cur_edge, &seg, &size, wqe, wqe_size);

	id.ib_cqe = cqe;
	mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
			 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);

	mlx5r_ring_db(qp, 1, ctrl);

out:
	spin_unlock_irqrestore(&qp->sq.lock, flags);

	return err;
}

static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct mlx5_ib_umr_context *context =
		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);

	context->status = wc->status;
	complete(&context->done);
}

static inline void mlx5r_umr_init_context(struct mlx5r_umr_context *context)
{
	context->cqe.done = mlx5r_umr_done;
	init_completion(&context->done);
}

static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
				    struct mlx5r_umr_wqe *wqe, bool with_data)
{
	struct umr_common *umrc = &dev->umrc;
	struct mlx5r_umr_context umr_context;
	int err;

	err = umr_check_mkey_mask(dev, be64_to_cpu(wqe->ctrl_seg.mkey_mask));
	if (WARN_ON(err))
		return err;

	mlx5r_umr_init_context(&umr_context);

	down(&umrc->sem);
	while (true) {
		mutex_lock(&umrc->lock);
		if (umrc->state == MLX5_UMR_STATE_ERR) {
			mutex_unlock(&umrc->lock);
			err = -EFAULT;
			break;
		}

		if (umrc->state == MLX5_UMR_STATE_RECOVER) {
			mutex_unlock(&umrc->lock);
			usleep_range(3000, 5000);
			continue;
		}

		err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
					  with_data);
		mutex_unlock(&umrc->lock);
		if (err) {
			mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
				     err);
			break;
		}

		wait_for_completion(&umr_context.done);

		if (umr_context.status == IB_WC_SUCCESS)
			break;

		if (umr_context.status == IB_WC_WR_FLUSH_ERR)
			continue;

		WARN_ON_ONCE(1);
		mlx5_ib_warn(dev,
			     "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n",
			     umr_context.status);
		mutex_lock(&umrc->lock);
		err = mlx5r_umr_recover(dev);
		mutex_unlock(&umrc->lock);
		if (err)
			mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
				     err);
		err = -EFAULT;
		break;
	}
	up(&umrc->sem);
	return err;
}

/**
 * mlx5r_umr_revoke_mr - Fence all DMA on the MR
 * @mr: The MR to fence
 *
 * Upon return the NIC will not be doing any DMA to the pages under the MR,
 * and any DMA in progress will be completed. Failure of this function
 * indicates the HW has failed catastrophically.
 */
int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct mlx5r_umr_wqe wqe = {};

	if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
		return 0;

	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
	wqe.ctrl_seg.mkey_mask |= get_umr_disable_mr_mask();
	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;

	MLX5_SET(mkc, &wqe.mkey_seg, free, 1);
	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(dev->umrc.pd)->pdn);
	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
		 mlx5_mkey_variant(mr->mmkey.key));

	return mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
}

static void mlx5r_umr_set_access_flags(struct mlx5_ib_dev *dev,
				       struct mlx5_mkey_seg *seg,
				       unsigned int access_flags)
{
	bool ro_read = (access_flags & IB_ACCESS_RELAXED_ORDERING) &&
		       (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
			pcie_relaxed_ordering_enabled(dev->mdev->pdev));

	MLX5_SET(mkc, seg, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, seg, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, seg, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, seg, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, seg, lr, 1);
	MLX5_SET(mkc, seg, relaxed_ordering_write,
		 !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
	MLX5_SET(mkc, seg, relaxed_ordering_read, ro_read);
}

int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			      int access_flags)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct mlx5r_umr_wqe wqe = {};
	int err;

	wqe.ctrl_seg.mkey_mask = get_umr_update_access_mask(dev);
	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
	wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE;
	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;

	mlx5r_umr_set_access_flags(dev, &wqe.mkey_seg, access_flags);
	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
		 mlx5_mkey_variant(mr->mmkey.key));

	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
	if (err)
		return err;

	mr->access_flags = access_flags;
	return 0;
}
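
/*
 * Hypothetical caller sketch, not part of this file: per the kernel-doc for
 * mlx5r_umr_revoke_mr() above, anything about to release the pages backing an
 * MR must fence DMA first by revoking the mkey. The function name is made up
 * for illustration.
 */
static inline int example_fence_before_release(struct mlx5_ib_mr *mr)
{
	int err;

	err = mlx5r_umr_revoke_mr(mr);
	if (err)
		return err;	/* HW failure; do not free the pages blindly */

	/* ...it is now safe to unpin and free the MR's pages... */
	return 0;
}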

#define MLX5_MAX_UMR_CHUNK \
	((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_FLEX_ALIGNMENT)
#define MLX5_SPARE_UMR_CHUNK 0x10000

/*
 * Allocate a temporary buffer to hold the per-page information to transfer to
 * HW. For efficiency this should be as large as it can be, but buffer
 * allocation failure is not allowed, so try smaller sizes.
 */
static void *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
{
	const size_t xlt_chunk_align = MLX5_UMR_FLEX_ALIGNMENT / ent_size;
	size_t size;
	void *res = NULL;

	static_assert(PAGE_SIZE % MLX5_UMR_FLEX_ALIGNMENT == 0);

	/*
	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context, just that
	 * the allocation can't trigger any kind of reclaim.
	 */
	might_sleep();

	gfp_mask |= __GFP_ZERO | __GFP_NORETRY;

	/*
	 * If the system already has a suitable high order page then just use
	 * that, but don't try hard to create one. This max is about 1M, so a
	 * free x86 huge page will satisfy it.
	 */
	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
		     MLX5_MAX_UMR_CHUNK);
	*nents = size / ent_size;
	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
				       get_order(size));
	if (res)
		return res;

	if (size > MLX5_SPARE_UMR_CHUNK) {
		size = MLX5_SPARE_UMR_CHUNK;
		*nents = size / ent_size;
		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
					       get_order(size));
		if (res)
			return res;
	}

	*nents = PAGE_SIZE / ent_size;
	res = (void *)__get_free_page(gfp_mask);
	if (res)
		return res;

	mutex_lock(&xlt_emergency_page_mutex);
	memset(xlt_emergency_page, 0, PAGE_SIZE);
	return xlt_emergency_page;
}

static void mlx5r_umr_free_xlt(void *xlt, size_t length)
{
	if (xlt == xlt_emergency_page) {
		mutex_unlock(&xlt_emergency_page_mutex);
		return;
	}

	free_pages((unsigned long)xlt, get_order(length));
}
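
/*
 * Illustrative sketch only, not part of this file: the fallback ladder that
 * mlx5r_umr_alloc_xlt() walks for a given entry size. The sizes follow from
 * the constants above (MLX5_SPARE_UMR_CHUNK is 0x10000 bytes, i.e. 64KB; the
 * first step is capped at roughly 1M per the comment in the allocator).
 */
static inline size_t example_xlt_ladder_step(size_t nents, size_t ent_size,
					     int attempt)
{
	switch (attempt) {
	case 0: /* one high-order allocation, capped at MLX5_MAX_UMR_CHUNK */
		return min_t(size_t,
			     ent_size * ALIGN(nents, MLX5_UMR_FLEX_ALIGNMENT / ent_size),
			     MLX5_MAX_UMR_CHUNK);
	case 1: /* 64KB spare chunk */
		return MLX5_SPARE_UMR_CHUNK;
	default: /* single page, then the pre-allocated xlt_emergency_page */
		return PAGE_SIZE;
	}
}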

static void mlx5r_umr_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
				     struct ib_sge *sg)
{
	struct device *ddev = &dev->mdev->pdev->dev;

	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
	mlx5r_umr_free_xlt(xlt, sg->length);
}

/*
 * Create an XLT buffer ready for submission.
 */
static void *mlx5r_umr_create_xlt(struct mlx5_ib_dev *dev, struct ib_sge *sg,
				  size_t nents, size_t ent_size,
				  unsigned int flags)
{
	struct device *ddev = &dev->mdev->pdev->dev;
	dma_addr_t dma;
	void *xlt;

	xlt = mlx5r_umr_alloc_xlt(&nents, ent_size,
				  flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
								   GFP_KERNEL);
	sg->length = nents * ent_size;
	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, dma)) {
		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
		mlx5r_umr_free_xlt(xlt, sg->length);
		return NULL;
	}
	sg->addr = dma;
	sg->lkey = dev->umrc.pd->local_dma_lkey;

	return xlt;
}

static void
mlx5r_umr_set_update_xlt_ctrl_seg(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
				  unsigned int flags, struct ib_sge *sg)
{
	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
		/* fail if free */
		ctrl_seg->flags = MLX5_UMR_CHECK_FREE;
	else
		/* fail if not free */
		ctrl_seg->flags = MLX5_UMR_CHECK_NOT_FREE;
	ctrl_seg->xlt_octowords =
		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
}

static void mlx5r_umr_set_update_xlt_mkey_seg(struct mlx5_ib_dev *dev,
					      struct mlx5_mkey_seg *mkey_seg,
					      struct mlx5_ib_mr *mr,
					      unsigned int page_shift)
{
	mlx5r_umr_set_access_flags(dev, mkey_seg, mr->access_flags);
	MLX5_SET(mkc, mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
	MLX5_SET64(mkc, mkey_seg, start_addr, mr->ibmr.iova);
	MLX5_SET64(mkc, mkey_seg, len, mr->ibmr.length);
	MLX5_SET(mkc, mkey_seg, log_page_size, page_shift);
	MLX5_SET(mkc, mkey_seg, qpn, 0xffffff);
	MLX5_SET(mkc, mkey_seg, mkey_7_0, mlx5_mkey_variant(mr->mmkey.key));
}

static void
mlx5r_umr_set_update_xlt_data_seg(struct mlx5_wqe_data_seg *data_seg,
				  struct ib_sge *sg)
{
	data_seg->byte_count = cpu_to_be32(sg->length);
	data_seg->lkey = cpu_to_be32(sg->lkey);
	data_seg->addr = cpu_to_be64(sg->addr);
}

static void mlx5r_umr_update_offset(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
				    u64 offset)
{
	u64 octo_offset = mlx5r_umr_get_xlt_octo(offset);

	ctrl_seg->xlt_offset = cpu_to_be16(octo_offset & 0xffff);
	ctrl_seg->xlt_offset_47_16 = cpu_to_be32(octo_offset >> 16);
	ctrl_seg->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
}

static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
				       struct mlx5r_umr_wqe *wqe,
				       struct mlx5_ib_mr *mr, struct ib_sge *sg,
				       unsigned int flags)
{
	bool update_pd_access, update_translation;

	if (flags & MLX5_IB_UPD_XLT_ENABLE)
		wqe->ctrl_seg.mkey_mask |= get_umr_enable_mr_mask();

	update_pd_access = flags & MLX5_IB_UPD_XLT_ENABLE ||
			   flags & MLX5_IB_UPD_XLT_PD ||
			   flags & MLX5_IB_UPD_XLT_ACCESS;

	if (update_pd_access) {
		wqe->ctrl_seg.mkey_mask |= get_umr_update_access_mask(dev);
		wqe->ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
	}

	update_translation =
		flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;

	if (update_translation) {
		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask();
		if (!mr->ibmr.length)
			MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
	}

	wqe->ctrl_seg.xlt_octowords =
		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
	wqe->data_seg.byte_count = cpu_to_be32(sg->length);
}

/*
 * Send the DMA list to the HW for a normal MR using UMR.
 * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
 * flag may be used.
 */
int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct device *ddev = &dev->mdev->pdev->dev;
	struct mlx5r_umr_wqe wqe = {};
	struct ib_block_iter biter;
	struct mlx5_mtt *cur_mtt;
	size_t orig_sg_length;
	struct mlx5_mtt *mtt;
	size_t final_size;
	struct ib_sge sg;
	u64 offset = 0;
	int err = 0;

	if (WARN_ON(mr->umem->is_odp))
		return -EINVAL;

	mtt = mlx5r_umr_create_xlt(
		dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
		sizeof(*mtt), flags);
	if (!mtt)
		return -ENOMEM;

	orig_sg_length = sg.length;

	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
					  mr->page_shift);
	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);

	cur_mtt = mtt;
	rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
		if (cur_mtt == (void *)mtt + sg.length) {
			dma_sync_single_for_device(ddev, sg.addr, sg.length,
						   DMA_TO_DEVICE);

			err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe,
						       true);
			if (err)
				goto err;
			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
						DMA_TO_DEVICE);
			offset += sg.length;
			mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);

			cur_mtt = mtt;
		}

		cur_mtt->ptag =
			cpu_to_be64(rdma_block_iter_dma_address(&biter) |
				    MLX5_IB_MTT_PRESENT);

		if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
			cur_mtt->ptag = 0;

		cur_mtt++;
	}

	final_size = (void *)cur_mtt - (void *)mtt;
	sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT);
	memset(cur_mtt, 0, sg.length - final_size);
	mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);

	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);

err:
	sg.length = orig_sg_length;
	mlx5r_umr_unmap_free_xlt(dev, mtt, &sg);
	return err;
}

static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
{
	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
}
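
/*
 * Hypothetical caller sketch, not part of this file: a registration path that
 * has already built mr->umem pushes the DMA list and enables the mkey in one
 * call. The exact flag combination used by the rest of the driver is an
 * assumption here; only MLX5_IB_UPD_XLT_ENABLE is shown.
 */
static inline int example_program_new_mr(struct mlx5_ib_mr *mr)
{
	return mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
}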

int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
			 int page_shift, int flags)
{
	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
			       ? sizeof(struct mlx5_klm)
			       : sizeof(struct mlx5_mtt);
	const int page_align = MLX5_UMR_FLEX_ALIGNMENT / desc_size;
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct device *ddev = &dev->mdev->pdev->dev;
	const int page_mask = page_align - 1;
	struct mlx5r_umr_wqe wqe = {};
	size_t pages_mapped = 0;
	size_t pages_to_map = 0;
	size_t size_to_map = 0;
	size_t orig_sg_length;
	size_t pages_iter;
	struct ib_sge sg;
	int err = 0;
	void *xlt;

	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
	    !umr_can_use_indirect_mkey(dev))
		return -EPERM;

	if (WARN_ON(!mr->umem->is_odp))
		return -EINVAL;

	/* UMR copies MTTs in units of MLX5_UMR_FLEX_ALIGNMENT bytes,
	 * so we need to align the offset and length accordingly
	 */
	if (idx & page_mask) {
		npages += idx & page_mask;
		idx &= ~page_mask;
	}
	pages_to_map = ALIGN(npages, page_align);

	xlt = mlx5r_umr_create_xlt(dev, &sg, npages, desc_size, flags);
	if (!xlt)
		return -ENOMEM;

	pages_iter = sg.length / desc_size;
	orig_sg_length = sg.length;

	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;

		pages_to_map = min_t(size_t, pages_to_map, max_pages);
	}

	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, page_shift);
	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);

	for (pages_mapped = 0;
	     pages_mapped < pages_to_map && !err;
	     pages_mapped += pages_iter, idx += pages_iter) {
		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
		size_to_map = npages * desc_size;
		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
					DMA_TO_DEVICE);
		mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
		dma_sync_single_for_device(ddev, sg.addr, sg.length,
					   DMA_TO_DEVICE);
		sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);

		if (pages_mapped + pages_iter >= pages_to_map)
			mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
		mlx5r_umr_update_offset(&wqe.ctrl_seg, idx * desc_size);

		err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
	}
	sg.length = orig_sg_length;
	mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
	return err;
}
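
/*
 * Worked example, not part of this file: the alignment step at the top of
 * mlx5r_umr_update_xlt() widens an unaligned request so that both its start
 * index and its length land on MLX5_UMR_FLEX_ALIGNMENT-sized descriptor
 * groups (page_align = MLX5_UMR_FLEX_ALIGNMENT / desc_size descriptors).
 * The helper name is hypothetical.
 */
static inline void example_align_xlt_range(u64 *idx, int *npages, int desc_size)
{
	const int page_align = MLX5_UMR_FLEX_ALIGNMENT / desc_size;
	const int page_mask = page_align - 1;

	if (*idx & page_mask) {
		*npages += *idx & page_mask;	/* cover the descriptors we backed over */
		*idx &= ~page_mask;		/* start on an aligned descriptor */
	}
	*npages = ALIGN(*npages, page_align);	/* end on an aligned descriptor too */
}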