From 34996e42f82bfd60bc2c191e5cae3c6ab233ec6c Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 7 Aug 2024 15:11:27 +0200 Subject: Merging upstream version 6.9.7. Signed-off-by: Daniel Baumann --- drivers/vfio/pci/Kconfig | 2 + drivers/vfio/pci/Makefile | 2 + drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 48 +- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 6 +- drivers/vfio/pci/mlx5/cmd.c | 157 ++++- drivers/vfio/pci/mlx5/cmd.h | 11 +- drivers/vfio/pci/mlx5/main.c | 148 ++--- drivers/vfio/pci/nvgrace-gpu/Kconfig | 10 + drivers/vfio/pci/nvgrace-gpu/Makefile | 3 + drivers/vfio/pci/nvgrace-gpu/main.c | 888 +++++++++++++++++++++++++ drivers/vfio/pci/pds/dirty.c | 6 +- drivers/vfio/pci/pds/pci_drv.c | 27 +- drivers/vfio/pci/pds/vfio_dev.c | 45 +- drivers/vfio/pci/pds/vfio_dev.h | 8 +- drivers/vfio/pci/vfio_pci_config.c | 42 ++ drivers/vfio/pci/vfio_pci_core.c | 98 ++- drivers/vfio/pci/vfio_pci_intrs.c | 4 +- drivers/vfio/pci/vfio_pci_rdwr.c | 16 +- drivers/vfio/pci/virtio/main.c | 72 +- 19 files changed, 1262 insertions(+), 331 deletions(-) create mode 100644 drivers/vfio/pci/nvgrace-gpu/Kconfig create mode 100644 drivers/vfio/pci/nvgrace-gpu/Makefile create mode 100644 drivers/vfio/pci/nvgrace-gpu/main.c (limited to 'drivers/vfio/pci') diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 18c397df56..15821a2d77 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -67,4 +67,6 @@ source "drivers/vfio/pci/pds/Kconfig" source "drivers/vfio/pci/virtio/Kconfig" +source "drivers/vfio/pci/nvgrace-gpu/Kconfig" + endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 046139a4ec..ce7a61f1d9 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -15,3 +15,5 @@ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/ obj-$(CONFIG_PDS_VFIO_PCI) += pds/ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ + +obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/ diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 4d27465c8f..9a3e97108a 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -630,25 +630,11 @@ static void hisi_acc_vf_disable_fds(struct hisi_acc_vf_core_device *hisi_acc_vde } } -/* - * This function is called in all state_mutex unlock cases to - * handle a 'deferred_reset' if exists. - */ -static void -hisi_acc_vf_state_mutex_unlock(struct hisi_acc_vf_core_device *hisi_acc_vdev) +static void hisi_acc_vf_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev) { -again: - spin_lock(&hisi_acc_vdev->reset_lock); - if (hisi_acc_vdev->deferred_reset) { - hisi_acc_vdev->deferred_reset = false; - spin_unlock(&hisi_acc_vdev->reset_lock); - hisi_acc_vdev->vf_qm_state = QM_NOT_READY; - hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; - hisi_acc_vf_disable_fds(hisi_acc_vdev); - goto again; - } - mutex_unlock(&hisi_acc_vdev->state_mutex); - spin_unlock(&hisi_acc_vdev->reset_lock); + hisi_acc_vdev->vf_qm_state = QM_NOT_READY; + hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + hisi_acc_vf_disable_fds(hisi_acc_vdev); } static void hisi_acc_vf_start_device(struct hisi_acc_vf_core_device *hisi_acc_vdev) @@ -804,8 +790,10 @@ static long hisi_acc_vf_precopy_ioctl(struct file *filp, info.dirty_bytes = 0; info.initial_bytes = migf->total_length - *pos; + mutex_unlock(&migf->lock); + mutex_unlock(&hisi_acc_vdev->state_mutex); - ret = copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; + return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; out: mutex_unlock(&migf->lock); mutex_unlock(&hisi_acc_vdev->state_mutex); @@ -1071,7 +1059,7 @@ hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev, break; } } - hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev); + mutex_unlock(&hisi_acc_vdev->state_mutex); return res; } @@ -1092,7 +1080,7 @@ hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, mutex_lock(&hisi_acc_vdev->state_mutex); *curr_state = hisi_acc_vdev->mig_state; - hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev); + mutex_unlock(&hisi_acc_vdev->state_mutex); return 0; } @@ -1104,21 +1092,9 @@ static void hisi_acc_vf_pci_aer_reset_done(struct pci_dev *pdev) VFIO_MIGRATION_STOP_COPY) return; - /* - * As the higher VFIO layers are holding locks across reset and using - * those same locks with the mm_lock we need to prevent ABBA deadlock - * with the state_mutex and mm_lock. - * In case the state_mutex was taken already we defer the cleanup work - * to the unlock flow of the other running context. - */ - spin_lock(&hisi_acc_vdev->reset_lock); - hisi_acc_vdev->deferred_reset = true; - if (!mutex_trylock(&hisi_acc_vdev->state_mutex)) { - spin_unlock(&hisi_acc_vdev->reset_lock); - return; - } - spin_unlock(&hisi_acc_vdev->reset_lock); - hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev); + mutex_lock(&hisi_acc_vdev->state_mutex); + hisi_acc_vf_reset(hisi_acc_vdev); + mutex_unlock(&hisi_acc_vdev->state_mutex); } static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index dcabfeec6c..5bab46602f 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -98,8 +98,8 @@ struct hisi_acc_vf_migration_file { struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; - u8 match_done:1; - u8 deferred_reset:1; + u8 match_done; + /* For migration state */ struct mutex state_mutex; enum vfio_device_mig_state mig_state; @@ -109,8 +109,6 @@ struct hisi_acc_vf_core_device { struct hisi_qm vf_qm; u32 vf_qm_state; int vf_id; - /* For reset handler */ - spinlock_t reset_lock; struct hisi_acc_vf_migration_file *resuming_migf; struct hisi_acc_vf_migration_file *saving_migf; }; diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index efd1d252cd..41a4b0cf42 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -108,8 +108,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); if (ret) return ret; - if (mvdev->saving_migf->state == - MLX5_MIGF_STATE_PRE_COPY_ERROR) { + /* Upon cleanup, ignore previous pre_copy error state */ + if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR && + !(query_flags & MLX5VF_QUERY_CLEANUP)) { /* * In case we had a PRE_COPY error, only query full * image for final image @@ -121,6 +122,11 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, } query_flags &= ~MLX5VF_QUERY_INC; } + /* Block incremental query which is state-dependent */ + if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) { + complete(&mvdev->saving_migf->save_comp); + return -ENODEV; + } } MLX5_SET(query_vhca_migration_state_in, in, opcode, @@ -149,6 +155,12 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, return 0; } +static void 
set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev) +{ + mvdev->tracker.object_changed = true; + complete(&mvdev->tracker_comp); +} + static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev) { /* Mark the tracker under an error and wake it up if it's running */ @@ -189,7 +201,7 @@ void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev) /* Must be done outside the lock to let it progress */ set_tracker_error(mvdev); mutex_lock(&mvdev->state_mutex); - mlx5vf_disable_fds(mvdev); + mlx5vf_disable_fds(mvdev, NULL); _mlx5vf_free_page_tracker_resources(mvdev); mlx5vf_state_mutex_unlock(mvdev); } @@ -221,6 +233,10 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, if (!MLX5_CAP_GEN(mvdev->mdev, migration)) goto end; + if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && + MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))) + goto end; + mvdev->vf_id = pci_iov_vf_id(pdev); if (mvdev->vf_id < 0) goto end; @@ -250,17 +266,14 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, mvdev->migrate_cap = 1; mvdev->core_device.vdev.migration_flags = VFIO_MIGRATION_STOP_COPY | - VFIO_MIGRATION_P2P; + VFIO_MIGRATION_P2P | + VFIO_MIGRATION_PRE_COPY; + mvdev->core_device.vdev.mig_ops = mig_ops; init_completion(&mvdev->tracker_comp); if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) mvdev->core_device.vdev.log_ops = log_ops; - if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && - MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)) - mvdev->core_device.vdev.migration_flags |= - VFIO_MIGRATION_PRE_COPY; - if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks)) mvdev->chunk_mode = 1; @@ -402,6 +415,50 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) kfree(buf); } +static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages) +{ + unsigned int to_alloc = npages; + struct page **page_list; + unsigned long filled; + unsigned int to_fill; + int ret; + + to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); + page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); + if (!page_list) + return -ENOMEM; + + do { + filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, + page_list); + if (!filled) { + ret = -ENOMEM; + goto err; + } + to_alloc -= filled; + ret = sg_alloc_append_table_from_pages( + &buf->table, page_list, filled, 0, + filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, + GFP_KERNEL_ACCOUNT); + + if (ret) + goto err; + buf->allocated_length += filled * PAGE_SIZE; + /* clean input for another bulk allocation */ + memset(page_list, 0, filled * sizeof(*page_list)); + to_fill = min_t(unsigned int, to_alloc, + PAGE_SIZE / sizeof(*page_list)); + } while (to_alloc > 0); + + kvfree(page_list); + return 0; + +err: + kvfree(page_list); + return ret; +} + struct mlx5_vhca_data_buffer * mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, size_t length, @@ -608,8 +665,13 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) err: /* The error flow can't run from an interrupt context */ - if (status == -EREMOTEIO) + if (status == -EREMOTEIO) { status = MLX5_GET(save_vhca_state_out, async_data->out, status); + /* Failed in FW, print cmd out failure details */ + mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0, + async_data->out); + } + async_data->status = status; queue_work(migf->mvdev->cb_wq, &async_data->work); } @@ -623,6 +685,7 @@ int mlx5vf_cmd_save_vhca_state(struct 
mlx5vf_pci_core_device *mvdev, u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; struct mlx5_vhca_data_buffer *header_buf = NULL; struct mlx5vf_async_data *async_data; + bool pre_copy_cleanup = false; int err; lockdep_assert_held(&mvdev->state_mutex); @@ -633,6 +696,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) return err; + if ((migf->state == MLX5_MIGF_STATE_PRE_COPY || + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc) + pre_copy_cleanup = true; + if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) /* * In case we had a PRE_COPY error, SAVE is triggered only for @@ -651,29 +718,27 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, async_data = &migf->async_data; async_data->buf = buf; - async_data->stop_copy_chunk = !track; + async_data->stop_copy_chunk = (!track && !pre_copy_cleanup); async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; goto err_out; } - if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - if (async_data->stop_copy_chunk) { - u8 header_idx = buf->stop_copy_chunk_num ? - buf->stop_copy_chunk_num - 1 : 0; + if (async_data->stop_copy_chunk) { + u8 header_idx = buf->stop_copy_chunk_num ? + buf->stop_copy_chunk_num - 1 : 0; - header_buf = migf->buf_header[header_idx]; - migf->buf_header[header_idx] = NULL; - } + header_buf = migf->buf_header[header_idx]; + migf->buf_header[header_idx] = NULL; + } - if (!header_buf) { - header_buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); - if (IS_ERR(header_buf)) { - err = PTR_ERR(header_buf); - goto err_free; - } + if (!header_buf) { + header_buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(header_buf)) { + err = PTR_ERR(header_buf); + goto err_free; } } @@ -900,6 +965,29 @@ static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev, return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); } +static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev, + struct mlx5_vhca_page_tracker *tracker) +{ + u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {}; + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + void *obj_context; + void *cmd_hdr; + int err; + + cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context); + tracker->status = MLX5_GET(page_track, obj_context, state); + return 0; +} + static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, struct mlx5_vhca_cq_buf *buf, int nent, int cqe_size) @@ -957,9 +1045,11 @@ static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type, mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb); struct mlx5vf_pci_core_device *mvdev = container_of( tracker, struct mlx5vf_pci_core_device, tracker); + struct mlx5_eqe_obj_change *object; struct mlx5_eqe *eqe = data; u8 event_type = (u8)type; u8 queue_type; + u32 obj_id; int qp_num; switch (event_type) { @@ -975,6 +1065,12 @@ static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type, break; set_tracker_error(mvdev); break; + case MLX5_EVENT_TYPE_OBJECT_CHANGE: + object = &eqe->data.obj_change; + obj_id = 
be32_to_cpu(object->obj_id); + if (obj_id == tracker->id) + set_tracker_change_event(mvdev); + break; default: break; } @@ -1634,6 +1730,11 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, goto end; } + if (tracker->is_err) { + err = -EIO; + goto end; + } + mdev = mvdev->mdev; err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length, MLX5_PAGE_TRACK_STATE_REPORTING); @@ -1652,6 +1753,12 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, dirty, &tracker->status); if (poll_err == CQ_EMPTY) { wait_for_completion(&mvdev->tracker_comp); + if (tracker->object_changed) { + tracker->object_changed = false; + err = mlx5vf_cmd_query_tracker(mdev, tracker); + if (err) + goto end; + } continue; } } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index f2c7227fa6..df421dc6de 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -13,9 +13,6 @@ #include #include -#define MLX5VF_PRE_COPY_SUPP(mvdev) \ - ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY) - enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, MLX5_MIGF_STATE_PRE_COPY_ERROR, @@ -25,7 +22,6 @@ enum mlx5_vf_migf_state { }; enum mlx5_vf_load_state { - MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, MLX5_VF_LOAD_STATE_READ_HEADER, MLX5_VF_LOAD_STATE_PREP_HEADER_DATA, MLX5_VF_LOAD_STATE_READ_HEADER_DATA, @@ -162,6 +158,7 @@ struct mlx5_vhca_page_tracker { u32 id; u32 pdn; u8 is_err:1; + u8 object_changed:1; struct mlx5_uars_page *uar; struct mlx5_vhca_cq cq; struct mlx5_vhca_qp *host_qp; @@ -196,6 +193,7 @@ struct mlx5vf_pci_core_device { enum { MLX5VF_QUERY_INC = (1UL << 0), MLX5VF_QUERY_FINAL = (1UL << 1), + MLX5VF_QUERY_CLEANUP = (1UL << 2), }; int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); @@ -226,12 +224,11 @@ struct mlx5_vhca_data_buffer * mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, size_t length, enum dma_data_direction dma_dir); void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); -int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, - unsigned int npages); struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, unsigned long offset); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); -void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); +void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev, + enum mlx5_vf_migf_state *last_save_state); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf, u8 chunk_num, size_t next_required_umem_size); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index fe09a8c8af..61d9b0f914 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -65,50 +65,6 @@ mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, return NULL; } -int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, - unsigned int npages) -{ - unsigned int to_alloc = npages; - struct page **page_list; - unsigned long filled; - unsigned int to_fill; - int ret; - - to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); - page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); - if (!page_list) - return -ENOMEM; - - do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); - if (!filled) { - ret = -ENOMEM; - goto err; - } - to_alloc -= filled; - ret = sg_alloc_append_table_from_pages( - &buf->table, 
page_list, filled, 0, - filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, - GFP_KERNEL_ACCOUNT); - - if (ret) - goto err; - buf->allocated_length += filled * PAGE_SIZE; - /* clean input for another bulk allocation */ - memset(page_list, 0, filled * sizeof(*page_list)); - to_fill = min_t(unsigned int, to_alloc, - PAGE_SIZE / sizeof(*page_list)); - } while (to_alloc > 0); - - kvfree(page_list); - return 0; - -err: - kvfree(page_list); - return ret; -} - static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { mutex_lock(&migf->lock); @@ -777,36 +733,6 @@ mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf, return 0; } -static int -mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, - loff_t requested_length, - const char __user **buf, size_t *len, - loff_t *pos, ssize_t *done) -{ - int ret; - - if (requested_length > MAX_LOAD_SIZE) - return -ENOMEM; - - if (vhca_buf->allocated_length < requested_length) { - ret = mlx5vf_add_migration_pages( - vhca_buf, - DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, - PAGE_SIZE)); - if (ret) - return ret; - } - - while (*len) { - ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos, - done); - if (ret) - return ret; - } - - return 0; -} - static ssize_t mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *vhca_buf, @@ -1038,13 +964,6 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; break; } - case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER: - ret = mlx5vf_resume_read_image_no_header(vhca_buf, - requested_length, - &buf, &len, pos, &done); - if (ret) - goto out_unlock; - break; case MLX5_VF_LOAD_STATE_READ_IMAGE: ret = mlx5vf_resume_read_image(migf, vhca_buf, migf->record_size, @@ -1114,21 +1033,16 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) } migf->buf[0] = buf; - if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - buf = mlx5vf_alloc_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto out_buf; - } - - migf->buf_header[0] = buf; - migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; - } else { - /* Initial state will be to read the image */ - migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER; + buf = mlx5vf_alloc_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_buf; } + migf->buf_header[0] = buf; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); INIT_LIST_HEAD(&migf->buf_list); @@ -1146,7 +1060,8 @@ end: return ERR_PTR(ret); } -void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) +void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev, + enum mlx5_vf_migf_state *last_save_state) { if (mvdev->resuming_migf) { mlx5vf_disable_fd(mvdev->resuming_migf); @@ -1157,6 +1072,8 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) if (mvdev->saving_migf) { mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); cancel_work_sync(&mvdev->saving_migf->async_data.work); + if (last_save_state) + *last_save_state = mvdev->saving_migf->state; mlx5vf_disable_fd(mvdev->saving_migf); wake_up_interruptible(&mvdev->saving_migf->poll_wait); mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); @@ -1217,12 +1134,34 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return migf->filp; } - if ((cur == 
VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || - (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || + if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) { + mlx5vf_disable_fds(mvdev, NULL); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) { - mlx5vf_disable_fds(mvdev); - return NULL; + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; + struct mlx5_vhca_data_buffer *buf; + enum mlx5_vf_migf_state state; + size_t size; + + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL, + MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP); + if (ret) + return ERR_PTR(ret); + buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE); + if (IS_ERR(buf)) + return ERR_CAST(buf); + /* pre_copy cleanup */ + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false); + if (ret) { + mlx5vf_put_data_buffer(buf); + return ERR_PTR(ret); + } + mlx5vf_disable_fds(mvdev, &state); + return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO); } if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { @@ -1237,14 +1176,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { - if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { - ret = mlx5vf_cmd_load_vhca_state(mvdev, - mvdev->resuming_migf, - mvdev->resuming_migf->buf[0]); - if (ret) - return ERR_PTR(ret); - } - mlx5vf_disable_fds(mvdev); + mlx5vf_disable_fds(mvdev, NULL); return NULL; } @@ -1289,7 +1221,7 @@ again: mvdev->deferred_reset = false; spin_unlock(&mvdev->reset_lock); mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; - mlx5vf_disable_fds(mvdev); + mlx5vf_disable_fds(mvdev, NULL); goto again; } mutex_unlock(&mvdev->state_mutex); diff --git a/drivers/vfio/pci/nvgrace-gpu/Kconfig b/drivers/vfio/pci/nvgrace-gpu/Kconfig new file mode 100644 index 0000000000..a7f624b37e --- /dev/null +++ b/drivers/vfio/pci/nvgrace-gpu/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config NVGRACE_GPU_VFIO_PCI + tristate "VFIO support for the GPU in the NVIDIA Grace Hopper Superchip" + depends on ARM64 || (COMPILE_TEST && 64BIT) + select VFIO_PCI_CORE + help + VFIO support for the GPU in the NVIDIA Grace Hopper Superchip is + required to assign the GPU device to userspace using KVM/qemu/etc. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/pci/nvgrace-gpu/Makefile b/drivers/vfio/pci/nvgrace-gpu/Makefile new file mode 100644 index 0000000000..3ca8c18789 --- /dev/null +++ b/drivers/vfio/pci/nvgrace-gpu/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu-vfio-pci.o +nvgrace-gpu-vfio-pci-y := main.o diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c new file mode 100644 index 0000000000..a7fd018aa5 --- /dev/null +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -0,0 +1,888 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include + +/* + * The device memory usable to the workloads running in the VM is cached + * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region) + * to the VM and is represented as usemem. + * Moreover, the VM GPU device driver needs a non-cacheable region to + * support the MIG feature. 
This region is also exposed as a 64b BAR + * (comprising of BAR2 and BAR3 region) and represented as resmem. + */ +#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX +#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX + +/* Memory size expected as non cached and reserved by the VM driver */ +#define RESMEM_SIZE SZ_1G + +/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ +#define MEMBLK_SIZE SZ_512M + +/* + * The state of the two device memory region - resmem and usemem - is + * saved as struct mem_region. + */ +struct mem_region { + phys_addr_t memphys; /* Base physical address of the region */ + size_t memlength; /* Region size */ + size_t bar_size; /* Reported region BAR size */ + __le64 bar_val; /* Emulated BAR offset registers */ + union { + void *memaddr; + void __iomem *ioaddr; + }; /* Base virtual address of the region */ +}; + +struct nvgrace_gpu_pci_core_device { + struct vfio_pci_core_device core_device; + /* Cached and usable memory for the VM. */ + struct mem_region usemem; + /* Non cached memory carved out from the end of device memory */ + struct mem_region resmem; + /* Lock to control device memory kernel mapping */ + struct mutex remap_lock; +}; + +static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + + nvdev->resmem.bar_val = 0; + nvdev->usemem.bar_val = 0; +} + +/* Choose the structure corresponding to the fake BAR with a given index. */ +static struct mem_region * +nvgrace_gpu_memregion(int index, + struct nvgrace_gpu_pci_core_device *nvdev) +{ + if (index == USEMEM_REGION_INDEX) + return &nvdev->usemem; + + if (index == RESMEM_REGION_INDEX) + return &nvdev->resmem; + + return NULL; +} + +static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + if (nvdev->usemem.memlength) { + nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); + mutex_init(&nvdev->remap_lock); + } + + vfio_pci_core_finish_enable(vdev); + + return 0; +} + +static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + + /* Unmap the mapping to the device memory cached region */ + if (nvdev->usemem.memaddr) { + memunmap(nvdev->usemem.memaddr); + nvdev->usemem.memaddr = NULL; + } + + /* Unmap the mapping to the device memory non-cached region */ + if (nvdev->resmem.ioaddr) { + iounmap(nvdev->resmem.ioaddr); + nvdev->resmem.ioaddr = NULL; + } + + mutex_destroy(&nvdev->remap_lock); + + vfio_pci_core_close_device(core_vdev); +} + +static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, + struct vm_area_struct *vma) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + struct mem_region *memregion; + unsigned long start_pfn; + u64 req_len, pgoff, end; + unsigned int index; + int ret = 0; + + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + + memregion = nvgrace_gpu_memregion(index, nvdev); + if (!memregion) + return vfio_pci_core_mmap(core_vdev, vma); + + /* + * Request to mmap 
the BAR. Map to the CPU accessible memory on the + * GPU using the memory information gathered from the system ACPI + * tables. + */ + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || + check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || + check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) + return -EOVERFLOW; + + /* + * Check that the mapping request does not go beyond available device + * memory size + */ + if (end > memregion->memlength) + return -EINVAL; + + /* + * The carved out region of the device memory needs the NORMAL_NC + * property. Communicate as such to the hypervisor. + */ + if (index == RESMEM_REGION_INDEX) { + /* + * The nvgrace-gpu module has no issues with uncontained + * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is + * set to communicate to the KVM to S2 map as NORMAL_NC. + * This opens up guest usage of NORMAL_NC for this mapping. + */ + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED); + + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + } + + /* + * Perform a PFN map to the memory and back the device BAR by the + * GPU memory. + * + * The available GPU memory size may not be power-of-2 aligned. The + * remainder is only backed by vfio_device_ops read/write handlers. + * + * During device reset, the GPU is safely disconnected to the CPU + * and access to the BAR will be immediately returned preventing + * machine check. + */ + ret = remap_pfn_range(vma, vma->vm_start, start_pfn, + req_len, vma->vm_page_prot); + if (ret) + return ret; + + vma->vm_pgoff = start_pfn; + + return 0; +} + +static long +nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, + unsigned long arg) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + struct vfio_region_info_cap_sparse_mmap *sparse; + struct vfio_region_info info; + struct mem_region *memregion; + u32 size; + int ret; + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + /* + * Request to determine the BAR region information. Send the + * GPU memory information. + */ + memregion = nvgrace_gpu_memregion(info.index, nvdev); + if (!memregion) + return vfio_pci_core_ioctl(core_vdev, + VFIO_DEVICE_GET_REGION_INFO, arg); + + size = struct_size(sparse, areas, 1); + + /* + * Setup for sparse mapping for the device memory. Only the + * available device memory on the hardware is shown as a + * mappable region. + */ + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->nr_areas = 1; + sparse->areas[0].offset = 0; + sparse->areas[0].size = memregion->memlength; + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + + ret = vfio_info_add_capability(&caps, &sparse->header, size); + kfree(sparse); + if (ret) + return ret; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + /* + * The region memory size may not be power-of-2 aligned. + * Given that the memory as a BAR and may not be + * aligned, roundup to the next power-of-2. 
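/*
 * Illustrative numbers (assumed, not taken from this patch): if the usable
 * region (usemem.memlength) came to 119 GiB after carving out resmem,
 * roundup_pow_of_two() would report a 128 GiB bar_size here; the
 * 119..128 GiB tail has no backing device memory and is served only by the
 * read/write handlers later in this file (reads return ~0, writes are
 * dropped).
 */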
+ */ + info.size = memregion->bar_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + kfree(caps.buf); + } + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; +} + +static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case VFIO_DEVICE_GET_REGION_INFO: + return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg); + case VFIO_DEVICE_IOEVENTFD: + return -ENOTTY; + case VFIO_DEVICE_RESET: + nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); + fallthrough; + default: + return vfio_pci_core_ioctl(core_vdev, cmd, arg); + } +} + +static __le64 +nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64) +{ + u64 tmp_val; + + tmp_val = le64_to_cpu(val64); + tmp_val &= ~(bar_size - 1); + tmp_val |= flags; + + return cpu_to_le64(tmp_val); +} + +/* + * Both the usable (usemem) and the reserved (resmem) device memory region + * are exposed as a 64b fake device BARs in the VM. These fake BARs must + * respond to the accesses on their respective PCI config space offsets. + * + * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3. + * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5. + */ +static ssize_t +nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev, + char __user *buf, size_t count, loff_t *ppos) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; + struct mem_region *memregion = NULL; + __le64 val64; + size_t register_offset; + loff_t copy_offset; + size_t copy_count; + int ret; + + ret = vfio_pci_core_read(core_vdev, buf, count, ppos); + if (ret < 0) + return ret; + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, + sizeof(val64), + ©_offset, ©_count, + ®ister_offset)) + memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); + else if (vfio_pci_core_range_intersect_range(pos, count, + PCI_BASE_ADDRESS_4, + sizeof(val64), + ©_offset, ©_count, + ®ister_offset)) + memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); + + if (memregion) { + val64 = nvgrace_gpu_get_read_value(memregion->bar_size, + PCI_BASE_ADDRESS_MEM_TYPE_64 | + PCI_BASE_ADDRESS_MEM_PREFETCH, + memregion->bar_val); + if (copy_to_user(buf + copy_offset, + (void *)&val64 + register_offset, copy_count)) { + /* + * The position has been incremented in + * vfio_pci_core_read. Reset the offset back to the + * starting position. 
+ */ + *ppos -= count; + return -EFAULT; + } + } + + return count; +} + +static ssize_t +nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; + struct mem_region *memregion = NULL; + size_t register_offset; + loff_t copy_offset; + size_t copy_count; + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, + sizeof(u64), ©_offset, + ©_count, ®ister_offset)) + memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); + else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4, + sizeof(u64), ©_offset, + ©_count, ®ister_offset)) + memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); + + if (memregion) { + if (copy_from_user((void *)&memregion->bar_val + register_offset, + buf + copy_offset, copy_count)) + return -EFAULT; + *ppos += copy_count; + return copy_count; + } + + return vfio_pci_core_write(core_vdev, buf, count, ppos); +} + +/* + * Ad hoc map the device memory in the module kernel VA space. Primarily needed + * as vfio does not require the userspace driver to only perform accesses through + * mmaps of the vfio-pci BAR regions and such accesses should be supported using + * vfio_device_ops read/write implementations. + * + * The usemem region is cacheable memory and hence is memremaped. + * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC). + */ +static int +nvgrace_gpu_map_device_mem(int index, + struct nvgrace_gpu_pci_core_device *nvdev) +{ + struct mem_region *memregion; + int ret = 0; + + memregion = nvgrace_gpu_memregion(index, nvdev); + if (!memregion) + return -EINVAL; + + mutex_lock(&nvdev->remap_lock); + + if (memregion->memaddr) + goto unlock; + + if (index == USEMEM_REGION_INDEX) + memregion->memaddr = memremap(memregion->memphys, + memregion->memlength, + MEMREMAP_WB); + else + memregion->ioaddr = ioremap_wc(memregion->memphys, + memregion->memlength); + + if (!memregion->memaddr) + ret = -ENOMEM; + +unlock: + mutex_unlock(&nvdev->remap_lock); + + return ret; +} + +/* + * Read the data from the device memory (mapped either through ioremap + * or memremap) into the user buffer. + */ +static int +nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev, + char __user *buf, size_t mem_count, loff_t *ppos) +{ + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; + int ret; + + if (!mem_count) + return 0; + + /* + * Handle read on the BAR regions. Map to the target device memory + * physical address and copy to the request read buffer. + */ + ret = nvgrace_gpu_map_device_mem(index, nvdev); + if (ret) + return ret; + + if (index == USEMEM_REGION_INDEX) { + if (copy_to_user(buf, + (u8 *)nvdev->usemem.memaddr + offset, + mem_count)) + ret = -EFAULT; + } else { + /* + * The hardware ensures that the system does not crash when + * the device memory is accessed with the memory enable + * turned off. It synthesizes ~0 on such read. So there is + * no need to check or support the disablement/enablement of + * BAR through PCI_COMMAND config space register. Pass + * test_mem flag as false. + */ + ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, + nvdev->resmem.ioaddr, + buf, offset, mem_count, + 0, 0, false); + } + + return ret; +} + +/* + * Read count bytes from the device memory at an offset. 
The actual device + * memory size (available) may not be a power-of-2. So the driver fakes + * the size to a power-of-2 (reported) when exposing to a user space driver. + * + * Reads starting beyond the reported size generate -EINVAL; reads extending + * beyond the actual device size is filled with ~0; reads extending beyond + * the reported size are truncated. + */ +static ssize_t +nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, + char __user *buf, size_t count, loff_t *ppos) +{ + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + struct mem_region *memregion; + size_t mem_count, i; + u8 val = 0xFF; + int ret; + + /* No need to do NULL check as caller does. */ + memregion = nvgrace_gpu_memregion(index, nvdev); + + if (offset >= memregion->bar_size) + return -EINVAL; + + /* Clip short the read request beyond reported BAR size */ + count = min(count, memregion->bar_size - (size_t)offset); + + /* + * Determine how many bytes to be actually read from the device memory. + * Read request beyond the actual device memory size is filled with ~0, + * while those beyond the actual reported size is skipped. + */ + if (offset >= memregion->memlength) + mem_count = 0; + else + mem_count = min(count, memregion->memlength - (size_t)offset); + + ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + + /* + * Only the device memory present on the hardware is mapped, which may + * not be power-of-2 aligned. A read to an offset beyond the device memory + * size is filled with ~0. + */ + for (i = mem_count; i < count; i++) { + ret = put_user(val, (unsigned char __user *)(buf + i)); + if (ret) + return ret; + } + + *ppos += count; + return count; +} + +static ssize_t +nvgrace_gpu_read(struct vfio_device *core_vdev, + char __user *buf, size_t count, loff_t *ppos) +{ + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + + if (nvgrace_gpu_memregion(index, nvdev)) + return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + + if (index == VFIO_PCI_CONFIG_REGION_INDEX) + return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); + + return vfio_pci_core_read(core_vdev, buf, count, ppos); +} + +/* + * Write the data to the device memory (mapped either through ioremap + * or memremap) from the user buffer. + */ +static int +nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev, + const char __user *buf, size_t mem_count, + loff_t *ppos) +{ + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + int ret; + + if (!mem_count) + return 0; + + ret = nvgrace_gpu_map_device_mem(index, nvdev); + if (ret) + return ret; + + if (index == USEMEM_REGION_INDEX) { + if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos, + buf, mem_count)) + return -EFAULT; + } else { + /* + * The hardware ensures that the system does not crash when + * the device memory is accessed with the memory enable + * turned off. It drops such writes. So there is no need to + * check or support the disablement/enablement of BAR + * through PCI_COMMAND config space register. Pass test_mem + * flag as false. + */ + ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, + nvdev->resmem.ioaddr, + (char __user *)buf, pos, mem_count, + 0, 0, true); + } + + return ret; +} + +/* + * Write count bytes to the device memory at a given offset. 
The actual device + * memory size (available) may not be a power-of-2. So the driver fakes the + * size to a power-of-2 (reported) when exposing to a user space driver. + * + * Writes extending beyond the reported size are truncated; writes starting + * beyond the reported size generate -EINVAL. + */ +static ssize_t +nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, + size_t count, loff_t *ppos, const char __user *buf) +{ + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; + struct mem_region *memregion; + size_t mem_count; + int ret = 0; + + /* No need to do NULL check as caller does. */ + memregion = nvgrace_gpu_memregion(index, nvdev); + + if (offset >= memregion->bar_size) + return -EINVAL; + + /* Clip short the write request beyond reported BAR size */ + count = min(count, memregion->bar_size - (size_t)offset); + + /* + * Determine how many bytes to be actually written to the device memory. + * Do not write to the offset beyond available size. + */ + if (offset >= memregion->memlength) + goto exitfn; + + /* + * Only the device memory present on the hardware is mapped, which may + * not be power-of-2 aligned. Drop access outside the available device + * memory on the hardware. + */ + mem_count = min(count, memregion->memlength - (size_t)offset); + + ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + +exitfn: + *ppos += count; + return count; +} + +static ssize_t +nvgrace_gpu_write(struct vfio_device *core_vdev, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + + if (nvgrace_gpu_memregion(index, nvdev)) + return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + + if (index == VFIO_PCI_CONFIG_REGION_INDEX) + return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); + + return vfio_pci_core_write(core_vdev, buf, count, ppos); +} + +static const struct vfio_device_ops nvgrace_gpu_pci_ops = { + .name = "nvgrace-gpu-vfio-pci", + .init = vfio_pci_core_init_dev, + .release = vfio_pci_core_release_dev, + .open_device = nvgrace_gpu_open_device, + .close_device = nvgrace_gpu_close_device, + .ioctl = nvgrace_gpu_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = nvgrace_gpu_read, + .write = nvgrace_gpu_write, + .mmap = nvgrace_gpu_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { + .name = "nvgrace-gpu-vfio-pci-core", + .init = vfio_pci_core_init_dev, + .release = vfio_pci_core_release_dev, + .open_device = nvgrace_gpu_open_device, + .close_device = vfio_pci_core_close_device, + .ioctl = vfio_pci_core_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static int +nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, + u64 
*pmemphys, u64 *pmemlength) +{ + int ret; + + /* + * The memory information is present in the system ACPI tables as DSD + * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size. + */ + ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa", + pmemphys); + if (ret) + return ret; + + if (*pmemphys > type_max(phys_addr_t)) + return -EOVERFLOW; + + ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size", + pmemlength); + if (ret) + return ret; + + if (*pmemlength > type_max(size_t)) + return -EOVERFLOW; + + /* + * If the C2C link is not up due to an error, the coherent device + * memory size is returned as 0. Fail in such case. + */ + if (*pmemlength == 0) + return -ENOMEM; + + return ret; +} + +static int +nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, + struct nvgrace_gpu_pci_core_device *nvdev, + u64 memphys, u64 memlength) +{ + int ret = 0; + + /* + * The VM GPU device driver needs a non-cacheable region to support + * the MIG feature. Since the device memory is mapped as NORMAL cached, + * carve out a region from the end with a different NORMAL_NC + * property (called as reserved memory and represented as resmem). This + * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while + * exposing the rest (termed as usable memory and represented using usemem) + * as cacheable 64b BAR (region 4 and 5). + * + * devmem (memlength) + * |-------------------------------------------------| + * | | + * usemem.memphys resmem.memphys + */ + nvdev->usemem.memphys = memphys; + + /* + * The device memory exposed to the VM is added to the kernel by the + * VM driver module in chunks of memory block size. Only the usable + * memory (usemem) is added to the kernel for usage by the VM + * workloads. Make the usable memory size memblock aligned. + */ + if (check_sub_overflow(memlength, RESMEM_SIZE, + &nvdev->usemem.memlength)) { + ret = -EOVERFLOW; + goto done; + } + + /* + * The USEMEM part of the device memory has to be MEMBLK_SIZE + * aligned. This is a hardwired ABI value between the GPU FW and + * VFIO driver. The VM device driver is also aware of it and make + * use of the value for its calculation to determine USEMEM size. + */ + nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, + MEMBLK_SIZE); + if (nvdev->usemem.memlength == 0) { + ret = -EINVAL; + goto done; + } + + if ((check_add_overflow(nvdev->usemem.memphys, + nvdev->usemem.memlength, + &nvdev->resmem.memphys)) || + (check_sub_overflow(memlength, nvdev->usemem.memlength, + &nvdev->resmem.memlength))) { + ret = -EOVERFLOW; + goto done; + } + + /* + * The memory regions are exposed as BARs. Calculate and save + * the BAR size for them. + */ + nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); + nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); +done: + return ret; +} + +static int nvgrace_gpu_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops; + struct nvgrace_gpu_pci_core_device *nvdev; + u64 memphys, memlength; + int ret; + + ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); + if (!ret) + ops = &nvgrace_gpu_pci_ops; + + nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev, + &pdev->dev, ops); + if (IS_ERR(nvdev)) + return PTR_ERR(nvdev); + + dev_set_drvdata(&pdev->dev, &nvdev->core_device); + + if (ops == &nvgrace_gpu_pci_ops) { + /* + * Device memory properties are identified in the host ACPI + * table. 
Set the nvgrace_gpu_pci_core_device structure. + */ + ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev, + memphys, memlength); + if (ret) + goto out_put_vdev; + } + + ret = vfio_pci_core_register_device(&nvdev->core_device); + if (ret) + goto out_put_vdev; + + return ret; + +out_put_vdev: + vfio_put_device(&nvdev->core_device.vdev); + return ret; +} + +static void nvgrace_gpu_remove(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + + vfio_pci_core_unregister_device(core_device); + vfio_put_device(&core_device->vdev); +} + +static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { + /* GH200 120GB */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) }, + /* GH200 480GB */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, + {} +}; + +MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); + +static struct pci_driver nvgrace_gpu_vfio_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = nvgrace_gpu_vfio_pci_table, + .probe = nvgrace_gpu_probe, + .remove = nvgrace_gpu_remove, + .err_handler = &vfio_pci_core_err_handlers, + .driver_managed_dma = true, +}; + +module_pci_driver(nvgrace_gpu_vfio_pci_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ankit Agrawal "); +MODULE_AUTHOR("Aniket Agashe "); +MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory"); diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c index 8ddf4346fc..68e8f006df 100644 --- a/drivers/vfio/pci/pds/dirty.c +++ b/drivers/vfio/pci/pds/dirty.c @@ -607,7 +607,7 @@ int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova, mutex_lock(&pds_vfio->state_mutex); err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length); - pds_vfio_state_mutex_unlock(pds_vfio); + mutex_unlock(&pds_vfio->state_mutex); return err; } @@ -624,7 +624,7 @@ int pds_vfio_dma_logging_start(struct vfio_device *vdev, mutex_lock(&pds_vfio->state_mutex); pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS); err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size); - pds_vfio_state_mutex_unlock(pds_vfio); + mutex_unlock(&pds_vfio->state_mutex); return err; } @@ -637,7 +637,7 @@ int pds_vfio_dma_logging_stop(struct vfio_device *vdev) mutex_lock(&pds_vfio->state_mutex); pds_vfio_dirty_disable(pds_vfio, true); - pds_vfio_state_mutex_unlock(pds_vfio); + mutex_unlock(&pds_vfio->state_mutex); return 0; } diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c index a34dda5166..16e93b11ab 100644 --- a/drivers/vfio/pci/pds/pci_drv.c +++ b/drivers/vfio/pci/pds/pci_drv.c @@ -21,16 +21,13 @@ static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio) { - bool deferred_reset_needed = false; - /* * Documentation states that the kernel migration driver must not * generate asynchronous device state transitions outside of * manipulation by the user or the VFIO_DEVICE_RESET ioctl. * * Since recovery is an asynchronous event received from the device, - * initiate a deferred reset. Issue a deferred reset in the following - * situations: + * initiate a reset in the following situations: * 1. Migration is in progress, which will cause the next step of * the migration to fail. * 2. 
If the device is in a state that will be set to @@ -42,24 +39,8 @@ static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio) pds_vfio->state != VFIO_DEVICE_STATE_ERROR) || (pds_vfio->state == VFIO_DEVICE_STATE_RUNNING && pds_vfio_dirty_is_enabled(pds_vfio))) - deferred_reset_needed = true; + pds_vfio_reset(pds_vfio, VFIO_DEVICE_STATE_ERROR); mutex_unlock(&pds_vfio->state_mutex); - - /* - * On the next user initiated state transition, the device will - * transition to the VFIO_DEVICE_STATE_ERROR. At this point it's the user's - * responsibility to reset the device. - * - * If a VFIO_DEVICE_RESET is requested post recovery and before the next - * state transition, then the deferred reset state will be set to - * VFIO_DEVICE_STATE_RUNNING. - */ - if (deferred_reset_needed) { - mutex_lock(&pds_vfio->reset_mutex); - pds_vfio->deferred_reset = true; - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR; - mutex_unlock(&pds_vfio->reset_mutex); - } } static int pds_vfio_pci_notify_handler(struct notifier_block *nb, @@ -185,7 +166,9 @@ static void pds_vfio_pci_aer_reset_done(struct pci_dev *pdev) { struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev); - pds_vfio_reset(pds_vfio); + mutex_lock(&pds_vfio->state_mutex); + pds_vfio_reset(pds_vfio, VFIO_DEVICE_STATE_RUNNING); + mutex_unlock(&pds_vfio->state_mutex); } static const struct pci_error_handlers pds_vfio_pci_err_handlers = { diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index a286ebcc71..76a80ae708 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -26,37 +26,14 @@ struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev) vfio_coredev); } -void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio) +void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio, + enum vfio_device_mig_state state) { -again: - mutex_lock(&pds_vfio->reset_mutex); - if (pds_vfio->deferred_reset) { - pds_vfio->deferred_reset = false; - pds_vfio_put_restore_file(pds_vfio); - pds_vfio_put_save_file(pds_vfio); - if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) { - pds_vfio_dirty_disable(pds_vfio, false); - } - pds_vfio->state = pds_vfio->deferred_reset_state; - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; - mutex_unlock(&pds_vfio->reset_mutex); - goto again; - } - mutex_unlock(&pds_vfio->state_mutex); - mutex_unlock(&pds_vfio->reset_mutex); -} - -void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio) -{ - mutex_lock(&pds_vfio->reset_mutex); - pds_vfio->deferred_reset = true; - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; - if (!mutex_trylock(&pds_vfio->state_mutex)) { - mutex_unlock(&pds_vfio->reset_mutex); - return; - } - mutex_unlock(&pds_vfio->reset_mutex); - pds_vfio_state_mutex_unlock(pds_vfio); + pds_vfio_put_restore_file(pds_vfio); + pds_vfio_put_save_file(pds_vfio); + if (state == VFIO_DEVICE_STATE_ERROR) + pds_vfio_dirty_disable(pds_vfio, false); + pds_vfio->state = state; } static struct file * @@ -97,8 +74,7 @@ pds_vfio_set_device_state(struct vfio_device *vdev, break; } } - pds_vfio_state_mutex_unlock(pds_vfio); - /* still waiting on a deferred_reset */ + mutex_unlock(&pds_vfio->state_mutex); if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) res = ERR_PTR(-EIO); @@ -114,7 +90,7 @@ static int pds_vfio_get_device_state(struct vfio_device *vdev, mutex_lock(&pds_vfio->state_mutex); *current_state = pds_vfio->state; - pds_vfio_state_mutex_unlock(pds_vfio); + mutex_unlock(&pds_vfio->state_mutex); return 0; } @@ -156,7 
+132,6 @@ static int pds_vfio_init_device(struct vfio_device *vdev) pds_vfio->vf_id = vf_id; mutex_init(&pds_vfio->state_mutex); - mutex_init(&pds_vfio->reset_mutex); vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; vdev->mig_ops = &pds_vfio_lm_ops; @@ -178,7 +153,6 @@ static void pds_vfio_release_device(struct vfio_device *vdev) vfio_coredev.vdev); mutex_destroy(&pds_vfio->state_mutex); - mutex_destroy(&pds_vfio->reset_mutex); vfio_pci_core_release_dev(vdev); } @@ -194,7 +168,6 @@ static int pds_vfio_open_device(struct vfio_device *vdev) return err; pds_vfio->state = VFIO_DEVICE_STATE_RUNNING; - pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev); diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h index e7b01080a1..803d99d69c 100644 --- a/drivers/vfio/pci/pds/vfio_dev.h +++ b/drivers/vfio/pci/pds/vfio_dev.h @@ -18,20 +18,16 @@ struct pds_vfio_pci_device { struct pds_vfio_dirty dirty; struct mutex state_mutex; /* protect migration state */ enum vfio_device_mig_state state; - struct mutex reset_mutex; /* protect reset_done flow */ - u8 deferred_reset; - enum vfio_device_mig_state deferred_reset_state; struct notifier_block nb; int vf_id; u16 client_id; }; -void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio); - const struct vfio_device_ops *pds_vfio_ops_info(void); struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev); -void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio, + enum vfio_device_mig_state state); struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio); struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio); diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 7e2e62ab08..97422aafaa 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1966,3 +1966,45 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, return done; } + +/** + * vfio_pci_core_range_intersect_range() - Determine overlap between a buffer + * and register offset ranges. + * @buf_start: start offset of the buffer + * @buf_cnt: number of buffer bytes + * @reg_start: start register offset + * @reg_cnt: number of register bytes + * @buf_offset: start offset of overlap in the buffer + * @intersect_count: number of overlapping bytes + * @register_offset: start offset of overlap in register + * + * Returns: true if there is overlap, false if not. + * The overlap start and size is returned through function args. 
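/*
 * Worked example for the helper below (illustrative offsets, not taken from
 * the patch): a 4-byte config-space write at buf_start 0x22 against an
 * 8-byte emulated BAR register at reg_start 0x20 intersects, returning
 * buf_offset = 0, intersect_count = 4 and register_offset = 2, i.e. the
 * write covers bytes 2..5 of the emulated register value. This is how the
 * nvgrace-gpu config read/write emulation earlier in this patch locates the
 * portion of an access that lands on its fake BAR registers.
 */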
+ */ +bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, + loff_t reg_start, size_t reg_cnt, + loff_t *buf_offset, + size_t *intersect_count, + size_t *register_offset) +{ + if (buf_start <= reg_start && + buf_start + buf_cnt > reg_start) { + *buf_offset = reg_start - buf_start; + *intersect_count = min_t(size_t, reg_cnt, + buf_start + buf_cnt - reg_start); + *register_offset = 0; + return true; + } + + if (buf_start > reg_start && + buf_start < reg_start + reg_cnt) { + *buf_offset = 0; + *intersect_count = min_t(size_t, buf_cnt, + reg_start + reg_cnt - buf_start); + *register_offset = buf_start - reg_start; + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_range_intersect_range); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1cbc990d42..d8c95cc16b 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -778,25 +778,26 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) } struct vfio_pci_fill_info { - struct vfio_pci_dependent_device __user *devices; - struct vfio_pci_dependent_device __user *devices_end; struct vfio_device *vdev; + struct vfio_pci_dependent_device *devices; + int nr_devices; u32 count; u32 flags; }; static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) { - struct vfio_pci_dependent_device info = { - .segment = pci_domain_nr(pdev->bus), - .bus = pdev->bus->number, - .devfn = pdev->devfn, - }; + struct vfio_pci_dependent_device *info; struct vfio_pci_fill_info *fill = data; - fill->count++; - if (fill->devices >= fill->devices_end) - return 0; + /* The topology changed since we counted devices */ + if (fill->count >= fill->nr_devices) + return -EAGAIN; + + info = &fill->devices[fill->count++]; + info->segment = pci_domain_nr(pdev->bus); + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) { struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev); @@ -809,19 +810,19 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) */ vdev = vfio_find_device_in_devset(dev_set, &pdev->dev); if (!vdev) { - info.devid = VFIO_PCI_DEVID_NOT_OWNED; + info->devid = VFIO_PCI_DEVID_NOT_OWNED; } else { int id = vfio_iommufd_get_dev_id(vdev, iommufd); if (id > 0) - info.devid = id; + info->devid = id; else if (id == -ENOENT) - info.devid = VFIO_PCI_DEVID_OWNED; + info->devid = VFIO_PCI_DEVID_OWNED; else - info.devid = VFIO_PCI_DEVID_NOT_OWNED; + info->devid = VFIO_PCI_DEVID_NOT_OWNED; } /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. 
*/ - if (info.devid == VFIO_PCI_DEVID_NOT_OWNED) + if (info->devid == VFIO_PCI_DEVID_NOT_OWNED) fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; } else { struct iommu_group *iommu_group; @@ -830,13 +831,10 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) if (!iommu_group) return -EPERM; /* Cannot reset non-isolated devices */ - info.group_id = iommu_group_id(iommu_group); + info->group_id = iommu_group_id(iommu_group); iommu_group_put(iommu_group); } - if (copy_to_user(fill->devices, &info, sizeof(info))) - return -EFAULT; - fill->devices++; return 0; } @@ -1258,10 +1256,11 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( { unsigned long minsz = offsetofend(struct vfio_pci_hot_reset_info, count); + struct vfio_pci_dependent_device *devices = NULL; struct vfio_pci_hot_reset_info hdr; struct vfio_pci_fill_info fill = {}; bool slot = false; - int ret = 0; + int ret, count; if (copy_from_user(&hdr, arg, minsz)) return -EFAULT; @@ -1277,9 +1276,23 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( else if (pci_probe_reset_bus(vdev->pdev->bus)) return -ENODEV; - fill.devices = arg->devices; - fill.devices_end = arg->devices + - (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]); + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, + &count, slot); + if (ret) + return ret; + + if (count > (hdr.argsz - sizeof(hdr)) / sizeof(*devices)) { + hdr.count = count; + ret = -ENOSPC; + goto header; + } + + devices = kcalloc(count, sizeof(*devices), GFP_KERNEL); + if (!devices) + return -ENOMEM; + + fill.devices = devices; + fill.nr_devices = count; fill.vdev = &vdev->vdev; if (vfio_device_cdev_opened(&vdev->vdev)) @@ -1291,16 +1304,23 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( &fill, slot); mutex_unlock(&vdev->vdev.dev_set->lock); if (ret) - return ret; + goto out; + + if (copy_to_user(arg->devices, devices, + sizeof(*devices) * fill.count)) { + ret = -EFAULT; + goto out; + } hdr.count = fill.count; hdr.flags = fill.flags; - if (copy_to_user(arg, &hdr, minsz)) - return -EFAULT; - if (fill.count > fill.devices - arg->devices) - return -ENOSPC; - return 0; +header: + if (copy_to_user(arg, &hdr, minsz)) + ret = -EFAULT; +out: + kfree(devices); + return ret; } static int @@ -1862,8 +1882,25 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma /* * See remap_pfn_range(), called from vfio_pci_fault() but we can't * change vm_flags within the fault handler. Set them now. + * + * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64, + * allowing KVM stage 2 device mapping attributes to use Normal-NC + * rather than DEVICE_nGnRE, which allows guest mappings + * supporting write-combining attributes (WC). ARM does not + * architecturally guarantee this is safe, and indeed some MMIO + * regions like the GICv2 VCPU interface can trigger uncontained + * faults if Normal-NC is used. + * + * To safely use VFIO in KVM the platform must guarantee full + * safety in the guest where no action taken against a MMIO + * mapping can trigger an uncontained failure. The assumption is + * that most VFIO PCI platforms support this for both mapping types, + * at least in common flows, based on some expectations of how + * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in + * the VMA flags. 
*/ - vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP | + VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vfio_pci_mmap_ops; return 0; @@ -2047,6 +2084,7 @@ static int vfio_pci_bus_notifier(struct notifier_block *nb, pci_name(pdev)); pdev->driver_override = kasprintf(GFP_KERNEL, "%s", vdev->vdev.ops->name); + WARN_ON(!pdev->driver_override); } else if (action == BUS_NOTIFY_BOUND_DRIVER && pdev->is_virtfn && physfn == vdev->pdev) { struct pci_driver *drv = pci_dev_driver(pdev); diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index fb5392b749..e80c5d75b5 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -277,8 +277,10 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev, return -ENOMEM; ctx = vfio_irq_ctx_alloc(vdev, 0); - if (!ctx) + if (!ctx) { + kfree(name); return -ENOMEM; + } ctx->name = name; ctx->trigger = trigger; diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 07fea08ea8..03b8f7ada1 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -96,10 +96,10 @@ VFIO_IOREAD(32) * reads with -1. This is intended for handling MSI-X vector tables and * leftover space for ROM BARs. */ -static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, - void __iomem *io, char __user *buf, - loff_t off, size_t count, size_t x_start, - size_t x_end, bool iswrite) +ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, + void __iomem *io, char __user *buf, + loff_t off, size_t count, size_t x_start, + size_t x_end, bool iswrite) { ssize_t done = 0; int ret; @@ -201,6 +201,7 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, return done; } +EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw); int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar) { @@ -279,8 +280,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, x_end = vdev->msix_offset + vdev->msix_size; } - done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos, - count, x_start, x_end, iswrite); + done = vfio_pci_core_do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos, + count, x_start, x_end, iswrite); if (done >= 0) *ppos += done; @@ -348,7 +349,8 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, * probing, so we don't currently worry about access in relation * to the memory enable bit in the command register. */ - done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite); + done = vfio_pci_core_do_io_rw(vdev, false, iomem, buf, off, count, + 0, 0, iswrite); vga_put(vdev->pdev, rsrc); diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index d5af683837..b5d3a8c5bb 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -132,33 +132,6 @@ end: return ret ? 
ret : count; } -static bool range_intersect_range(loff_t range1_start, size_t count1, - loff_t range2_start, size_t count2, - loff_t *start_offset, - size_t *intersect_count, - size_t *register_offset) -{ - if (range1_start <= range2_start && - range1_start + count1 > range2_start) { - *start_offset = range2_start - range1_start; - *intersect_count = min_t(size_t, count2, - range1_start + count1 - range2_start); - *register_offset = 0; - return true; - } - - if (range1_start > range2_start && - range1_start < range2_start + count2) { - *start_offset = 0; - *intersect_count = min_t(size_t, count1, - range2_start + count2 - range1_start); - *register_offset = range1_start - range2_start; - return true; - } - - return false; -} - static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) @@ -178,16 +151,18 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, if (ret < 0) return ret; - if (range_intersect_range(pos, count, PCI_DEVICE_ID, sizeof(val16), - ©_offset, ©_count, ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET); if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count)) return -EFAULT; } if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) && - range_intersect_range(pos, count, PCI_COMMAND, sizeof(val16), - ©_offset, ©_count, ®ister_offset)) { + vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset, copy_count)) return -EFAULT; @@ -197,16 +172,18 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, return -EFAULT; } - if (range_intersect_range(pos, count, PCI_REVISION_ID, sizeof(val8), - ©_offset, ©_count, ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID, + sizeof(val8), ©_offset, + ©_count, ®ister_offset)) { /* Transional needs to have revision 0 */ val8 = 0; if (copy_to_user(buf + copy_offset, &val8, copy_count)) return -EFAULT; } - if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, sizeof(val32), - ©_offset, ©_count, ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, + sizeof(val32), ©_offset, + ©_count, ®ister_offset)) { u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1); u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0); @@ -215,8 +192,9 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, return -EFAULT; } - if (range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, sizeof(val16), - ©_offset, ©_count, ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { /* * Transitional devices use the PCI subsystem device id as * virtio device id, same as legacy driver always did. 
@@ -227,8 +205,9 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, return -EFAULT; } - if (range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, sizeof(val16), - ©_offset, ©_count, ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET); if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count)) @@ -270,19 +249,20 @@ static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev, loff_t copy_offset; size_t copy_count; - if (range_intersect_range(pos, count, PCI_COMMAND, sizeof(virtvdev->pci_cmd), - ©_offset, ©_count, - ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, + sizeof(virtvdev->pci_cmd), + ©_offset, ©_count, + ®ister_offset)) { if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset, buf + copy_offset, copy_count)) return -EFAULT; } - if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, - sizeof(virtvdev->pci_base_addr_0), - ©_offset, ©_count, - ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, + sizeof(virtvdev->pci_base_addr_0), + ©_offset, ©_count, + ®ister_offset)) { if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset, buf + copy_offset, copy_count)) -- cgit v1.2.3
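
For reference, the overlap arithmetic introduced above as vfio_pci_core_range_intersect_range() (hoisted out of the virtio variant driver's private range_intersect_range()) can be exercised on its own. The sketch below is an illustrative userspace rendering under stated assumptions, not the exported kernel symbol: long long stands in for loff_t, a local min_size() replaces min_t(), and the 2-byte PCI_DEVICE_ID register at config offset 0x2 is written as literals.

/*
 * Standalone illustration of the overlap computation performed by
 * vfio_pci_core_range_intersect_range().  Userspace sketch only:
 * "long long" stands in for loff_t and min_size() for min_t().
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static size_t min_size(size_t a, size_t b)
{
	return a < b ? a : b;
}

static bool range_intersect_range(long long buf_start, size_t buf_cnt,
				  long long reg_start, size_t reg_cnt,
				  long long *buf_offset,
				  size_t *intersect_count,
				  size_t *register_offset)
{
	/* Buffer starts at or before the register: overlap begins at reg_start */
	if (buf_start <= reg_start && buf_start + (long long)buf_cnt > reg_start) {
		*buf_offset = reg_start - buf_start;
		*intersect_count = min_size(reg_cnt,
				(size_t)(buf_start + (long long)buf_cnt - reg_start));
		*register_offset = 0;
		return true;
	}

	/* Buffer starts inside the register: overlap begins at buf_start */
	if (buf_start > reg_start && buf_start < reg_start + (long long)reg_cnt) {
		*buf_offset = 0;
		*intersect_count = min_size(buf_cnt,
				(size_t)(reg_start + (long long)reg_cnt - buf_start));
		*register_offset = buf_start - reg_start;
		return true;
	}

	return false;
}

int main(void)
{
	long long buf_offset;
	size_t intersect, reg_offset;

	/* 4-byte config read at offset 0 vs. the 2-byte PCI_DEVICE_ID at 0x2 */
	if (range_intersect_range(0, 4, 0x2, 2,
				  &buf_offset, &intersect, &reg_offset))
		printf("buf_offset=%lld intersect=%zu reg_offset=%zu\n",
		       buf_offset, intersect, reg_offset);

	/* 1-byte read at offset 3 lands inside the same register */
	if (range_intersect_range(3, 1, 0x2, 2,
				  &buf_offset, &intersect, &reg_offset))
		printf("buf_offset=%lld intersect=%zu reg_offset=%zu\n",
		       buf_offset, intersect, reg_offset);

	return 0;
}

The first call models the case virtiovf_pci_read_config() depends on: a 4-byte read starting at config offset 0 overlaps the device ID, so the driver knows to substitute VIRTIO_TRANS_ID_NET at buffer offset 2 for 2 bytes.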
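
Similarly, the reworked VFIO_DEVICE_GET_PCI_HOT_RESET_INFO path now counts the dependent devices up front, reports the required count alongside -ENOSPC when the caller's array is too small, and otherwise fills a kernel-allocated array that reaches userspace through a single copy_to_user(). A minimal userspace analogue of that two-pass shape is sketched below; the device list, dependent_device layout, and helpers are illustrative stand-ins, not the VFIO UAPI.

/*
 * Userspace sketch of the "count, then allocate and fill" pattern used by
 * vfio_pci_ioctl_get_pci_hot_reset_info().  The device list and the
 * dependent_device layout are illustrative stand-ins, not the VFIO UAPI.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct dependent_device {
	unsigned int segment, bus, devfn;
};

/* Pretend these are the devices found on the slot/bus being reset. */
static const struct dependent_device slot_devices[] = {
	{ 0, 0x03, 0x00 },
	{ 0, 0x03, 0x08 },
	{ 0, 0x03, 0x10 },
};

static int get_hot_reset_info(struct dependent_device *user_buf,
			      size_t user_capacity, size_t *count_out)
{
	size_t count = sizeof(slot_devices) / sizeof(slot_devices[0]);
	struct dependent_device *tmp;

	/* Pass 1: report how many entries the caller needs room for. */
	*count_out = count;
	if (count > user_capacity)
		return -ENOSPC;

	/* Pass 2: fill a private array, then hand it over in one copy. */
	tmp = calloc(count, sizeof(*tmp));
	if (!tmp)
		return -ENOMEM;
	memcpy(tmp, slot_devices, count * sizeof(*tmp));

	memcpy(user_buf, tmp, count * sizeof(*tmp)); /* copy_to_user() stand-in */
	free(tmp);
	return 0;
}

int main(void)
{
	struct dependent_device buf[8];
	size_t needed;
	int ret;

	ret = get_hot_reset_info(buf, 1, &needed);      /* too small on purpose */
	printf("first call: ret=%d, needed=%zu\n", ret, needed);

	ret = get_hot_reset_info(buf, needed, &needed); /* retry with enough room */
	printf("second call: ret=%d, devfn[1]=%#x\n", ret, buf[1].devfn);
	return 0;
}

Gathering into a private buffer before copying also means that a mid-walk failure, such as the -EAGAIN returned when the topology changed after counting, never leaves the user's array partially written, unlike the previous per-device copy_to_user() loop.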