summaryrefslogtreecommitdiffstats
path: root/drivers/vfio/pci
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-07 13:11:40 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-07 13:11:40 +0000
commit8b0a8165cdad0f4133837d753649ef4682e42c3b (patch)
tree5c58f869f31ddb1f7bd6e8bdea269b680b36c5b6 /drivers/vfio/pci
parentReleasing progress-linux version 6.8.12-1~progress7.99u1. (diff)
downloadlinux-8b0a8165cdad0f4133837d753649ef4682e42c3b.tar.xz
linux-8b0a8165cdad0f4133837d753649ef4682e42c3b.zip
Merging upstream version 6.9.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/vfio/pci')
-rw-r--r--drivers/vfio/pci/Kconfig2
-rw-r--r--drivers/vfio/pci/Makefile2
-rw-r--r--drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c48
-rw-r--r--drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h6
-rw-r--r--drivers/vfio/pci/mlx5/cmd.c157
-rw-r--r--drivers/vfio/pci/mlx5/cmd.h11
-rw-r--r--drivers/vfio/pci/mlx5/main.c148
-rw-r--r--drivers/vfio/pci/nvgrace-gpu/Kconfig10
-rw-r--r--drivers/vfio/pci/nvgrace-gpu/Makefile3
-rw-r--r--drivers/vfio/pci/nvgrace-gpu/main.c888
-rw-r--r--drivers/vfio/pci/pds/dirty.c6
-rw-r--r--drivers/vfio/pci/pds/pci_drv.c27
-rw-r--r--drivers/vfio/pci/pds/vfio_dev.c45
-rw-r--r--drivers/vfio/pci/pds/vfio_dev.h8
-rw-r--r--drivers/vfio/pci/vfio_pci_config.c42
-rw-r--r--drivers/vfio/pci/vfio_pci_core.c98
-rw-r--r--drivers/vfio/pci/vfio_pci_intrs.c4
-rw-r--r--drivers/vfio/pci/vfio_pci_rdwr.c16
-rw-r--r--drivers/vfio/pci/virtio/main.c72
19 files changed, 1262 insertions, 331 deletions
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 18c397df56..15821a2d77 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -67,4 +67,6 @@ source "drivers/vfio/pci/pds/Kconfig"
source "drivers/vfio/pci/virtio/Kconfig"
+source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
+
endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 046139a4ec..ce7a61f1d9 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -15,3 +15,5 @@ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
obj-$(CONFIG_PDS_VFIO_PCI) += pds/
obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/
+
+obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 4d27465c8f..9a3e97108a 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -630,25 +630,11 @@ static void hisi_acc_vf_disable_fds(struct hisi_acc_vf_core_device *hisi_acc_vde
}
}
-/*
- * This function is called in all state_mutex unlock cases to
- * handle a 'deferred_reset' if exists.
- */
-static void
-hisi_acc_vf_state_mutex_unlock(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+static void hisi_acc_vf_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev)
{
-again:
- spin_lock(&hisi_acc_vdev->reset_lock);
- if (hisi_acc_vdev->deferred_reset) {
- hisi_acc_vdev->deferred_reset = false;
- spin_unlock(&hisi_acc_vdev->reset_lock);
- hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
- hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
- hisi_acc_vf_disable_fds(hisi_acc_vdev);
- goto again;
- }
- mutex_unlock(&hisi_acc_vdev->state_mutex);
- spin_unlock(&hisi_acc_vdev->reset_lock);
+ hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
+ hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ hisi_acc_vf_disable_fds(hisi_acc_vdev);
}
static void hisi_acc_vf_start_device(struct hisi_acc_vf_core_device *hisi_acc_vdev)
@@ -804,8 +790,10 @@ static long hisi_acc_vf_precopy_ioctl(struct file *filp,
info.dirty_bytes = 0;
info.initial_bytes = migf->total_length - *pos;
+ mutex_unlock(&migf->lock);
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
- ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+ return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
out:
mutex_unlock(&migf->lock);
mutex_unlock(&hisi_acc_vdev->state_mutex);
@@ -1071,7 +1059,7 @@ hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev,
break;
}
}
- hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev);
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
return res;
}
@@ -1092,7 +1080,7 @@ hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev,
mutex_lock(&hisi_acc_vdev->state_mutex);
*curr_state = hisi_acc_vdev->mig_state;
- hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev);
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
return 0;
}
@@ -1104,21 +1092,9 @@ static void hisi_acc_vf_pci_aer_reset_done(struct pci_dev *pdev)
VFIO_MIGRATION_STOP_COPY)
return;
- /*
- * As the higher VFIO layers are holding locks across reset and using
- * those same locks with the mm_lock we need to prevent ABBA deadlock
- * with the state_mutex and mm_lock.
- * In case the state_mutex was taken already we defer the cleanup work
- * to the unlock flow of the other running context.
- */
- spin_lock(&hisi_acc_vdev->reset_lock);
- hisi_acc_vdev->deferred_reset = true;
- if (!mutex_trylock(&hisi_acc_vdev->state_mutex)) {
- spin_unlock(&hisi_acc_vdev->reset_lock);
- return;
- }
- spin_unlock(&hisi_acc_vdev->reset_lock);
- hisi_acc_vf_state_mutex_unlock(hisi_acc_vdev);
+ mutex_lock(&hisi_acc_vdev->state_mutex);
+ hisi_acc_vf_reset(hisi_acc_vdev);
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
}
static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
index dcabfeec6c..5bab46602f 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -98,8 +98,8 @@ struct hisi_acc_vf_migration_file {
struct hisi_acc_vf_core_device {
struct vfio_pci_core_device core_device;
- u8 match_done:1;
- u8 deferred_reset:1;
+ u8 match_done;
+
/* For migration state */
struct mutex state_mutex;
enum vfio_device_mig_state mig_state;
@@ -109,8 +109,6 @@ struct hisi_acc_vf_core_device {
struct hisi_qm vf_qm;
u32 vf_qm_state;
int vf_id;
- /* For reset handler */
- spinlock_t reset_lock;
struct hisi_acc_vf_migration_file *resuming_migf;
struct hisi_acc_vf_migration_file *saving_migf;
};
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index efd1d252cd..41a4b0cf42 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -108,8 +108,9 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
if (ret)
return ret;
- if (mvdev->saving_migf->state ==
- MLX5_MIGF_STATE_PRE_COPY_ERROR) {
+ /* Upon cleanup, ignore previous pre_copy error state */
+ if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR &&
+ !(query_flags & MLX5VF_QUERY_CLEANUP)) {
/*
* In case we had a PRE_COPY error, only query full
* image for final image
@@ -121,6 +122,11 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
}
query_flags &= ~MLX5VF_QUERY_INC;
}
+ /* Block incremental query which is state-dependent */
+ if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) {
+ complete(&mvdev->saving_migf->save_comp);
+ return -ENODEV;
+ }
}
MLX5_SET(query_vhca_migration_state_in, in, opcode,
@@ -149,6 +155,12 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
return 0;
}
+static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev)
+{
+ mvdev->tracker.object_changed = true;
+ complete(&mvdev->tracker_comp);
+}
+
static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
/* Mark the tracker under an error and wake it up if it's running */
@@ -189,7 +201,7 @@ void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
/* Must be done outside the lock to let it progress */
set_tracker_error(mvdev);
mutex_lock(&mvdev->state_mutex);
- mlx5vf_disable_fds(mvdev);
+ mlx5vf_disable_fds(mvdev, NULL);
_mlx5vf_free_page_tracker_resources(mvdev);
mlx5vf_state_mutex_unlock(mvdev);
}
@@ -221,6 +233,10 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
if (!MLX5_CAP_GEN(mvdev->mdev, migration))
goto end;
+ if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
+ MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)))
+ goto end;
+
mvdev->vf_id = pci_iov_vf_id(pdev);
if (mvdev->vf_id < 0)
goto end;
@@ -250,17 +266,14 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
mvdev->migrate_cap = 1;
mvdev->core_device.vdev.migration_flags =
VFIO_MIGRATION_STOP_COPY |
- VFIO_MIGRATION_P2P;
+ VFIO_MIGRATION_P2P |
+ VFIO_MIGRATION_PRE_COPY;
+
mvdev->core_device.vdev.mig_ops = mig_ops;
init_completion(&mvdev->tracker_comp);
if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
mvdev->core_device.vdev.log_ops = log_ops;
- if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
- MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
- mvdev->core_device.vdev.migration_flags |=
- VFIO_MIGRATION_PRE_COPY;
-
if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
mvdev->chunk_mode = 1;
@@ -402,6 +415,50 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
kfree(buf);
}
+static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+ unsigned int npages)
+{
+ unsigned int to_alloc = npages;
+ struct page **page_list;
+ unsigned long filled;
+ unsigned int to_fill;
+ int ret;
+
+ to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
+ page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
+ if (!page_list)
+ return -ENOMEM;
+
+ do {
+ filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
+ page_list);
+ if (!filled) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ to_alloc -= filled;
+ ret = sg_alloc_append_table_from_pages(
+ &buf->table, page_list, filled, 0,
+ filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
+ GFP_KERNEL_ACCOUNT);
+
+ if (ret)
+ goto err;
+ buf->allocated_length += filled * PAGE_SIZE;
+ /* clean input for another bulk allocation */
+ memset(page_list, 0, filled * sizeof(*page_list));
+ to_fill = min_t(unsigned int, to_alloc,
+ PAGE_SIZE / sizeof(*page_list));
+ } while (to_alloc > 0);
+
+ kvfree(page_list);
+ return 0;
+
+err:
+ kvfree(page_list);
+ return ret;
+}
+
struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
size_t length,
@@ -608,8 +665,13 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
err:
/* The error flow can't run from an interrupt context */
- if (status == -EREMOTEIO)
+ if (status == -EREMOTEIO) {
status = MLX5_GET(save_vhca_state_out, async_data->out, status);
+ /* Failed in FW, print cmd out failure details */
+ mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0,
+ async_data->out);
+ }
+
async_data->status = status;
queue_work(migf->mvdev->cb_wq, &async_data->work);
}
@@ -623,6 +685,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
struct mlx5_vhca_data_buffer *header_buf = NULL;
struct mlx5vf_async_data *async_data;
+ bool pre_copy_cleanup = false;
int err;
lockdep_assert_held(&mvdev->state_mutex);
@@ -633,6 +696,10 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
if (err)
return err;
+ if ((migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc)
+ pre_copy_cleanup = true;
+
if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
/*
* In case we had a PRE_COPY error, SAVE is triggered only for
@@ -651,29 +718,27 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
async_data = &migf->async_data;
async_data->buf = buf;
- async_data->stop_copy_chunk = !track;
+ async_data->stop_copy_chunk = (!track && !pre_copy_cleanup);
async_data->out = kvzalloc(out_size, GFP_KERNEL);
if (!async_data->out) {
err = -ENOMEM;
goto err_out;
}
- if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
- if (async_data->stop_copy_chunk) {
- u8 header_idx = buf->stop_copy_chunk_num ?
- buf->stop_copy_chunk_num - 1 : 0;
+ if (async_data->stop_copy_chunk) {
+ u8 header_idx = buf->stop_copy_chunk_num ?
+ buf->stop_copy_chunk_num - 1 : 0;
- header_buf = migf->buf_header[header_idx];
- migf->buf_header[header_idx] = NULL;
- }
+ header_buf = migf->buf_header[header_idx];
+ migf->buf_header[header_idx] = NULL;
+ }
- if (!header_buf) {
- header_buf = mlx5vf_get_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
- if (IS_ERR(header_buf)) {
- err = PTR_ERR(header_buf);
- goto err_free;
- }
+ if (!header_buf) {
+ header_buf = mlx5vf_get_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(header_buf)) {
+ err = PTR_ERR(header_buf);
+ goto err_free;
}
}
@@ -900,6 +965,29 @@ static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}
+static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_page_tracker *tracker)
+{
+ u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+ void *obj_context;
+ void *cmd_hdr;
+ int err;
+
+ cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id);
+
+ err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+ if (err)
+ return err;
+
+ obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context);
+ tracker->status = MLX5_GET(page_track, obj_context, state);
+ return 0;
+}
+
static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
struct mlx5_vhca_cq_buf *buf, int nent,
int cqe_size)
@@ -957,9 +1045,11 @@ static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
struct mlx5vf_pci_core_device *mvdev = container_of(
tracker, struct mlx5vf_pci_core_device, tracker);
+ struct mlx5_eqe_obj_change *object;
struct mlx5_eqe *eqe = data;
u8 event_type = (u8)type;
u8 queue_type;
+ u32 obj_id;
int qp_num;
switch (event_type) {
@@ -975,6 +1065,12 @@ static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
break;
set_tracker_error(mvdev);
break;
+ case MLX5_EVENT_TYPE_OBJECT_CHANGE:
+ object = &eqe->data.obj_change;
+ obj_id = be32_to_cpu(object->obj_id);
+ if (obj_id == tracker->id)
+ set_tracker_change_event(mvdev);
+ break;
default:
break;
}
@@ -1634,6 +1730,11 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
goto end;
}
+ if (tracker->is_err) {
+ err = -EIO;
+ goto end;
+ }
+
mdev = mvdev->mdev;
err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
MLX5_PAGE_TRACK_STATE_REPORTING);
@@ -1652,6 +1753,12 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
dirty, &tracker->status);
if (poll_err == CQ_EMPTY) {
wait_for_completion(&mvdev->tracker_comp);
+ if (tracker->object_changed) {
+ tracker->object_changed = false;
+ err = mlx5vf_cmd_query_tracker(mdev, tracker);
+ if (err)
+ goto end;
+ }
continue;
}
}
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index f2c7227fa6..df421dc6de 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -13,9 +13,6 @@
#include <linux/mlx5/cq.h>
#include <linux/mlx5/qp.h>
-#define MLX5VF_PRE_COPY_SUPP(mvdev) \
- ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY)
-
enum mlx5_vf_migf_state {
MLX5_MIGF_STATE_ERROR = 1,
MLX5_MIGF_STATE_PRE_COPY_ERROR,
@@ -25,7 +22,6 @@ enum mlx5_vf_migf_state {
};
enum mlx5_vf_load_state {
- MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER,
MLX5_VF_LOAD_STATE_READ_HEADER,
MLX5_VF_LOAD_STATE_PREP_HEADER_DATA,
MLX5_VF_LOAD_STATE_READ_HEADER_DATA,
@@ -162,6 +158,7 @@ struct mlx5_vhca_page_tracker {
u32 id;
u32 pdn;
u8 is_err:1;
+ u8 object_changed:1;
struct mlx5_uars_page *uar;
struct mlx5_vhca_cq cq;
struct mlx5_vhca_qp *host_qp;
@@ -196,6 +193,7 @@ struct mlx5vf_pci_core_device {
enum {
MLX5VF_QUERY_INC = (1UL << 0),
MLX5VF_QUERY_FINAL = (1UL << 1),
+ MLX5VF_QUERY_CLEANUP = (1UL << 2),
};
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
@@ -226,12 +224,11 @@ struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
size_t length, enum dma_data_direction dma_dir);
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
-int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
- unsigned int npages);
struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
unsigned long offset);
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
-void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev);
+void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
+ enum mlx5_vf_migf_state *last_save_state);
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
u8 chunk_num, size_t next_required_umem_size);
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index fe09a8c8af..61d9b0f914 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -65,50 +65,6 @@ mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
return NULL;
}
-int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
- unsigned int npages)
-{
- unsigned int to_alloc = npages;
- struct page **page_list;
- unsigned long filled;
- unsigned int to_fill;
- int ret;
-
- to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
- page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
- if (!page_list)
- return -ENOMEM;
-
- do {
- filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
- page_list);
- if (!filled) {
- ret = -ENOMEM;
- goto err;
- }
- to_alloc -= filled;
- ret = sg_alloc_append_table_from_pages(
- &buf->table, page_list, filled, 0,
- filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
- GFP_KERNEL_ACCOUNT);
-
- if (ret)
- goto err;
- buf->allocated_length += filled * PAGE_SIZE;
- /* clean input for another bulk allocation */
- memset(page_list, 0, filled * sizeof(*page_list));
- to_fill = min_t(unsigned int, to_alloc,
- PAGE_SIZE / sizeof(*page_list));
- } while (to_alloc > 0);
-
- kvfree(page_list);
- return 0;
-
-err:
- kvfree(page_list);
- return ret;
-}
-
static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
mutex_lock(&migf->lock);
@@ -777,36 +733,6 @@ mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
return 0;
}
-static int
-mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
- loff_t requested_length,
- const char __user **buf, size_t *len,
- loff_t *pos, ssize_t *done)
-{
- int ret;
-
- if (requested_length > MAX_LOAD_SIZE)
- return -ENOMEM;
-
- if (vhca_buf->allocated_length < requested_length) {
- ret = mlx5vf_add_migration_pages(
- vhca_buf,
- DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
- PAGE_SIZE));
- if (ret)
- return ret;
- }
-
- while (*len) {
- ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
- done);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
struct mlx5_vhca_data_buffer *vhca_buf,
@@ -1038,13 +964,6 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
break;
}
- case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
- ret = mlx5vf_resume_read_image_no_header(vhca_buf,
- requested_length,
- &buf, &len, pos, &done);
- if (ret)
- goto out_unlock;
- break;
case MLX5_VF_LOAD_STATE_READ_IMAGE:
ret = mlx5vf_resume_read_image(migf, vhca_buf,
migf->record_size,
@@ -1114,21 +1033,16 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
}
migf->buf[0] = buf;
- if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
- buf = mlx5vf_alloc_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
- if (IS_ERR(buf)) {
- ret = PTR_ERR(buf);
- goto out_buf;
- }
-
- migf->buf_header[0] = buf;
- migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
- } else {
- /* Initial state will be to read the image */
- migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
+ buf = mlx5vf_alloc_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_buf;
}
+ migf->buf_header[0] = buf;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
INIT_LIST_HEAD(&migf->buf_list);
@@ -1146,7 +1060,8 @@ end:
return ERR_PTR(ret);
}
-void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
+void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
+ enum mlx5_vf_migf_state *last_save_state)
{
if (mvdev->resuming_migf) {
mlx5vf_disable_fd(mvdev->resuming_migf);
@@ -1157,6 +1072,8 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
if (mvdev->saving_migf) {
mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
cancel_work_sync(&mvdev->saving_migf->async_data.work);
+ if (last_save_state)
+ *last_save_state = mvdev->saving_migf->state;
mlx5vf_disable_fd(mvdev->saving_migf);
wake_up_interruptible(&mvdev->saving_migf->poll_wait);
mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
@@ -1217,12 +1134,34 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return migf->filp;
}
- if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
- (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
+ mlx5vf_disable_fds(mvdev, NULL);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
- mlx5vf_disable_fds(mvdev);
- return NULL;
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+ struct mlx5_vhca_data_buffer *buf;
+ enum mlx5_vf_migf_state state;
+ size_t size;
+
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
+ MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
+ if (ret)
+ return ERR_PTR(ret);
+ buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
+ if (IS_ERR(buf))
+ return ERR_CAST(buf);
+ /* pre_copy cleanup */
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
+ if (ret) {
+ mlx5vf_put_data_buffer(buf);
+ return ERR_PTR(ret);
+ }
+ mlx5vf_disable_fds(mvdev, &state);
+ return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
}
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
@@ -1237,14 +1176,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
}
if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
- if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
- ret = mlx5vf_cmd_load_vhca_state(mvdev,
- mvdev->resuming_migf,
- mvdev->resuming_migf->buf[0]);
- if (ret)
- return ERR_PTR(ret);
- }
- mlx5vf_disable_fds(mvdev);
+ mlx5vf_disable_fds(mvdev, NULL);
return NULL;
}
@@ -1289,7 +1221,7 @@ again:
mvdev->deferred_reset = false;
spin_unlock(&mvdev->reset_lock);
mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
- mlx5vf_disable_fds(mvdev);
+ mlx5vf_disable_fds(mvdev, NULL);
goto again;
}
mutex_unlock(&mvdev->state_mutex);
diff --git a/drivers/vfio/pci/nvgrace-gpu/Kconfig b/drivers/vfio/pci/nvgrace-gpu/Kconfig
new file mode 100644
index 0000000000..a7f624b37e
--- /dev/null
+++ b/drivers/vfio/pci/nvgrace-gpu/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NVGRACE_GPU_VFIO_PCI
+ tristate "VFIO support for the GPU in the NVIDIA Grace Hopper Superchip"
+ depends on ARM64 || (COMPILE_TEST && 64BIT)
+ select VFIO_PCI_CORE
+ help
+ VFIO support for the GPU in the NVIDIA Grace Hopper Superchip is
+ required to assign the GPU device to userspace using KVM/qemu/etc.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/nvgrace-gpu/Makefile b/drivers/vfio/pci/nvgrace-gpu/Makefile
new file mode 100644
index 0000000000..3ca8c18789
--- /dev/null
+++ b/drivers/vfio/pci/nvgrace-gpu/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu-vfio-pci.o
+nvgrace-gpu-vfio-pci-y := main.o
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
new file mode 100644
index 0000000000..a7fd018aa5
--- /dev/null
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -0,0 +1,888 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/sizes.h>
+#include <linux/vfio_pci_core.h>
+
+/*
+ * The device memory usable to the workloads running in the VM is cached
+ * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region)
+ * to the VM and is represented as usemem.
+ * Moreover, the VM GPU device driver needs a non-cacheable region to
+ * support the MIG feature. This region is also exposed as a 64b BAR
+ * (comprising of BAR2 and BAR3 region) and represented as resmem.
+ */
+#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
+#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
+
+/* Memory size expected as non cached and reserved by the VM driver */
+#define RESMEM_SIZE SZ_1G
+
+/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
+#define MEMBLK_SIZE SZ_512M
+
+/*
+ * The state of the two device memory region - resmem and usemem - is
+ * saved as struct mem_region.
+ */
+struct mem_region {
+ phys_addr_t memphys; /* Base physical address of the region */
+ size_t memlength; /* Region size */
+ size_t bar_size; /* Reported region BAR size */
+ __le64 bar_val; /* Emulated BAR offset registers */
+ union {
+ void *memaddr;
+ void __iomem *ioaddr;
+ }; /* Base virtual address of the region */
+};
+
+struct nvgrace_gpu_pci_core_device {
+ struct vfio_pci_core_device core_device;
+ /* Cached and usable memory for the VM. */
+ struct mem_region usemem;
+ /* Non cached memory carved out from the end of device memory */
+ struct mem_region resmem;
+ /* Lock to control device memory kernel mapping */
+ struct mutex remap_lock;
+};
+
+static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+
+ nvdev->resmem.bar_val = 0;
+ nvdev->usemem.bar_val = 0;
+}
+
+/* Choose the structure corresponding to the fake BAR with a given index. */
+static struct mem_region *
+nvgrace_gpu_memregion(int index,
+ struct nvgrace_gpu_pci_core_device *nvdev)
+{
+ if (index == USEMEM_REGION_INDEX)
+ return &nvdev->usemem;
+
+ if (index == RESMEM_REGION_INDEX)
+ return &nvdev->resmem;
+
+ return NULL;
+}
+
+static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
+{
+ struct vfio_pci_core_device *vdev =
+ container_of(core_vdev, struct vfio_pci_core_device, vdev);
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ int ret;
+
+ ret = vfio_pci_core_enable(vdev);
+ if (ret)
+ return ret;
+
+ if (nvdev->usemem.memlength) {
+ nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
+ mutex_init(&nvdev->remap_lock);
+ }
+
+ vfio_pci_core_finish_enable(vdev);
+
+ return 0;
+}
+
+static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+
+ /* Unmap the mapping to the device memory cached region */
+ if (nvdev->usemem.memaddr) {
+ memunmap(nvdev->usemem.memaddr);
+ nvdev->usemem.memaddr = NULL;
+ }
+
+ /* Unmap the mapping to the device memory non-cached region */
+ if (nvdev->resmem.ioaddr) {
+ iounmap(nvdev->resmem.ioaddr);
+ nvdev->resmem.ioaddr = NULL;
+ }
+
+ mutex_destroy(&nvdev->remap_lock);
+
+ vfio_pci_core_close_device(core_vdev);
+}
+
+static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
+ struct vm_area_struct *vma)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ struct mem_region *memregion;
+ unsigned long start_pfn;
+ u64 req_len, pgoff, end;
+ unsigned int index;
+ int ret = 0;
+
+ index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+ if (!memregion)
+ return vfio_pci_core_mmap(core_vdev, vma);
+
+ /*
+ * Request to mmap the BAR. Map to the CPU accessible memory on the
+ * GPU using the memory information gathered from the system ACPI
+ * tables.
+ */
+ pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+ if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
+ check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
+ check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
+ return -EOVERFLOW;
+
+ /*
+ * Check that the mapping request does not go beyond available device
+ * memory size
+ */
+ if (end > memregion->memlength)
+ return -EINVAL;
+
+ /*
+ * The carved out region of the device memory needs the NORMAL_NC
+ * property. Communicate as such to the hypervisor.
+ */
+ if (index == RESMEM_REGION_INDEX) {
+ /*
+ * The nvgrace-gpu module has no issues with uncontained
+ * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is
+ * set to communicate to the KVM to S2 map as NORMAL_NC.
+ * This opens up guest usage of NORMAL_NC for this mapping.
+ */
+ vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED);
+
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+ }
+
+ /*
+ * Perform a PFN map to the memory and back the device BAR by the
+ * GPU memory.
+ *
+ * The available GPU memory size may not be power-of-2 aligned. The
+ * remainder is only backed by vfio_device_ops read/write handlers.
+ *
+ * During device reset, the GPU is safely disconnected to the CPU
+ * and access to the BAR will be immediately returned preventing
+ * machine check.
+ */
+ ret = remap_pfn_range(vma, vma->vm_start, start_pfn,
+ req_len, vma->vm_page_prot);
+ if (ret)
+ return ret;
+
+ vma->vm_pgoff = start_pfn;
+
+ return 0;
+}
+
+static long
+nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
+ unsigned long arg)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+ struct vfio_region_info_cap_sparse_mmap *sparse;
+ struct vfio_region_info info;
+ struct mem_region *memregion;
+ u32 size;
+ int ret;
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ /*
+ * Request to determine the BAR region information. Send the
+ * GPU memory information.
+ */
+ memregion = nvgrace_gpu_memregion(info.index, nvdev);
+ if (!memregion)
+ return vfio_pci_core_ioctl(core_vdev,
+ VFIO_DEVICE_GET_REGION_INFO, arg);
+
+ size = struct_size(sparse, areas, 1);
+
+ /*
+ * Setup for sparse mapping for the device memory. Only the
+ * available device memory on the hardware is shown as a
+ * mappable region.
+ */
+ sparse = kzalloc(size, GFP_KERNEL);
+ if (!sparse)
+ return -ENOMEM;
+
+ sparse->nr_areas = 1;
+ sparse->areas[0].offset = 0;
+ sparse->areas[0].size = memregion->memlength;
+ sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+ sparse->header.version = 1;
+
+ ret = vfio_info_add_capability(&caps, &sparse->header, size);
+ kfree(sparse);
+ if (ret)
+ return ret;
+
+ info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ /*
+ * The region memory size may not be power-of-2 aligned.
+ * Given that the memory as a BAR and may not be
+ * aligned, roundup to the next power-of-2.
+ */
+ info.size = memregion->bar_size;
+ info.flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP;
+
+ if (caps.size) {
+ info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
+ if (info.argsz < sizeof(info) + caps.size) {
+ info.argsz = sizeof(info) + caps.size;
+ info.cap_offset = 0;
+ } else {
+ vfio_info_cap_shift(&caps, sizeof(info));
+ if (copy_to_user((void __user *)arg +
+ sizeof(info), caps.buf,
+ caps.size)) {
+ kfree(caps.buf);
+ return -EFAULT;
+ }
+ info.cap_offset = sizeof(info);
+ }
+ kfree(caps.buf);
+ }
+ return copy_to_user((void __user *)arg, &info, minsz) ?
+ -EFAULT : 0;
+}
+
+static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
+ unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case VFIO_DEVICE_GET_REGION_INFO:
+ return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg);
+ case VFIO_DEVICE_IOEVENTFD:
+ return -ENOTTY;
+ case VFIO_DEVICE_RESET:
+ nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
+ fallthrough;
+ default:
+ return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+ }
+}
+
+static __le64
+nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64)
+{
+ u64 tmp_val;
+
+ tmp_val = le64_to_cpu(val64);
+ tmp_val &= ~(bar_size - 1);
+ tmp_val |= flags;
+
+ return cpu_to_le64(tmp_val);
+}
+
+/*
+ * Both the usable (usemem) and the reserved (resmem) device memory region
+ * are exposed as a 64b fake device BARs in the VM. These fake BARs must
+ * respond to the accesses on their respective PCI config space offsets.
+ *
+ * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3.
+ * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5.
+ */
+static ssize_t
+nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ struct mem_region *memregion = NULL;
+ __le64 val64;
+ size_t register_offset;
+ loff_t copy_offset;
+ size_t copy_count;
+ int ret;
+
+ ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
+ if (ret < 0)
+ return ret;
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
+ sizeof(val64),
+ &copy_offset, &copy_count,
+ &register_offset))
+ memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
+ else if (vfio_pci_core_range_intersect_range(pos, count,
+ PCI_BASE_ADDRESS_4,
+ sizeof(val64),
+ &copy_offset, &copy_count,
+ &register_offset))
+ memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);
+
+ if (memregion) {
+ val64 = nvgrace_gpu_get_read_value(memregion->bar_size,
+ PCI_BASE_ADDRESS_MEM_TYPE_64 |
+ PCI_BASE_ADDRESS_MEM_PREFETCH,
+ memregion->bar_val);
+ if (copy_to_user(buf + copy_offset,
+ (void *)&val64 + register_offset, copy_count)) {
+ /*
+ * The position has been incremented in
+ * vfio_pci_core_read. Reset the offset back to the
+ * starting position.
+ */
+ *ppos -= count;
+ return -EFAULT;
+ }
+ }
+
+ return count;
+}
+
+static ssize_t
+nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
+ const char __user *buf, size_t count, loff_t *ppos)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ struct mem_region *memregion = NULL;
+ size_t register_offset;
+ loff_t copy_offset;
+ size_t copy_count;
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
+ sizeof(u64), &copy_offset,
+ &copy_count, &register_offset))
+ memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
+ else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4,
+ sizeof(u64), &copy_offset,
+ &copy_count, &register_offset))
+ memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);
+
+ if (memregion) {
+ if (copy_from_user((void *)&memregion->bar_val + register_offset,
+ buf + copy_offset, copy_count))
+ return -EFAULT;
+ *ppos += copy_count;
+ return copy_count;
+ }
+
+ return vfio_pci_core_write(core_vdev, buf, count, ppos);
+}
+
+/*
+ * Ad hoc map the device memory in the module kernel VA space. Primarily needed
+ * as vfio does not require the userspace driver to only perform accesses through
+ * mmaps of the vfio-pci BAR regions and such accesses should be supported using
+ * vfio_device_ops read/write implementations.
+ *
+ * The usemem region is cacheable memory and hence is memremaped.
+ * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC).
+ */
+static int
+nvgrace_gpu_map_device_mem(int index,
+ struct nvgrace_gpu_pci_core_device *nvdev)
+{
+ struct mem_region *memregion;
+ int ret = 0;
+
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+ if (!memregion)
+ return -EINVAL;
+
+ mutex_lock(&nvdev->remap_lock);
+
+ if (memregion->memaddr)
+ goto unlock;
+
+ if (index == USEMEM_REGION_INDEX)
+ memregion->memaddr = memremap(memregion->memphys,
+ memregion->memlength,
+ MEMREMAP_WB);
+ else
+ memregion->ioaddr = ioremap_wc(memregion->memphys,
+ memregion->memlength);
+
+ if (!memregion->memaddr)
+ ret = -ENOMEM;
+
+unlock:
+ mutex_unlock(&nvdev->remap_lock);
+
+ return ret;
+}
+
+/*
+ * Read the data from the device memory (mapped either through ioremap
+ * or memremap) into the user buffer.
+ */
+static int
+nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev,
+ char __user *buf, size_t mem_count, loff_t *ppos)
+{
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
+ int ret;
+
+ if (!mem_count)
+ return 0;
+
+ /*
+ * Handle read on the BAR regions. Map to the target device memory
+ * physical address and copy to the request read buffer.
+ */
+ ret = nvgrace_gpu_map_device_mem(index, nvdev);
+ if (ret)
+ return ret;
+
+ if (index == USEMEM_REGION_INDEX) {
+ if (copy_to_user(buf,
+ (u8 *)nvdev->usemem.memaddr + offset,
+ mem_count))
+ ret = -EFAULT;
+ } else {
+ /*
+ * The hardware ensures that the system does not crash when
+ * the device memory is accessed with the memory enable
+ * turned off. It synthesizes ~0 on such read. So there is
+ * no need to check or support the disablement/enablement of
+ * BAR through PCI_COMMAND config space register. Pass
+ * test_mem flag as false.
+ */
+ ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
+ nvdev->resmem.ioaddr,
+ buf, offset, mem_count,
+ 0, 0, false);
+ }
+
+ return ret;
+}
+
+/*
+ * Read count bytes from the device memory at an offset. The actual device
+ * memory size (available) may not be a power-of-2. So the driver fakes
+ * the size to a power-of-2 (reported) when exposing to a user space driver.
+ *
+ * Reads starting beyond the reported size generate -EINVAL; reads extending
+ * beyond the actual device size is filled with ~0; reads extending beyond
+ * the reported size are truncated.
+ */
+static ssize_t
+nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ struct mem_region *memregion;
+ size_t mem_count, i;
+ u8 val = 0xFF;
+ int ret;
+
+ /* No need to do NULL check as caller does. */
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+
+ if (offset >= memregion->bar_size)
+ return -EINVAL;
+
+ /* Clip short the read request beyond reported BAR size */
+ count = min(count, memregion->bar_size - (size_t)offset);
+
+ /*
+ * Determine how many bytes to be actually read from the device memory.
+ * Read request beyond the actual device memory size is filled with ~0,
+ * while those beyond the actual reported size is skipped.
+ */
+ if (offset >= memregion->memlength)
+ mem_count = 0;
+ else
+ mem_count = min(count, memregion->memlength - (size_t)offset);
+
+ ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
+ if (ret)
+ return ret;
+
+ /*
+ * Only the device memory present on the hardware is mapped, which may
+ * not be power-of-2 aligned. A read to an offset beyond the device memory
+ * size is filled with ~0.
+ */
+ for (i = mem_count; i < count; i++) {
+ ret = put_user(val, (unsigned char __user *)(buf + i));
+ if (ret)
+ return ret;
+ }
+
+ *ppos += count;
+ return count;
+}
+
+static ssize_t
+nvgrace_gpu_read(struct vfio_device *core_vdev,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+
+ if (nvgrace_gpu_memregion(index, nvdev))
+ return nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);
+
+ return vfio_pci_core_read(core_vdev, buf, count, ppos);
+}
+
+/*
+ * Write the data to the device memory (mapped either through ioremap
+ * or memremap) from the user buffer.
+ */
+static int
+nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev,
+ const char __user *buf, size_t mem_count,
+ loff_t *ppos)
+{
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ int ret;
+
+ if (!mem_count)
+ return 0;
+
+ ret = nvgrace_gpu_map_device_mem(index, nvdev);
+ if (ret)
+ return ret;
+
+ if (index == USEMEM_REGION_INDEX) {
+ if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos,
+ buf, mem_count))
+ return -EFAULT;
+ } else {
+ /*
+ * The hardware ensures that the system does not crash when
+ * the device memory is accessed with the memory enable
+ * turned off. It drops such writes. So there is no need to
+ * check or support the disablement/enablement of BAR
+ * through PCI_COMMAND config space register. Pass test_mem
+ * flag as false.
+ */
+ ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
+ nvdev->resmem.ioaddr,
+ (char __user *)buf, pos, mem_count,
+ 0, 0, true);
+ }
+
+ return ret;
+}
+
+/*
+ * Write count bytes to the device memory at a given offset. The actual device
+ * memory size (available) may not be a power-of-2. So the driver fakes the
+ * size to a power-of-2 (reported) when exposing to a user space driver.
+ *
+ * Writes extending beyond the reported size are truncated; writes starting
+ * beyond the reported size generate -EINVAL.
+ */
+static ssize_t
+nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
+ size_t count, loff_t *ppos, const char __user *buf)
+{
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
+ struct mem_region *memregion;
+ size_t mem_count;
+ int ret = 0;
+
+ /* No need to do NULL check as caller does. */
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+
+ if (offset >= memregion->bar_size)
+ return -EINVAL;
+
+ /* Clip short the write request beyond reported BAR size */
+ count = min(count, memregion->bar_size - (size_t)offset);
+
+ /*
+ * Determine how many bytes to be actually written to the device memory.
+ * Do not write to the offset beyond available size.
+ */
+ if (offset >= memregion->memlength)
+ goto exitfn;
+
+ /*
+ * Only the device memory present on the hardware is mapped, which may
+ * not be power-of-2 aligned. Drop access outside the available device
+ * memory on the hardware.
+ */
+ mem_count = min(count, memregion->memlength - (size_t)offset);
+
+ ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
+ if (ret)
+ return ret;
+
+exitfn:
+ *ppos += count;
+ return count;
+}
+
+static ssize_t
+nvgrace_gpu_write(struct vfio_device *core_vdev,
+ const char __user *buf, size_t count, loff_t *ppos)
+{
+ struct nvgrace_gpu_pci_core_device *nvdev =
+ container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
+ core_device.vdev);
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+
+ if (nvgrace_gpu_memregion(index, nvdev))
+ return nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);
+
+ return vfio_pci_core_write(core_vdev, buf, count, ppos);
+}
+
+static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
+ .name = "nvgrace-gpu-vfio-pci",
+ .init = vfio_pci_core_init_dev,
+ .release = vfio_pci_core_release_dev,
+ .open_device = nvgrace_gpu_open_device,
+ .close_device = nvgrace_gpu_close_device,
+ .ioctl = nvgrace_gpu_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = nvgrace_gpu_read,
+ .write = nvgrace_gpu_write,
+ .mmap = nvgrace_gpu_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
+ .name = "nvgrace-gpu-vfio-pci-core",
+ .init = vfio_pci_core_init_dev,
+ .release = vfio_pci_core_release_dev,
+ .open_device = nvgrace_gpu_open_device,
+ .close_device = vfio_pci_core_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static int
+nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
+ u64 *pmemphys, u64 *pmemlength)
+{
+ int ret;
+
+ /*
+ * The memory information is present in the system ACPI tables as DSD
+ * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size.
+ */
+ ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa",
+ pmemphys);
+ if (ret)
+ return ret;
+
+ if (*pmemphys > type_max(phys_addr_t))
+ return -EOVERFLOW;
+
+ ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size",
+ pmemlength);
+ if (ret)
+ return ret;
+
+ if (*pmemlength > type_max(size_t))
+ return -EOVERFLOW;
+
+ /*
+ * If the C2C link is not up due to an error, the coherent device
+ * memory size is returned as 0. Fail in such case.
+ */
+ if (*pmemlength == 0)
+ return -ENOMEM;
+
+ return ret;
+}
+
+static int
+nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
+ struct nvgrace_gpu_pci_core_device *nvdev,
+ u64 memphys, u64 memlength)
+{
+ int ret = 0;
+
+ /*
+ * The VM GPU device driver needs a non-cacheable region to support
+ * the MIG feature. Since the device memory is mapped as NORMAL cached,
+ * carve out a region from the end with a different NORMAL_NC
+ * property (called as reserved memory and represented as resmem). This
+ * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while
+ * exposing the rest (termed as usable memory and represented using usemem)
+ * as cacheable 64b BAR (region 4 and 5).
+ *
+ * devmem (memlength)
+ * |-------------------------------------------------|
+ * | |
+ * usemem.memphys resmem.memphys
+ */
+ nvdev->usemem.memphys = memphys;
+
+ /*
+ * The device memory exposed to the VM is added to the kernel by the
+ * VM driver module in chunks of memory block size. Only the usable
+ * memory (usemem) is added to the kernel for usage by the VM
+ * workloads. Make the usable memory size memblock aligned.
+ */
+ if (check_sub_overflow(memlength, RESMEM_SIZE,
+ &nvdev->usemem.memlength)) {
+ ret = -EOVERFLOW;
+ goto done;
+ }
+
+ /*
+ * The USEMEM part of the device memory has to be MEMBLK_SIZE
+ * aligned. This is a hardwired ABI value between the GPU FW and
+ * VFIO driver. The VM device driver is also aware of it and make
+ * use of the value for its calculation to determine USEMEM size.
+ */
+ nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
+ MEMBLK_SIZE);
+ if (nvdev->usemem.memlength == 0) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ if ((check_add_overflow(nvdev->usemem.memphys,
+ nvdev->usemem.memlength,
+ &nvdev->resmem.memphys)) ||
+ (check_sub_overflow(memlength, nvdev->usemem.memlength,
+ &nvdev->resmem.memlength))) {
+ ret = -EOVERFLOW;
+ goto done;
+ }
+
+ /*
+ * The memory regions are exposed as BARs. Calculate and save
+ * the BAR size for them.
+ */
+ nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
+ nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
+done:
+ return ret;
+}
+
+static int nvgrace_gpu_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops;
+ struct nvgrace_gpu_pci_core_device *nvdev;
+ u64 memphys, memlength;
+ int ret;
+
+ ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
+ if (!ret)
+ ops = &nvgrace_gpu_pci_ops;
+
+ nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev,
+ &pdev->dev, ops);
+ if (IS_ERR(nvdev))
+ return PTR_ERR(nvdev);
+
+ dev_set_drvdata(&pdev->dev, &nvdev->core_device);
+
+ if (ops == &nvgrace_gpu_pci_ops) {
+ /*
+ * Device memory properties are identified in the host ACPI
+ * table. Set the nvgrace_gpu_pci_core_device structure.
+ */
+ ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
+ memphys, memlength);
+ if (ret)
+ goto out_put_vdev;
+ }
+
+ ret = vfio_pci_core_register_device(&nvdev->core_device);
+ if (ret)
+ goto out_put_vdev;
+
+ return ret;
+
+out_put_vdev:
+ vfio_put_device(&nvdev->core_device.vdev);
+ return ret;
+}
+
+static void nvgrace_gpu_remove(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+ vfio_pci_core_unregister_device(core_device);
+ vfio_put_device(&core_device->vdev);
+}
+
+static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
+ /* GH200 120GB */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
+ /* GH200 480GB */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
+ {}
+};
+
+MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);
+
+static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = nvgrace_gpu_vfio_pci_table,
+ .probe = nvgrace_gpu_probe,
+ .remove = nvgrace_gpu_remove,
+ .err_handler = &vfio_pci_core_err_handlers,
+ .driver_managed_dma = true,
+};
+
+module_pci_driver(nvgrace_gpu_vfio_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>");
+MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>");
+MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");
diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c
index 8ddf4346fc..68e8f006df 100644
--- a/drivers/vfio/pci/pds/dirty.c
+++ b/drivers/vfio/pci/pds/dirty.c
@@ -607,7 +607,7 @@ int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
mutex_lock(&pds_vfio->state_mutex);
err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
- pds_vfio_state_mutex_unlock(pds_vfio);
+ mutex_unlock(&pds_vfio->state_mutex);
return err;
}
@@ -624,7 +624,7 @@ int pds_vfio_dma_logging_start(struct vfio_device *vdev,
mutex_lock(&pds_vfio->state_mutex);
pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
- pds_vfio_state_mutex_unlock(pds_vfio);
+ mutex_unlock(&pds_vfio->state_mutex);
return err;
}
@@ -637,7 +637,7 @@ int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
mutex_lock(&pds_vfio->state_mutex);
pds_vfio_dirty_disable(pds_vfio, true);
- pds_vfio_state_mutex_unlock(pds_vfio);
+ mutex_unlock(&pds_vfio->state_mutex);
return 0;
}
diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
index a34dda5166..16e93b11ab 100644
--- a/drivers/vfio/pci/pds/pci_drv.c
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -21,16 +21,13 @@
static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio)
{
- bool deferred_reset_needed = false;
-
/*
* Documentation states that the kernel migration driver must not
* generate asynchronous device state transitions outside of
* manipulation by the user or the VFIO_DEVICE_RESET ioctl.
*
* Since recovery is an asynchronous event received from the device,
- * initiate a deferred reset. Issue a deferred reset in the following
- * situations:
+ * initiate a reset in the following situations:
* 1. Migration is in progress, which will cause the next step of
* the migration to fail.
* 2. If the device is in a state that will be set to
@@ -42,24 +39,8 @@ static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio)
pds_vfio->state != VFIO_DEVICE_STATE_ERROR) ||
(pds_vfio->state == VFIO_DEVICE_STATE_RUNNING &&
pds_vfio_dirty_is_enabled(pds_vfio)))
- deferred_reset_needed = true;
+ pds_vfio_reset(pds_vfio, VFIO_DEVICE_STATE_ERROR);
mutex_unlock(&pds_vfio->state_mutex);
-
- /*
- * On the next user initiated state transition, the device will
- * transition to the VFIO_DEVICE_STATE_ERROR. At this point it's the user's
- * responsibility to reset the device.
- *
- * If a VFIO_DEVICE_RESET is requested post recovery and before the next
- * state transition, then the deferred reset state will be set to
- * VFIO_DEVICE_STATE_RUNNING.
- */
- if (deferred_reset_needed) {
- mutex_lock(&pds_vfio->reset_mutex);
- pds_vfio->deferred_reset = true;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR;
- mutex_unlock(&pds_vfio->reset_mutex);
- }
}
static int pds_vfio_pci_notify_handler(struct notifier_block *nb,
@@ -185,7 +166,9 @@ static void pds_vfio_pci_aer_reset_done(struct pci_dev *pdev)
{
struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
- pds_vfio_reset(pds_vfio);
+ mutex_lock(&pds_vfio->state_mutex);
+ pds_vfio_reset(pds_vfio, VFIO_DEVICE_STATE_RUNNING);
+ mutex_unlock(&pds_vfio->state_mutex);
}
static const struct pci_error_handlers pds_vfio_pci_err_handlers = {
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
index a286ebcc71..76a80ae708 100644
--- a/drivers/vfio/pci/pds/vfio_dev.c
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -26,37 +26,14 @@ struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev)
vfio_coredev);
}
-void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
+void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state state)
{
-again:
- mutex_lock(&pds_vfio->reset_mutex);
- if (pds_vfio->deferred_reset) {
- pds_vfio->deferred_reset = false;
- pds_vfio_put_restore_file(pds_vfio);
- pds_vfio_put_save_file(pds_vfio);
- if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
- pds_vfio_dirty_disable(pds_vfio, false);
- }
- pds_vfio->state = pds_vfio->deferred_reset_state;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
- mutex_unlock(&pds_vfio->reset_mutex);
- goto again;
- }
- mutex_unlock(&pds_vfio->state_mutex);
- mutex_unlock(&pds_vfio->reset_mutex);
-}
-
-void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
-{
- mutex_lock(&pds_vfio->reset_mutex);
- pds_vfio->deferred_reset = true;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
- if (!mutex_trylock(&pds_vfio->state_mutex)) {
- mutex_unlock(&pds_vfio->reset_mutex);
- return;
- }
- mutex_unlock(&pds_vfio->reset_mutex);
- pds_vfio_state_mutex_unlock(pds_vfio);
+ pds_vfio_put_restore_file(pds_vfio);
+ pds_vfio_put_save_file(pds_vfio);
+ if (state == VFIO_DEVICE_STATE_ERROR)
+ pds_vfio_dirty_disable(pds_vfio, false);
+ pds_vfio->state = state;
}
static struct file *
@@ -97,8 +74,7 @@ pds_vfio_set_device_state(struct vfio_device *vdev,
break;
}
}
- pds_vfio_state_mutex_unlock(pds_vfio);
- /* still waiting on a deferred_reset */
+ mutex_unlock(&pds_vfio->state_mutex);
if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR)
res = ERR_PTR(-EIO);
@@ -114,7 +90,7 @@ static int pds_vfio_get_device_state(struct vfio_device *vdev,
mutex_lock(&pds_vfio->state_mutex);
*current_state = pds_vfio->state;
- pds_vfio_state_mutex_unlock(pds_vfio);
+ mutex_unlock(&pds_vfio->state_mutex);
return 0;
}
@@ -156,7 +132,6 @@ static int pds_vfio_init_device(struct vfio_device *vdev)
pds_vfio->vf_id = vf_id;
mutex_init(&pds_vfio->state_mutex);
- mutex_init(&pds_vfio->reset_mutex);
vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
vdev->mig_ops = &pds_vfio_lm_ops;
@@ -178,7 +153,6 @@ static void pds_vfio_release_device(struct vfio_device *vdev)
vfio_coredev.vdev);
mutex_destroy(&pds_vfio->state_mutex);
- mutex_destroy(&pds_vfio->reset_mutex);
vfio_pci_core_release_dev(vdev);
}
@@ -194,7 +168,6 @@ static int pds_vfio_open_device(struct vfio_device *vdev)
return err;
pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);
diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
index e7b01080a1..803d99d69c 100644
--- a/drivers/vfio/pci/pds/vfio_dev.h
+++ b/drivers/vfio/pci/pds/vfio_dev.h
@@ -18,20 +18,16 @@ struct pds_vfio_pci_device {
struct pds_vfio_dirty dirty;
struct mutex state_mutex; /* protect migration state */
enum vfio_device_mig_state state;
- struct mutex reset_mutex; /* protect reset_done flow */
- u8 deferred_reset;
- enum vfio_device_mig_state deferred_reset_state;
struct notifier_block nb;
int vf_id;
u16 client_id;
};
-void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio);
-
const struct vfio_device_ops *pds_vfio_ops_info(void);
struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev);
-void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state state);
struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio);
struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio);
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 7e2e62ab08..97422aafaa 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -1966,3 +1966,45 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf,
return done;
}
+
+/**
+ * vfio_pci_core_range_intersect_range() - Determine overlap between a buffer
+ * and register offset ranges.
+ * @buf_start: start offset of the buffer
+ * @buf_cnt: number of buffer bytes
+ * @reg_start: start register offset
+ * @reg_cnt: number of register bytes
+ * @buf_offset: start offset of overlap in the buffer
+ * @intersect_count: number of overlapping bytes
+ * @register_offset: start offset of overlap in register
+ *
+ * Returns: true if there is overlap, false if not.
+ * The overlap start and size is returned through function args.
+ */
+bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt,
+ loff_t reg_start, size_t reg_cnt,
+ loff_t *buf_offset,
+ size_t *intersect_count,
+ size_t *register_offset)
+{
+ if (buf_start <= reg_start &&
+ buf_start + buf_cnt > reg_start) {
+ *buf_offset = reg_start - buf_start;
+ *intersect_count = min_t(size_t, reg_cnt,
+ buf_start + buf_cnt - reg_start);
+ *register_offset = 0;
+ return true;
+ }
+
+ if (buf_start > reg_start &&
+ buf_start < reg_start + reg_cnt) {
+ *buf_offset = 0;
+ *intersect_count = min_t(size_t, buf_cnt,
+ reg_start + reg_cnt - buf_start);
+ *register_offset = buf_start - reg_start;
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_range_intersect_range);
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1cbc990d42..d8c95cc16b 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -778,25 +778,26 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
}
struct vfio_pci_fill_info {
- struct vfio_pci_dependent_device __user *devices;
- struct vfio_pci_dependent_device __user *devices_end;
struct vfio_device *vdev;
+ struct vfio_pci_dependent_device *devices;
+ int nr_devices;
u32 count;
u32 flags;
};
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
- struct vfio_pci_dependent_device info = {
- .segment = pci_domain_nr(pdev->bus),
- .bus = pdev->bus->number,
- .devfn = pdev->devfn,
- };
+ struct vfio_pci_dependent_device *info;
struct vfio_pci_fill_info *fill = data;
- fill->count++;
- if (fill->devices >= fill->devices_end)
- return 0;
+ /* The topology changed since we counted devices */
+ if (fill->count >= fill->nr_devices)
+ return -EAGAIN;
+
+ info = &fill->devices[fill->count++];
+ info->segment = pci_domain_nr(pdev->bus);
+ info->bus = pdev->bus->number;
+ info->devfn = pdev->devfn;
if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
@@ -809,19 +810,19 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
*/
vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
if (!vdev) {
- info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+ info->devid = VFIO_PCI_DEVID_NOT_OWNED;
} else {
int id = vfio_iommufd_get_dev_id(vdev, iommufd);
if (id > 0)
- info.devid = id;
+ info->devid = id;
else if (id == -ENOENT)
- info.devid = VFIO_PCI_DEVID_OWNED;
+ info->devid = VFIO_PCI_DEVID_OWNED;
else
- info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+ info->devid = VFIO_PCI_DEVID_NOT_OWNED;
}
/* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
- if (info.devid == VFIO_PCI_DEVID_NOT_OWNED)
+ if (info->devid == VFIO_PCI_DEVID_NOT_OWNED)
fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
} else {
struct iommu_group *iommu_group;
@@ -830,13 +831,10 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
if (!iommu_group)
return -EPERM; /* Cannot reset non-isolated devices */
- info.group_id = iommu_group_id(iommu_group);
+ info->group_id = iommu_group_id(iommu_group);
iommu_group_put(iommu_group);
}
- if (copy_to_user(fill->devices, &info, sizeof(info)))
- return -EFAULT;
- fill->devices++;
return 0;
}
@@ -1258,10 +1256,11 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
{
unsigned long minsz =
offsetofend(struct vfio_pci_hot_reset_info, count);
+ struct vfio_pci_dependent_device *devices = NULL;
struct vfio_pci_hot_reset_info hdr;
struct vfio_pci_fill_info fill = {};
bool slot = false;
- int ret = 0;
+ int ret, count;
if (copy_from_user(&hdr, arg, minsz))
return -EFAULT;
@@ -1277,9 +1276,23 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
else if (pci_probe_reset_bus(vdev->pdev->bus))
return -ENODEV;
- fill.devices = arg->devices;
- fill.devices_end = arg->devices +
- (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]);
+ ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
+ &count, slot);
+ if (ret)
+ return ret;
+
+ if (count > (hdr.argsz - sizeof(hdr)) / sizeof(*devices)) {
+ hdr.count = count;
+ ret = -ENOSPC;
+ goto header;
+ }
+
+ devices = kcalloc(count, sizeof(*devices), GFP_KERNEL);
+ if (!devices)
+ return -ENOMEM;
+
+ fill.devices = devices;
+ fill.nr_devices = count;
fill.vdev = &vdev->vdev;
if (vfio_device_cdev_opened(&vdev->vdev))
@@ -1291,16 +1304,23 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
&fill, slot);
mutex_unlock(&vdev->vdev.dev_set->lock);
if (ret)
- return ret;
+ goto out;
+
+ if (copy_to_user(arg->devices, devices,
+ sizeof(*devices) * fill.count)) {
+ ret = -EFAULT;
+ goto out;
+ }
hdr.count = fill.count;
hdr.flags = fill.flags;
- if (copy_to_user(arg, &hdr, minsz))
- return -EFAULT;
- if (fill.count > fill.devices - arg->devices)
- return -ENOSPC;
- return 0;
+header:
+ if (copy_to_user(arg, &hdr, minsz))
+ ret = -EFAULT;
+out:
+ kfree(devices);
+ return ret;
}
static int
@@ -1862,8 +1882,25 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
/*
* See remap_pfn_range(), called from vfio_pci_fault() but we can't
* change vm_flags within the fault handler. Set them now.
+ *
+ * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
+ * allowing KVM stage 2 device mapping attributes to use Normal-NC
+ * rather than DEVICE_nGnRE, which allows guest mappings
+ * supporting write-combining attributes (WC). ARM does not
+ * architecturally guarantee this is safe, and indeed some MMIO
+ * regions like the GICv2 VCPU interface can trigger uncontained
+ * faults if Normal-NC is used.
+ *
+ * To safely use VFIO in KVM the platform must guarantee full
+ * safety in the guest where no action taken against a MMIO
+ * mapping can trigger an uncontained failure. The assumption is
+ * that most VFIO PCI platforms support this for both mapping types,
+ * at least in common flows, based on some expectations of how
+ * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in
+ * the VMA flags.
*/
- vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+ vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+ VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &vfio_pci_mmap_ops;
return 0;
@@ -2047,6 +2084,7 @@ static int vfio_pci_bus_notifier(struct notifier_block *nb,
pci_name(pdev));
pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
vdev->vdev.ops->name);
+ WARN_ON(!pdev->driver_override);
} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
pdev->is_virtfn && physfn == vdev->pdev) {
struct pci_driver *drv = pci_dev_driver(pdev);
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index fb5392b749..e80c5d75b5 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -277,8 +277,10 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
return -ENOMEM;
ctx = vfio_irq_ctx_alloc(vdev, 0);
- if (!ctx)
+ if (!ctx) {
+ kfree(name);
return -ENOMEM;
+ }
ctx->name = name;
ctx->trigger = trigger;
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 07fea08ea8..03b8f7ada1 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -96,10 +96,10 @@ VFIO_IOREAD(32)
* reads with -1. This is intended for handling MSI-X vector tables and
* leftover space for ROM BARs.
*/
-static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
- void __iomem *io, char __user *buf,
- loff_t off, size_t count, size_t x_start,
- size_t x_end, bool iswrite)
+ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
+ void __iomem *io, char __user *buf,
+ loff_t off, size_t count, size_t x_start,
+ size_t x_end, bool iswrite)
{
ssize_t done = 0;
int ret;
@@ -201,6 +201,7 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
return done;
}
+EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw);
int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar)
{
@@ -279,8 +280,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
x_end = vdev->msix_offset + vdev->msix_size;
}
- done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos,
- count, x_start, x_end, iswrite);
+ done = vfio_pci_core_do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos,
+ count, x_start, x_end, iswrite);
if (done >= 0)
*ppos += done;
@@ -348,7 +349,8 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
* probing, so we don't currently worry about access in relation
* to the memory enable bit in the command register.
*/
- done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite);
+ done = vfio_pci_core_do_io_rw(vdev, false, iomem, buf, off, count,
+ 0, 0, iswrite);
vga_put(vdev->pdev, rsrc);
diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c
index d5af683837..b5d3a8c5bb 100644
--- a/drivers/vfio/pci/virtio/main.c
+++ b/drivers/vfio/pci/virtio/main.c
@@ -132,33 +132,6 @@ end:
return ret ? ret : count;
}
-static bool range_intersect_range(loff_t range1_start, size_t count1,
- loff_t range2_start, size_t count2,
- loff_t *start_offset,
- size_t *intersect_count,
- size_t *register_offset)
-{
- if (range1_start <= range2_start &&
- range1_start + count1 > range2_start) {
- *start_offset = range2_start - range1_start;
- *intersect_count = min_t(size_t, count2,
- range1_start + count1 - range2_start);
- *register_offset = 0;
- return true;
- }
-
- if (range1_start > range2_start &&
- range1_start < range2_start + count2) {
- *start_offset = 0;
- *intersect_count = min_t(size_t, count1,
- range2_start + count2 - range1_start);
- *register_offset = range1_start - range2_start;
- return true;
- }
-
- return false;
-}
-
static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev,
char __user *buf, size_t count,
loff_t *ppos)
@@ -178,16 +151,18 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev,
if (ret < 0)
return ret;
- if (range_intersect_range(pos, count, PCI_DEVICE_ID, sizeof(val16),
- &copy_offset, &copy_count, &register_offset)) {
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET);
if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count))
return -EFAULT;
}
if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) &&
- range_intersect_range(pos, count, PCI_COMMAND, sizeof(val16),
- &copy_offset, &copy_count, &register_offset)) {
+ vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset,
copy_count))
return -EFAULT;
@@ -197,16 +172,18 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev,
return -EFAULT;
}
- if (range_intersect_range(pos, count, PCI_REVISION_ID, sizeof(val8),
- &copy_offset, &copy_count, &register_offset)) {
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID,
+ sizeof(val8), &copy_offset,
+ &copy_count, &register_offset)) {
/* Transional needs to have revision 0 */
val8 = 0;
if (copy_to_user(buf + copy_offset, &val8, copy_count))
return -EFAULT;
}
- if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, sizeof(val32),
- &copy_offset, &copy_count, &register_offset)) {
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
+ sizeof(val32), &copy_offset,
+ &copy_count, &register_offset)) {
u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1);
u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0);
@@ -215,8 +192,9 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev,
return -EFAULT;
}
- if (range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, sizeof(val16),
- &copy_offset, &copy_count, &register_offset)) {
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
/*
* Transitional devices use the PCI subsystem device id as
* virtio device id, same as legacy driver always did.
@@ -227,8 +205,9 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev,
return -EFAULT;
}
- if (range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, sizeof(val16),
- &copy_offset, &copy_count, &register_offset)) {
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET);
if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
copy_count))
@@ -270,19 +249,20 @@ static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev,
loff_t copy_offset;
size_t copy_count;
- if (range_intersect_range(pos, count, PCI_COMMAND, sizeof(virtvdev->pci_cmd),
- &copy_offset, &copy_count,
- &register_offset)) {
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
+ sizeof(virtvdev->pci_cmd),
+ &copy_offset, &copy_count,
+ &register_offset)) {
if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset,
buf + copy_offset,
copy_count))
return -EFAULT;
}
- if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
- sizeof(virtvdev->pci_base_addr_0),
- &copy_offset, &copy_count,
- &register_offset)) {
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
+ sizeof(virtvdev->pci_base_addr_0),
+ &copy_offset, &copy_count,
+ &register_offset)) {
if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset,
buf + copy_offset,
copy_count))