summaryrefslogtreecommitdiffstats
path: root/drivers/vfio/pci/mlx5
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
commitace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
treeb2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/vfio/pci/mlx5
parentInitial commit. (diff)
downloadlinux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz
linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/vfio/pci/mlx5')
-rw-r--r--drivers/vfio/pci/mlx5/Kconfig10
-rw-r--r--drivers/vfio/pci/mlx5/Makefile4
-rw-r--r--drivers/vfio/pci/mlx5/cmd.c1624
-rw-r--r--drivers/vfio/pci/mlx5/cmd.h225
-rw-r--r--drivers/vfio/pci/mlx5/main.c1383
5 files changed, 3246 insertions, 0 deletions
diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig
new file mode 100644
index 0000000000..7088edc4fb
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config MLX5_VFIO_PCI
+ tristate "VFIO support for MLX5 PCI devices"
+ depends on MLX5_CORE
+ select VFIO_PCI_CORE
+ help
+ This provides migration support for MLX5 devices using the VFIO
+ framework.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/mlx5/Makefile b/drivers/vfio/pci/mlx5/Makefile
new file mode 100644
index 0000000000..689627da7f
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5-vfio-pci.o
+mlx5-vfio-pci-y := main.o cmd.o
+
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
new file mode 100644
index 0000000000..33574b0447
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -0,0 +1,1624 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "cmd.h"
+
+enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
+
+static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
+{
+ int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+ void *query_cap = NULL, *cap;
+ int ret;
+
+ query_cap = kzalloc(query_sz, GFP_KERNEL);
+ if (!query_cap)
+ return -ENOMEM;
+
+ ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
+ MLX5_CAP_GENERAL_2);
+ if (ret)
+ goto out;
+
+ cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
+ if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
+ ret = -EOPNOTSUPP;
+out:
+ kfree(query_cap);
+ return ret;
+}
+
+static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
+ u16 *vhca_id);
+static void
+_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
+
+int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
+{
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+ u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
+ int err;
+
+ lockdep_assert_held(&mvdev->state_mutex);
+ if (mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ /*
+ * In case PRE_COPY is used, saving_migf is exposed while the device is
+ * running. Make sure to run only once there is no active save command.
+ * Running both in parallel, might end-up with a failure in the save
+ * command once it will try to turn on 'tracking' on a suspended device.
+ */
+ if (migf) {
+ err = wait_for_completion_interruptible(&migf->save_comp);
+ if (err)
+ return err;
+ }
+
+ MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
+ MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
+ MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
+
+ err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
+ if (migf)
+ complete(&migf->save_comp);
+
+ return err;
+}
+
+int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
+{
+ u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
+
+ lockdep_assert_held(&mvdev->state_mutex);
+ if (mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
+ MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
+ MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
+
+ return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
+}
+
+int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
+ size_t *state_size, u8 query_flags)
+{
+ u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
+ bool inc = query_flags & MLX5VF_QUERY_INC;
+ int ret;
+
+ lockdep_assert_held(&mvdev->state_mutex);
+ if (mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ /*
+ * In case PRE_COPY is used, saving_migf is exposed while device is
+ * running. Make sure to run only once there is no active save command.
+ * Running both in parallel, might end-up with a failure in the
+ * incremental query command on un-tracked vhca.
+ */
+ if (inc) {
+ ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
+ if (ret)
+ return ret;
+ if (mvdev->saving_migf->state ==
+ MLX5_MIGF_STATE_PRE_COPY_ERROR) {
+ /*
+ * In case we had a PRE_COPY error, only query full
+ * image for final image
+ */
+ if (!(query_flags & MLX5VF_QUERY_FINAL)) {
+ *state_size = 0;
+ complete(&mvdev->saving_migf->save_comp);
+ return 0;
+ }
+ query_flags &= ~MLX5VF_QUERY_INC;
+ }
+ }
+
+ MLX5_SET(query_vhca_migration_state_in, in, opcode,
+ MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
+ MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
+ MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
+ MLX5_SET(query_vhca_migration_state_in, in, incremental,
+ query_flags & MLX5VF_QUERY_INC);
+
+ ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
+ out);
+ if (inc)
+ complete(&mvdev->saving_migf->save_comp);
+
+ if (ret)
+ return ret;
+
+ *state_size = MLX5_GET(query_vhca_migration_state_out, out,
+ required_umem_size);
+ return 0;
+}
+
+static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
+{
+ /* Mark the tracker under an error and wake it up if it's running */
+ mvdev->tracker.is_err = true;
+ complete(&mvdev->tracker_comp);
+}
+
+static int mlx5fv_vf_event(struct notifier_block *nb,
+ unsigned long event, void *data)
+{
+ struct mlx5vf_pci_core_device *mvdev =
+ container_of(nb, struct mlx5vf_pci_core_device, nb);
+
+ switch (event) {
+ case MLX5_PF_NOTIFY_ENABLE_VF:
+ mutex_lock(&mvdev->state_mutex);
+ mvdev->mdev_detach = false;
+ mlx5vf_state_mutex_unlock(mvdev);
+ break;
+ case MLX5_PF_NOTIFY_DISABLE_VF:
+ mlx5vf_cmd_close_migratable(mvdev);
+ mutex_lock(&mvdev->state_mutex);
+ mvdev->mdev_detach = true;
+ mlx5vf_state_mutex_unlock(mvdev);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
+{
+ if (!mvdev->migrate_cap)
+ return;
+
+ /* Must be done outside the lock to let it progress */
+ set_tracker_error(mvdev);
+ mutex_lock(&mvdev->state_mutex);
+ mlx5vf_disable_fds(mvdev);
+ _mlx5vf_free_page_tracker_resources(mvdev);
+ mlx5vf_state_mutex_unlock(mvdev);
+}
+
+void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
+{
+ if (!mvdev->migrate_cap)
+ return;
+
+ mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
+ &mvdev->nb);
+ destroy_workqueue(mvdev->cb_wq);
+}
+
+void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
+ const struct vfio_migration_ops *mig_ops,
+ const struct vfio_log_ops *log_ops)
+{
+ struct pci_dev *pdev = mvdev->core_device.pdev;
+ int ret;
+
+ if (!pdev->is_virtfn)
+ return;
+
+ mvdev->mdev = mlx5_vf_get_core_dev(pdev);
+ if (!mvdev->mdev)
+ return;
+
+ if (!MLX5_CAP_GEN(mvdev->mdev, migration))
+ goto end;
+
+ mvdev->vf_id = pci_iov_vf_id(pdev);
+ if (mvdev->vf_id < 0)
+ goto end;
+
+ ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
+ if (ret)
+ goto end;
+
+ if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
+ &mvdev->vhca_id))
+ goto end;
+
+ mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
+ if (!mvdev->cb_wq)
+ goto end;
+
+ mutex_init(&mvdev->state_mutex);
+ spin_lock_init(&mvdev->reset_lock);
+ mvdev->nb.notifier_call = mlx5fv_vf_event;
+ ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
+ &mvdev->nb);
+ if (ret) {
+ destroy_workqueue(mvdev->cb_wq);
+ goto end;
+ }
+
+ mvdev->migrate_cap = 1;
+ mvdev->core_device.vdev.migration_flags =
+ VFIO_MIGRATION_STOP_COPY |
+ VFIO_MIGRATION_P2P;
+ mvdev->core_device.vdev.mig_ops = mig_ops;
+ init_completion(&mvdev->tracker_comp);
+ if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
+ mvdev->core_device.vdev.log_ops = log_ops;
+
+ if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
+ MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
+ mvdev->core_device.vdev.migration_flags |=
+ VFIO_MIGRATION_PRE_COPY;
+
+end:
+ mlx5_vf_put_core_dev(mvdev->mdev);
+}
+
+static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
+ u16 *vhca_id)
+{
+ u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
+ int out_size;
+ void *out;
+ int ret;
+
+ out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+ out = kzalloc(out_size, GFP_KERNEL);
+ if (!out)
+ return -ENOMEM;
+
+ MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+ MLX5_SET(query_hca_cap_in, in, other_function, 1);
+ MLX5_SET(query_hca_cap_in, in, function_id, function_id);
+ MLX5_SET(query_hca_cap_in, in, op_mod,
+ MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
+ HCA_CAP_OPMOD_GET_CUR);
+
+ ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
+ if (ret)
+ goto err_exec;
+
+ *vhca_id = MLX5_GET(query_hca_cap_out, out,
+ capability.cmd_hca_cap.vhca_id);
+
+err_exec:
+ kfree(out);
+ return ret;
+}
+
+static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
+ struct mlx5_vhca_data_buffer *buf,
+ struct mlx5_vhca_recv_buf *recv_buf,
+ u32 *mkey)
+{
+ size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
+ recv_buf->npages;
+ int err = 0, inlen;
+ __be64 *mtt;
+ void *mkc;
+ u32 *in;
+
+ inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
+ sizeof(*mtt) * round_up(npages, 2);
+
+ in = kvzalloc(inlen, GFP_KERNEL);
+ if (!in)
+ return -ENOMEM;
+
+ MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
+ DIV_ROUND_UP(npages, 2));
+ mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+
+ if (buf) {
+ struct sg_dma_page_iter dma_iter;
+
+ for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
+ *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
+ } else {
+ int i;
+
+ for (i = 0; i < npages; i++)
+ *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
+ }
+
+ mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+ MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+ MLX5_SET(mkc, mkc, lr, 1);
+ MLX5_SET(mkc, mkc, lw, 1);
+ MLX5_SET(mkc, mkc, rr, 1);
+ MLX5_SET(mkc, mkc, rw, 1);
+ MLX5_SET(mkc, mkc, pd, pdn);
+ MLX5_SET(mkc, mkc, bsf_octword_size, 0);
+ MLX5_SET(mkc, mkc, qpn, 0xffffff);
+ MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
+ MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
+ MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
+ err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
+ kvfree(in);
+ return err;
+}
+
+static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+ struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
+ struct mlx5_core_dev *mdev = mvdev->mdev;
+ int ret;
+
+ lockdep_assert_held(&mvdev->state_mutex);
+ if (mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ if (buf->dmaed || !buf->allocated_length)
+ return -EINVAL;
+
+ ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+ if (ret)
+ return ret;
+
+ ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
+ if (ret)
+ goto err;
+
+ buf->dmaed = true;
+
+ return 0;
+err:
+ dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+ return ret;
+}
+
+void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+ struct mlx5_vf_migration_file *migf = buf->migf;
+ struct sg_page_iter sg_iter;
+
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ WARN_ON(migf->mvdev->mdev_detach);
+
+ if (buf->dmaed) {
+ mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
+ dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
+ buf->dma_dir, 0);
+ }
+
+ /* Undo alloc_pages_bulk_array() */
+ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
+ __free_page(sg_page_iter_page(&sg_iter));
+ sg_free_append_table(&buf->table);
+ kfree(buf);
+}
+
+struct mlx5_vhca_data_buffer *
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length,
+ enum dma_data_direction dma_dir)
+{
+ struct mlx5_vhca_data_buffer *buf;
+ int ret;
+
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ buf->dma_dir = dma_dir;
+ buf->migf = migf;
+ if (length) {
+ ret = mlx5vf_add_migration_pages(buf,
+ DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+ if (ret)
+ goto end;
+
+ if (dma_dir != DMA_NONE) {
+ ret = mlx5vf_dma_data_buffer(buf);
+ if (ret)
+ goto end;
+ }
+ }
+
+ return buf;
+end:
+ mlx5vf_free_data_buffer(buf);
+ return ERR_PTR(ret);
+}
+
+void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+ spin_lock_irq(&buf->migf->list_lock);
+ list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
+ spin_unlock_irq(&buf->migf->list_lock);
+}
+
+struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length, enum dma_data_direction dma_dir)
+{
+ struct mlx5_vhca_data_buffer *buf, *temp_buf;
+ struct list_head free_list;
+
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ if (migf->mvdev->mdev_detach)
+ return ERR_PTR(-ENOTCONN);
+
+ INIT_LIST_HEAD(&free_list);
+
+ spin_lock_irq(&migf->list_lock);
+ list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
+ if (buf->dma_dir == dma_dir) {
+ list_del_init(&buf->buf_elm);
+ if (buf->allocated_length >= length) {
+ spin_unlock_irq(&migf->list_lock);
+ goto found;
+ }
+ /*
+ * Prevent holding redundant buffers. Put in a free
+ * list and call at the end not under the spin lock
+ * (&migf->list_lock) to mlx5vf_free_data_buffer which
+ * might sleep.
+ */
+ list_add(&buf->buf_elm, &free_list);
+ }
+ }
+ spin_unlock_irq(&migf->list_lock);
+ buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
+
+found:
+ while ((temp_buf = list_first_entry_or_null(&free_list,
+ struct mlx5_vhca_data_buffer, buf_elm))) {
+ list_del(&temp_buf->buf_elm);
+ mlx5vf_free_data_buffer(temp_buf);
+ }
+
+ return buf;
+}
+
+void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
+{
+ struct mlx5vf_async_data *async_data = container_of(_work,
+ struct mlx5vf_async_data, work);
+ struct mlx5_vf_migration_file *migf = container_of(async_data,
+ struct mlx5_vf_migration_file, async_data);
+
+ mutex_lock(&migf->lock);
+ if (async_data->status) {
+ mlx5vf_put_data_buffer(async_data->buf);
+ if (async_data->header_buf)
+ mlx5vf_put_data_buffer(async_data->header_buf);
+ if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
+ migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
+ else
+ migf->state = MLX5_MIGF_STATE_ERROR;
+ wake_up_interruptible(&migf->poll_wait);
+ }
+ mutex_unlock(&migf->lock);
+ kvfree(async_data->out);
+ complete(&migf->save_comp);
+ fput(migf->filp);
+}
+
+static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
+ size_t image_size, bool initial_pre_copy)
+{
+ struct mlx5_vf_migration_file *migf = header_buf->migf;
+ struct mlx5_vf_migration_header header = {};
+ unsigned long flags;
+ struct page *page;
+ u8 *to_buff;
+
+ header.record_size = cpu_to_le64(image_size);
+ header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
+ header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
+ page = mlx5vf_get_migration_page(header_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ memcpy(to_buff, &header, sizeof(header));
+ kunmap_local(to_buff);
+ header_buf->length = sizeof(header);
+ header_buf->start_pos = header_buf->migf->max_pos;
+ migf->max_pos += header_buf->length;
+ spin_lock_irqsave(&migf->list_lock, flags);
+ list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+ spin_unlock_irqrestore(&migf->list_lock, flags);
+ if (initial_pre_copy)
+ migf->pre_copy_initial_bytes += sizeof(header);
+ return 0;
+}
+
+static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
+{
+ struct mlx5vf_async_data *async_data = container_of(context,
+ struct mlx5vf_async_data, cb_work);
+ struct mlx5_vf_migration_file *migf = container_of(async_data,
+ struct mlx5_vf_migration_file, async_data);
+
+ if (!status) {
+ size_t image_size;
+ unsigned long flags;
+ bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
+ !async_data->last_chunk;
+
+ image_size = MLX5_GET(save_vhca_state_out, async_data->out,
+ actual_image_size);
+ if (async_data->header_buf) {
+ status = add_buf_header(async_data->header_buf, image_size,
+ initial_pre_copy);
+ if (status)
+ goto err;
+ }
+ async_data->buf->length = image_size;
+ async_data->buf->start_pos = migf->max_pos;
+ migf->max_pos += async_data->buf->length;
+ spin_lock_irqsave(&migf->list_lock, flags);
+ list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
+ spin_unlock_irqrestore(&migf->list_lock, flags);
+ if (initial_pre_copy)
+ migf->pre_copy_initial_bytes += image_size;
+ migf->state = async_data->last_chunk ?
+ MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
+ wake_up_interruptible(&migf->poll_wait);
+ }
+
+err:
+ /*
+ * The error and the cleanup flows can't run from an
+ * interrupt context
+ */
+ if (status == -EREMOTEIO)
+ status = MLX5_GET(save_vhca_state_out, async_data->out, status);
+ async_data->status = status;
+ queue_work(migf->mvdev->cb_wq, &async_data->work);
+}
+
+int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf, bool inc,
+ bool track)
+{
+ u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
+ u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
+ struct mlx5_vhca_data_buffer *header_buf = NULL;
+ struct mlx5vf_async_data *async_data;
+ int err;
+
+ lockdep_assert_held(&mvdev->state_mutex);
+ if (mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ err = wait_for_completion_interruptible(&migf->save_comp);
+ if (err)
+ return err;
+
+ if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
+ /*
+ * In case we had a PRE_COPY error, SAVE is triggered only for
+ * the final image, read device full image.
+ */
+ inc = false;
+
+ MLX5_SET(save_vhca_state_in, in, opcode,
+ MLX5_CMD_OP_SAVE_VHCA_STATE);
+ MLX5_SET(save_vhca_state_in, in, op_mod, 0);
+ MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
+ MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
+ MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
+ MLX5_SET(save_vhca_state_in, in, incremental, inc);
+ MLX5_SET(save_vhca_state_in, in, set_track, track);
+
+ async_data = &migf->async_data;
+ async_data->buf = buf;
+ async_data->last_chunk = !track;
+ async_data->out = kvzalloc(out_size, GFP_KERNEL);
+ if (!async_data->out) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ if (async_data->last_chunk && migf->buf_header) {
+ header_buf = migf->buf_header;
+ migf->buf_header = NULL;
+ } else {
+ header_buf = mlx5vf_get_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(header_buf)) {
+ err = PTR_ERR(header_buf);
+ goto err_free;
+ }
+ }
+ }
+
+ if (async_data->last_chunk)
+ migf->state = MLX5_MIGF_STATE_SAVE_LAST;
+
+ async_data->header_buf = header_buf;
+ get_file(migf->filp);
+ err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
+ async_data->out,
+ out_size, mlx5vf_save_callback,
+ &async_data->cb_work);
+ if (err)
+ goto err_exec;
+
+ return 0;
+
+err_exec:
+ if (header_buf)
+ mlx5vf_put_data_buffer(header_buf);
+ fput(migf->filp);
+err_free:
+ kvfree(async_data->out);
+err_out:
+ complete(&migf->save_comp);
+ return err;
+}
+
+int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf)
+{
+ u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
+ int err;
+
+ lockdep_assert_held(&mvdev->state_mutex);
+ if (mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ if (!buf->dmaed) {
+ err = mlx5vf_dma_data_buffer(buf);
+ if (err)
+ return err;
+ }
+
+ MLX5_SET(load_vhca_state_in, in, opcode,
+ MLX5_CMD_OP_LOAD_VHCA_STATE);
+ MLX5_SET(load_vhca_state_in, in, op_mod, 0);
+ MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
+ MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
+ MLX5_SET(load_vhca_state_in, in, size, buf->length);
+ return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
+}
+
+int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
+{
+ int err;
+
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ if (migf->mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
+ return err;
+}
+
+void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
+{
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ if (migf->mvdev->mdev_detach)
+ return;
+
+ mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
+}
+
+void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
+{
+ struct mlx5_vhca_data_buffer *entry;
+
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ WARN_ON(migf->mvdev->mdev_detach);
+
+ if (migf->buf) {
+ mlx5vf_free_data_buffer(migf->buf);
+ migf->buf = NULL;
+ }
+
+ if (migf->buf_header) {
+ mlx5vf_free_data_buffer(migf->buf_header);
+ migf->buf_header = NULL;
+ }
+
+ list_splice(&migf->avail_list, &migf->buf_list);
+
+ while ((entry = list_first_entry_or_null(&migf->buf_list,
+ struct mlx5_vhca_data_buffer, buf_elm))) {
+ list_del(&entry->buf_elm);
+ mlx5vf_free_data_buffer(entry);
+ }
+
+ mlx5vf_cmd_dealloc_pd(migf);
+}
+
+static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
+ struct mlx5vf_pci_core_device *mvdev,
+ struct rb_root_cached *ranges, u32 nnodes)
+{
+ int max_num_range =
+ MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
+ struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
+ int record_size = MLX5_ST_SZ_BYTES(page_track_range);
+ u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+ struct interval_tree_node *node = NULL;
+ u64 total_ranges_len = 0;
+ u32 num_ranges = nnodes;
+ u8 log_addr_space_size;
+ void *range_list_ptr;
+ void *obj_context;
+ void *cmd_hdr;
+ int inlen;
+ void *in;
+ int err;
+ int i;
+
+ if (num_ranges > max_num_range) {
+ vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
+ num_ranges = max_num_range;
+ }
+
+ inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
+ record_size * num_ranges;
+ in = kzalloc(inlen, GFP_KERNEL);
+ if (!in)
+ return -ENOMEM;
+
+ cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
+ general_obj_in_cmd_hdr);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
+ MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
+ MLX5_OBJ_TYPE_PAGE_TRACK);
+ obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
+ MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
+ MLX5_SET(page_track, obj_context, track_type, 1);
+ MLX5_SET(page_track, obj_context, log_page_size,
+ ilog2(tracker->host_qp->tracked_page_size));
+ MLX5_SET(page_track, obj_context, log_msg_size,
+ ilog2(tracker->host_qp->max_msg_size));
+ MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
+ MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
+
+ range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
+ node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
+ for (i = 0; i < num_ranges; i++) {
+ void *addr_range_i_base = range_list_ptr + record_size * i;
+ unsigned long length = node->last - node->start + 1;
+
+ MLX5_SET64(page_track_range, addr_range_i_base, start_address,
+ node->start);
+ MLX5_SET64(page_track_range, addr_range_i_base, length, length);
+ total_ranges_len += length;
+ node = interval_tree_iter_next(node, 0, ULONG_MAX);
+ }
+
+ WARN_ON(node);
+ log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
+ if (log_addr_space_size <
+ (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
+ log_addr_space_size >
+ (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ MLX5_SET(page_track, obj_context, log_addr_space_size,
+ log_addr_space_size);
+ err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
+ if (err)
+ goto out;
+
+ tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+out:
+ kfree(in);
+ return err;
+}
+
+static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
+ u32 tracker_id)
+{
+ u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+ u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+
+ MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+ MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
+ MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
+
+ return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
+ u32 tracker_id, unsigned long iova,
+ unsigned long length, u32 tracker_state)
+{
+ u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
+ u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+ void *obj_context;
+ void *cmd_hdr;
+
+ cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
+ MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
+
+ obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
+ MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
+ MLX5_SET64(page_track, obj_context, range_start_address, iova);
+ MLX5_SET64(page_track, obj_context, length, length);
+ MLX5_SET(page_track, obj_context, state, tracker_state);
+
+ return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_cq_buf *buf, int nent,
+ int cqe_size)
+{
+ struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
+ u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
+ u8 log_wq_sz = ilog2(cqe_size);
+ int err;
+
+ err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
+ mdev->priv.numa_node);
+ if (err)
+ return err;
+
+ mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
+ buf->cqe_size = cqe_size;
+ buf->nent = nent;
+ return 0;
+}
+
+static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
+{
+ struct mlx5_cqe64 *cqe64;
+ void *cqe;
+ int i;
+
+ for (i = 0; i < buf->nent; i++) {
+ cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
+ cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
+ cqe64->op_own = MLX5_CQE_INVALID << 4;
+ }
+}
+
+static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_cq *cq)
+{
+ mlx5_core_destroy_cq(mdev, &cq->mcq);
+ mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
+ mlx5_db_free(mdev, &cq->db);
+}
+
+static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
+{
+ if (type != MLX5_EVENT_TYPE_CQ_ERROR)
+ return;
+
+ set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
+ tracker.cq.mcq));
+}
+
+static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
+ void *data)
+{
+ struct mlx5_vhca_page_tracker *tracker =
+ mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ tracker, struct mlx5vf_pci_core_device, tracker);
+ struct mlx5_eqe *eqe = data;
+ u8 event_type = (u8)type;
+ u8 queue_type;
+ int qp_num;
+
+ switch (event_type) {
+ case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+ case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+ case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+ queue_type = eqe->data.qp_srq.type;
+ if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
+ break;
+ qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+ if (qp_num != tracker->host_qp->qpn &&
+ qp_num != tracker->fw_qp->qpn)
+ break;
+ set_tracker_error(mvdev);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
+ struct mlx5_eqe *eqe)
+{
+ struct mlx5vf_pci_core_device *mvdev =
+ container_of(mcq, struct mlx5vf_pci_core_device,
+ tracker.cq.mcq);
+
+ complete(&mvdev->tracker_comp);
+}
+
+static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_page_tracker *tracker,
+ size_t ncqe)
+{
+ int cqe_size = cache_line_size() == 128 ? 128 : 64;
+ u32 out[MLX5_ST_SZ_DW(create_cq_out)];
+ struct mlx5_vhca_cq *cq;
+ int inlen, err, eqn;
+ void *cqc, *in;
+ __be64 *pas;
+ int vector;
+
+ cq = &tracker->cq;
+ ncqe = roundup_pow_of_two(ncqe);
+ err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
+ if (err)
+ return err;
+
+ cq->ncqe = ncqe;
+ cq->mcq.set_ci_db = cq->db.db;
+ cq->mcq.arm_db = cq->db.db + 1;
+ cq->mcq.cqe_sz = cqe_size;
+ err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
+ if (err)
+ goto err_db_free;
+
+ init_cq_frag_buf(&cq->buf);
+ inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
+ MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
+ cq->buf.frag_buf.npages;
+ in = kvzalloc(inlen, GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_buff;
+ }
+
+ vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
+ err = mlx5_comp_eqn_get(mdev, vector, &eqn);
+ if (err)
+ goto err_vec;
+
+ cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+ MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
+ MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
+ MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
+ MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
+ MLX5_ADAPTER_PAGE_SHIFT);
+ MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
+ pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
+ mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
+ cq->mcq.comp = mlx5vf_cq_complete;
+ cq->mcq.event = mlx5vf_cq_event;
+ err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
+ if (err)
+ goto err_vec;
+
+ mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
+ cq->mcq.cons_index);
+ kvfree(in);
+ return 0;
+
+err_vec:
+ kvfree(in);
+err_buff:
+ mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
+err_db_free:
+ mlx5_db_free(mdev, &cq->db);
+ return err;
+}
+
+static struct mlx5_vhca_qp *
+mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
+{
+ u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
+ struct mlx5_vhca_qp *qp;
+ u8 log_rq_stride;
+ u8 log_rq_sz;
+ void *qpc;
+ int inlen;
+ void *in;
+ int err;
+
+ qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
+ if (err)
+ goto err_free;
+
+ if (max_recv_wr) {
+ qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
+ log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
+ log_rq_sz = ilog2(qp->rq.wqe_cnt);
+ err = mlx5_frag_buf_alloc_node(mdev,
+ wq_get_byte_sz(log_rq_sz, log_rq_stride),
+ &qp->buf, mdev->priv.numa_node);
+ if (err)
+ goto err_db_free;
+ mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
+ }
+
+ qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
+ inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
+ MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
+ qp->buf.npages;
+ in = kvzalloc(inlen, GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_in;
+ }
+
+ qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+ MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
+ MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+ MLX5_SET(qpc, qpc, pd, tracker->pdn);
+ MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
+ MLX5_SET(qpc, qpc, log_page_size,
+ qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+ MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
+ if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
+ MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
+ MLX5_SET(qpc, qpc, no_sq, 1);
+ if (max_recv_wr) {
+ MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
+ MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
+ MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
+ MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
+ MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
+ mlx5_fill_page_frag_array(&qp->buf,
+ (__be64 *)MLX5_ADDR_OF(create_qp_in,
+ in, pas));
+ } else {
+ MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
+ }
+
+ MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
+ err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
+ kvfree(in);
+ if (err)
+ goto err_in;
+
+ qp->qpn = MLX5_GET(create_qp_out, out, qpn);
+ return qp;
+
+err_in:
+ if (max_recv_wr)
+ mlx5_frag_buf_free(mdev, &qp->buf);
+err_db_free:
+ mlx5_db_free(mdev, &qp->db);
+err_free:
+ kfree(qp);
+ return ERR_PTR(err);
+}
+
+static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
+{
+ struct mlx5_wqe_data_seg *data;
+ unsigned int ix;
+
+ WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
+ ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
+ data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
+ data->byte_count = cpu_to_be32(qp->max_msg_size);
+ data->lkey = cpu_to_be32(qp->recv_buf.mkey);
+ data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
+ qp->rq.pc++;
+ /* Make sure that descriptors are written before doorbell record. */
+ dma_wmb();
+ *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
+}
+
+static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_qp *qp, u32 remote_qpn,
+ bool host_qp)
+{
+ u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
+ u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
+ u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
+ void *qpc;
+ int ret;
+
+ /* Init */
+ qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
+ MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
+ MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
+ MLX5_SET(qpc, qpc, rre, 1);
+ MLX5_SET(qpc, qpc, rwe, 1);
+ MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
+ MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
+ ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
+ if (ret)
+ return ret;
+
+ if (host_qp) {
+ struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
+ int i;
+
+ for (i = 0; i < qp->rq.wqe_cnt; i++) {
+ mlx5vf_post_recv(qp);
+ recv_buf->next_rq_offset += qp->max_msg_size;
+ }
+ }
+
+ /* RTR */
+ qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
+ MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
+ MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
+ MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
+ MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
+ MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
+ MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
+ MLX5_SET(qpc, qpc, min_rnr_nak, 1);
+ MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
+ MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
+ ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
+ if (ret || host_qp)
+ return ret;
+
+ /* RTS */
+ qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
+ MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
+ MLX5_SET(qpc, qpc, retry_count, 7);
+ MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
+ MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
+ MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
+ MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
+
+ return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
+}
+
+static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_qp *qp)
+{
+ u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
+
+ MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
+ MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
+ mlx5_cmd_exec_in(mdev, destroy_qp, in);
+
+ mlx5_frag_buf_free(mdev, &qp->buf);
+ mlx5_db_free(mdev, &qp->db);
+ kfree(qp);
+}
+
+static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
+{
+ int i;
+
+ /* Undo alloc_pages_bulk_array() */
+ for (i = 0; i < recv_buf->npages; i++)
+ __free_page(recv_buf->page_list[i]);
+
+ kvfree(recv_buf->page_list);
+}
+
+static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
+ unsigned int npages)
+{
+ unsigned int filled = 0, done = 0;
+ int i;
+
+ recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
+ GFP_KERNEL_ACCOUNT);
+ if (!recv_buf->page_list)
+ return -ENOMEM;
+
+ for (;;) {
+ filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
+ npages - done,
+ recv_buf->page_list + done);
+ if (!filled)
+ goto err;
+
+ done += filled;
+ if (done == npages)
+ break;
+ }
+
+ recv_buf->npages = npages;
+ return 0;
+
+err:
+ for (i = 0; i < npages; i++) {
+ if (recv_buf->page_list[i])
+ __free_page(recv_buf->page_list[i]);
+ }
+
+ kvfree(recv_buf->page_list);
+ return -ENOMEM;
+}
+
+static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_recv_buf *recv_buf)
+{
+ int i, j;
+
+ recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
+ sizeof(*recv_buf->dma_addrs),
+ GFP_KERNEL_ACCOUNT);
+ if (!recv_buf->dma_addrs)
+ return -ENOMEM;
+
+ for (i = 0; i < recv_buf->npages; i++) {
+ recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
+ recv_buf->page_list[i],
+ 0, PAGE_SIZE,
+ DMA_FROM_DEVICE);
+ if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
+ goto error;
+ }
+ return 0;
+
+error:
+ for (j = 0; j < i; j++)
+ dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
+ PAGE_SIZE, DMA_FROM_DEVICE);
+
+ kvfree(recv_buf->dma_addrs);
+ return -ENOMEM;
+}
+
+static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_recv_buf *recv_buf)
+{
+ int i;
+
+ for (i = 0; i < recv_buf->npages; i++)
+ dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
+ PAGE_SIZE, DMA_FROM_DEVICE);
+
+ kvfree(recv_buf->dma_addrs);
+}
+
+static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_qp *qp)
+{
+ struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
+
+ mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
+ unregister_dma_recv_pages(mdev, recv_buf);
+ free_recv_pages(&qp->recv_buf);
+}
+
+static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
+ struct mlx5_vhca_qp *qp, u32 pdn,
+ u64 rq_size)
+{
+ unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
+ struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
+ int err;
+
+ err = alloc_recv_pages(recv_buf, npages);
+ if (err < 0)
+ return err;
+
+ err = register_dma_recv_pages(mdev, recv_buf);
+ if (err)
+ goto end;
+
+ err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
+ if (err)
+ goto err_create_mkey;
+
+ return 0;
+
+err_create_mkey:
+ unregister_dma_recv_pages(mdev, recv_buf);
+end:
+ free_recv_pages(recv_buf);
+ return err;
+}
+
+static void
+_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
+{
+ struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
+ struct mlx5_core_dev *mdev = mvdev->mdev;
+
+ lockdep_assert_held(&mvdev->state_mutex);
+
+ if (!mvdev->log_active)
+ return;
+
+ WARN_ON(mvdev->mdev_detach);
+
+ mlx5_eq_notifier_unregister(mdev, &tracker->nb);
+ mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
+ mlx5vf_destroy_qp(mdev, tracker->fw_qp);
+ mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
+ mlx5vf_destroy_qp(mdev, tracker->host_qp);
+ mlx5vf_destroy_cq(mdev, &tracker->cq);
+ mlx5_core_dealloc_pd(mdev, tracker->pdn);
+ mlx5_put_uars_page(mdev, tracker->uar);
+ mvdev->log_active = false;
+}
+
+int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+
+ mutex_lock(&mvdev->state_mutex);
+ if (!mvdev->log_active)
+ goto end;
+
+ _mlx5vf_free_page_tracker_resources(mvdev);
+ mvdev->log_active = false;
+end:
+ mlx5vf_state_mutex_unlock(mvdev);
+ return 0;
+}
+
+int mlx5vf_start_page_tracker(struct vfio_device *vdev,
+ struct rb_root_cached *ranges, u32 nnodes,
+ u64 *page_size)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+ struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
+ u8 log_tracked_page = ilog2(*page_size);
+ struct mlx5_vhca_qp *host_qp;
+ struct mlx5_vhca_qp *fw_qp;
+ struct mlx5_core_dev *mdev;
+ u32 max_msg_size = PAGE_SIZE;
+ u64 rq_size = SZ_2M;
+ u32 max_recv_wr;
+ int err;
+
+ mutex_lock(&mvdev->state_mutex);
+ if (mvdev->mdev_detach) {
+ err = -ENOTCONN;
+ goto end;
+ }
+
+ if (mvdev->log_active) {
+ err = -EINVAL;
+ goto end;
+ }
+
+ mdev = mvdev->mdev;
+ memset(tracker, 0, sizeof(*tracker));
+ tracker->uar = mlx5_get_uars_page(mdev);
+ if (IS_ERR(tracker->uar)) {
+ err = PTR_ERR(tracker->uar);
+ goto end;
+ }
+
+ err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
+ if (err)
+ goto err_uar;
+
+ max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
+ err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
+ if (err)
+ goto err_dealloc_pd;
+
+ host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
+ if (IS_ERR(host_qp)) {
+ err = PTR_ERR(host_qp);
+ goto err_cq;
+ }
+
+ host_qp->max_msg_size = max_msg_size;
+ if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
+ pg_track_log_min_page_size)) {
+ log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
+ pg_track_log_min_page_size);
+ } else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
+ pg_track_log_max_page_size)) {
+ log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
+ pg_track_log_max_page_size);
+ }
+
+ host_qp->tracked_page_size = (1ULL << log_tracked_page);
+ err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
+ rq_size);
+ if (err)
+ goto err_host_qp;
+
+ fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
+ if (IS_ERR(fw_qp)) {
+ err = PTR_ERR(fw_qp);
+ goto err_recv_resources;
+ }
+
+ err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
+ if (err)
+ goto err_activate;
+
+ err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
+ if (err)
+ goto err_activate;
+
+ tracker->host_qp = host_qp;
+ tracker->fw_qp = fw_qp;
+ err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
+ if (err)
+ goto err_activate;
+
+ MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
+ mlx5_eq_notifier_register(mdev, &tracker->nb);
+ *page_size = host_qp->tracked_page_size;
+ mvdev->log_active = true;
+ mlx5vf_state_mutex_unlock(mvdev);
+ return 0;
+
+err_activate:
+ mlx5vf_destroy_qp(mdev, fw_qp);
+err_recv_resources:
+ mlx5vf_free_qp_recv_resources(mdev, host_qp);
+err_host_qp:
+ mlx5vf_destroy_qp(mdev, host_qp);
+err_cq:
+ mlx5vf_destroy_cq(mdev, &tracker->cq);
+err_dealloc_pd:
+ mlx5_core_dealloc_pd(mdev, tracker->pdn);
+err_uar:
+ mlx5_put_uars_page(mdev, tracker->uar);
+end:
+ mlx5vf_state_mutex_unlock(mvdev);
+ return err;
+}
+
+static void
+set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
+ struct iova_bitmap *dirty)
+{
+ u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
+ u32 nent = size / entry_size;
+ struct page *page;
+ u64 addr;
+ u64 *buf;
+ int i;
+
+ if (WARN_ON(index >= qp->recv_buf.npages ||
+ (nent > qp->max_msg_size / entry_size)))
+ return;
+
+ page = qp->recv_buf.page_list[index];
+ buf = kmap_local_page(page);
+ for (i = 0; i < nent; i++) {
+ addr = MLX5_GET(page_track_report_entry, buf + i,
+ dirty_address_low);
+ addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
+ dirty_address_high) << 32;
+ iova_bitmap_set(dirty, addr, qp->tracked_page_size);
+ }
+ kunmap_local(buf);
+}
+
+static void
+mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
+ struct iova_bitmap *dirty, int *tracker_status)
+{
+ u32 size;
+ int ix;
+
+ qp->rq.cc++;
+ *tracker_status = be32_to_cpu(cqe->immediate) >> 28;
+ size = be32_to_cpu(cqe->byte_cnt);
+ ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);
+
+ /* zero length CQE, no data */
+ WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
+ if (size)
+ set_report_output(size, ix, qp, dirty);
+
+ qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
+ mlx5vf_post_recv(qp);
+}
+
+static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
+{
+ return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
+}
+
+static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
+{
+ void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
+ struct mlx5_cqe64 *cqe64;
+
+ cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+
+ if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
+ !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
+ return cqe64;
+ } else {
+ return NULL;
+ }
+}
+
+static int
+mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
+ struct iova_bitmap *dirty, int *tracker_status)
+{
+ struct mlx5_cqe64 *cqe;
+ u8 opcode;
+
+ cqe = get_sw_cqe(cq, cq->mcq.cons_index);
+ if (!cqe)
+ return CQ_EMPTY;
+
+ ++cq->mcq.cons_index;
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+ opcode = get_cqe_opcode(cqe);
+ switch (opcode) {
+ case MLX5_CQE_RESP_SEND_IMM:
+ mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
+ return CQ_OK;
+ default:
+ return CQ_POLL_ERR;
+ }
+}
+
+int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
+ unsigned long length,
+ struct iova_bitmap *dirty)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+ struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
+ struct mlx5_vhca_cq *cq = &tracker->cq;
+ struct mlx5_core_dev *mdev;
+ int poll_err, err;
+
+ mutex_lock(&mvdev->state_mutex);
+ if (!mvdev->log_active) {
+ err = -EINVAL;
+ goto end;
+ }
+
+ if (mvdev->mdev_detach) {
+ err = -ENOTCONN;
+ goto end;
+ }
+
+ mdev = mvdev->mdev;
+ err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
+ MLX5_PAGE_TRACK_STATE_REPORTING);
+ if (err)
+ goto end;
+
+ tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
+ while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
+ !tracker->is_err) {
+ poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
+ &tracker->status);
+ if (poll_err == CQ_EMPTY) {
+ mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
+ cq->mcq.cons_index);
+ poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
+ dirty, &tracker->status);
+ if (poll_err == CQ_EMPTY) {
+ wait_for_completion(&mvdev->tracker_comp);
+ continue;
+ }
+ }
+ if (poll_err == CQ_POLL_ERR) {
+ err = -EIO;
+ goto end;
+ }
+ mlx5_cq_set_ci(&cq->mcq);
+ }
+
+ if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
+ tracker->is_err = true;
+
+ if (tracker->is_err)
+ err = -EIO;
+end:
+ mlx5vf_state_mutex_unlock(mvdev);
+ return err;
+}
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
new file mode 100644
index 0000000000..aec4c69dd6
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#ifndef MLX5_VFIO_CMD_H
+#define MLX5_VFIO_CMD_H
+
+#include <linux/kernel.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/vport.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+
+#define MLX5VF_PRE_COPY_SUPP(mvdev) \
+ ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY)
+
+enum mlx5_vf_migf_state {
+ MLX5_MIGF_STATE_ERROR = 1,
+ MLX5_MIGF_STATE_PRE_COPY_ERROR,
+ MLX5_MIGF_STATE_PRE_COPY,
+ MLX5_MIGF_STATE_SAVE_LAST,
+ MLX5_MIGF_STATE_COMPLETE,
+};
+
+enum mlx5_vf_load_state {
+ MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER,
+ MLX5_VF_LOAD_STATE_READ_HEADER,
+ MLX5_VF_LOAD_STATE_PREP_HEADER_DATA,
+ MLX5_VF_LOAD_STATE_READ_HEADER_DATA,
+ MLX5_VF_LOAD_STATE_PREP_IMAGE,
+ MLX5_VF_LOAD_STATE_READ_IMAGE,
+ MLX5_VF_LOAD_STATE_LOAD_IMAGE,
+};
+
+struct mlx5_vf_migration_tag_stop_copy_data {
+ __le64 stop_copy_size;
+};
+
+enum mlx5_vf_migf_header_flags {
+ MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY = 0,
+ MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL = 1 << 0,
+};
+
+enum mlx5_vf_migf_header_tag {
+ MLX5_MIGF_HEADER_TAG_FW_DATA = 0,
+ MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE = 1 << 0,
+};
+
+struct mlx5_vf_migration_header {
+ __le64 record_size;
+ /* For future use in case we may need to change the kernel protocol */
+ __le32 flags; /* Use mlx5_vf_migf_header_flags */
+ __le32 tag; /* Use mlx5_vf_migf_header_tag */
+ __u8 data[]; /* Its size is given in the record_size */
+};
+
+struct mlx5_vhca_data_buffer {
+ struct sg_append_table table;
+ loff_t start_pos;
+ u64 length;
+ u64 allocated_length;
+ u32 mkey;
+ enum dma_data_direction dma_dir;
+ u8 dmaed:1;
+ struct list_head buf_elm;
+ struct mlx5_vf_migration_file *migf;
+ /* Optimize mlx5vf_get_migration_page() for sequential access */
+ struct scatterlist *last_offset_sg;
+ unsigned int sg_last_entry;
+ unsigned long last_offset;
+};
+
+struct mlx5vf_async_data {
+ struct mlx5_async_work cb_work;
+ struct work_struct work;
+ struct mlx5_vhca_data_buffer *buf;
+ struct mlx5_vhca_data_buffer *header_buf;
+ int status;
+ u8 last_chunk:1;
+ void *out;
+};
+
+struct mlx5_vf_migration_file {
+ struct file *filp;
+ struct mutex lock;
+ enum mlx5_vf_migf_state state;
+
+ enum mlx5_vf_load_state load_state;
+ u32 pdn;
+ loff_t max_pos;
+ u64 record_size;
+ u32 record_tag;
+ u64 stop_copy_prep_size;
+ u64 pre_copy_initial_bytes;
+ struct mlx5_vhca_data_buffer *buf;
+ struct mlx5_vhca_data_buffer *buf_header;
+ spinlock_t list_lock;
+ struct list_head buf_list;
+ struct list_head avail_list;
+ struct mlx5vf_pci_core_device *mvdev;
+ wait_queue_head_t poll_wait;
+ struct completion save_comp;
+ struct mlx5_async_ctx async_ctx;
+ struct mlx5vf_async_data async_data;
+};
+
+struct mlx5_vhca_cq_buf {
+ struct mlx5_frag_buf_ctrl fbc;
+ struct mlx5_frag_buf frag_buf;
+ int cqe_size;
+ int nent;
+};
+
+struct mlx5_vhca_cq {
+ struct mlx5_vhca_cq_buf buf;
+ struct mlx5_db db;
+ struct mlx5_core_cq mcq;
+ size_t ncqe;
+};
+
+struct mlx5_vhca_recv_buf {
+ u32 npages;
+ struct page **page_list;
+ dma_addr_t *dma_addrs;
+ u32 next_rq_offset;
+ u32 mkey;
+};
+
+struct mlx5_vhca_qp {
+ struct mlx5_frag_buf buf;
+ struct mlx5_db db;
+ struct mlx5_vhca_recv_buf recv_buf;
+ u32 tracked_page_size;
+ u32 max_msg_size;
+ u32 qpn;
+ struct {
+ unsigned int pc;
+ unsigned int cc;
+ unsigned int wqe_cnt;
+ __be32 *db;
+ struct mlx5_frag_buf_ctrl fbc;
+ } rq;
+};
+
+struct mlx5_vhca_page_tracker {
+ u32 id;
+ u32 pdn;
+ u8 is_err:1;
+ struct mlx5_uars_page *uar;
+ struct mlx5_vhca_cq cq;
+ struct mlx5_vhca_qp *host_qp;
+ struct mlx5_vhca_qp *fw_qp;
+ struct mlx5_nb nb;
+ int status;
+};
+
+struct mlx5vf_pci_core_device {
+ struct vfio_pci_core_device core_device;
+ int vf_id;
+ u16 vhca_id;
+ u8 migrate_cap:1;
+ u8 deferred_reset:1;
+ u8 mdev_detach:1;
+ u8 log_active:1;
+ struct completion tracker_comp;
+ /* protect migration state */
+ struct mutex state_mutex;
+ enum vfio_device_mig_state mig_state;
+ /* protect the reset_done flow */
+ spinlock_t reset_lock;
+ struct mlx5_vf_migration_file *resuming_migf;
+ struct mlx5_vf_migration_file *saving_migf;
+ struct mlx5_vhca_page_tracker tracker;
+ struct workqueue_struct *cb_wq;
+ struct notifier_block nb;
+ struct mlx5_core_dev *mdev;
+};
+
+enum {
+ MLX5VF_QUERY_INC = (1UL << 0),
+ MLX5VF_QUERY_FINAL = (1UL << 1),
+};
+
+int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
+int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
+int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
+ size_t *state_size, u8 query_flags);
+void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
+ const struct vfio_migration_ops *mig_ops,
+ const struct vfio_log_ops *log_ops);
+void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev);
+void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev);
+int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf, bool inc,
+ bool track);
+int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf);
+int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf);
+void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf);
+void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf);
+struct mlx5_vhca_data_buffer *
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length, enum dma_data_direction dma_dir);
+void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf);
+struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length, enum dma_data_direction dma_dir);
+void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
+int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+ unsigned int npages);
+struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+ unsigned long offset);
+void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
+void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev);
+void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
+int mlx5vf_start_page_tracker(struct vfio_device *vdev,
+ struct rb_root_cached *ranges, u32 nnodes, u64 *page_size);
+int mlx5vf_stop_page_tracker(struct vfio_device *vdev);
+int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
+ unsigned long length, struct iova_bitmap *dirty);
+#endif /* MLX5_VFIO_CMD_H */
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
new file mode 100644
index 0000000000..42ec574a86
--- /dev/null
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -0,0 +1,1383 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/sched/mm.h>
+#include <linux/anon_inodes.h>
+
+#include "cmd.h"
+
+/* Device specification max LOAD size */
+#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
+
+static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+ return container_of(core_device, struct mlx5vf_pci_core_device,
+ core_device);
+}
+
+struct page *
+mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+ unsigned long offset)
+{
+ unsigned long cur_offset = 0;
+ struct scatterlist *sg;
+ unsigned int i;
+
+ /* All accesses are sequential */
+ if (offset < buf->last_offset || !buf->last_offset_sg) {
+ buf->last_offset = 0;
+ buf->last_offset_sg = buf->table.sgt.sgl;
+ buf->sg_last_entry = 0;
+ }
+
+ cur_offset = buf->last_offset;
+
+ for_each_sg(buf->last_offset_sg, sg,
+ buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
+ if (offset < sg->length + cur_offset) {
+ buf->last_offset_sg = sg;
+ buf->sg_last_entry += i;
+ buf->last_offset = cur_offset;
+ return nth_page(sg_page(sg),
+ (offset - cur_offset) / PAGE_SIZE);
+ }
+ cur_offset += sg->length;
+ }
+ return NULL;
+}
+
+int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+ unsigned int npages)
+{
+ unsigned int to_alloc = npages;
+ struct page **page_list;
+ unsigned long filled;
+ unsigned int to_fill;
+ int ret;
+
+ to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
+ page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
+ if (!page_list)
+ return -ENOMEM;
+
+ do {
+ filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
+ page_list);
+ if (!filled) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ to_alloc -= filled;
+ ret = sg_alloc_append_table_from_pages(
+ &buf->table, page_list, filled, 0,
+ filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
+ GFP_KERNEL_ACCOUNT);
+
+ if (ret)
+ goto err;
+ buf->allocated_length += filled * PAGE_SIZE;
+ /* clean input for another bulk allocation */
+ memset(page_list, 0, filled * sizeof(*page_list));
+ to_fill = min_t(unsigned int, to_alloc,
+ PAGE_SIZE / sizeof(*page_list));
+ } while (to_alloc > 0);
+
+ kvfree(page_list);
+ return 0;
+
+err:
+ kvfree(page_list);
+ return ret;
+}
+
+static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+ migf->state = MLX5_MIGF_STATE_ERROR;
+ migf->filp->f_pos = 0;
+ mutex_unlock(&migf->lock);
+}
+
+static int mlx5vf_release_file(struct inode *inode, struct file *filp)
+{
+ struct mlx5_vf_migration_file *migf = filp->private_data;
+
+ mlx5vf_disable_fd(migf);
+ mutex_destroy(&migf->lock);
+ kfree(migf);
+ return 0;
+}
+
+static struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
+ bool *end_of_data)
+{
+ struct mlx5_vhca_data_buffer *buf;
+ bool found = false;
+
+ *end_of_data = false;
+ spin_lock_irq(&migf->list_lock);
+ if (list_empty(&migf->buf_list)) {
+ *end_of_data = true;
+ goto end;
+ }
+
+ buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
+ buf_elm);
+ if (pos >= buf->start_pos &&
+ pos < buf->start_pos + buf->length) {
+ found = true;
+ goto end;
+ }
+
+ /*
+ * As we use a stream based FD we may expect having the data always
+ * on first chunk
+ */
+ migf->state = MLX5_MIGF_STATE_ERROR;
+
+end:
+ spin_unlock_irq(&migf->list_lock);
+ return found ? buf : NULL;
+}
+
+static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
+ char __user **buf, size_t *len, loff_t *pos)
+{
+ unsigned long offset;
+ ssize_t done = 0;
+ size_t copy_len;
+
+ copy_len = min_t(size_t,
+ vhca_buf->start_pos + vhca_buf->length - *pos, *len);
+ while (copy_len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *from_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+ page_offset = offset % PAGE_SIZE;
+ offset -= page_offset;
+ page = mlx5vf_get_migration_page(vhca_buf, offset);
+ if (!page)
+ return -EINVAL;
+ page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
+ from_buff = kmap_local_page(page);
+ ret = copy_to_user(*buf, from_buff + page_offset, page_len);
+ kunmap_local(from_buff);
+ if (ret)
+ return -EFAULT;
+ *pos += page_len;
+ *len -= page_len;
+ *buf += page_len;
+ done += page_len;
+ copy_len -= page_len;
+ }
+
+ if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
+ spin_lock_irq(&vhca_buf->migf->list_lock);
+ list_del_init(&vhca_buf->buf_elm);
+ list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
+ spin_unlock_irq(&vhca_buf->migf->list_lock);
+ }
+
+ return done;
+}
+
+static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *pos)
+{
+ struct mlx5_vf_migration_file *migf = filp->private_data;
+ struct mlx5_vhca_data_buffer *vhca_buf;
+ bool first_loop_call = true;
+ bool end_of_data;
+ ssize_t done = 0;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ if (!(filp->f_flags & O_NONBLOCK)) {
+ if (wait_event_interruptible(migf->poll_wait,
+ !list_empty(&migf->buf_list) ||
+ migf->state == MLX5_MIGF_STATE_ERROR ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_COMPLETE))
+ return -ERESTARTSYS;
+ }
+
+ mutex_lock(&migf->lock);
+ if (migf->state == MLX5_MIGF_STATE_ERROR) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ while (len) {
+ ssize_t count;
+
+ vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
+ &end_of_data);
+ if (first_loop_call) {
+ first_loop_call = false;
+ /* Temporary end of file as part of PRE_COPY */
+ if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
+ done = -ENOMSG;
+ goto out_unlock;
+ }
+
+ if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
+ if (filp->f_flags & O_NONBLOCK) {
+ done = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+ }
+
+ if (end_of_data)
+ goto out_unlock;
+
+ if (!vhca_buf) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
+ if (count < 0) {
+ done = count;
+ goto out_unlock;
+ }
+ done += count;
+ }
+
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return done;
+}
+
+static __poll_t mlx5vf_save_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct mlx5_vf_migration_file *migf = filp->private_data;
+ __poll_t pollflags = 0;
+
+ poll_wait(filp, &migf->poll_wait, wait);
+
+ mutex_lock(&migf->lock);
+ if (migf->state == MLX5_MIGF_STATE_ERROR)
+ pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+ else if (!list_empty(&migf->buf_list) ||
+ migf->state == MLX5_MIGF_STATE_COMPLETE)
+ pollflags = EPOLLIN | EPOLLRDNORM;
+ mutex_unlock(&migf->lock);
+
+ return pollflags;
+}
+
+/*
+ * FD is exposed and user can use it after receiving an error.
+ * Mark migf in error, and wake the user.
+ */
+static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
+{
+ migf->state = MLX5_MIGF_STATE_ERROR;
+ wake_up_interruptible(&migf->poll_wait);
+}
+
+static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
+{
+ size_t size = sizeof(struct mlx5_vf_migration_header) +
+ sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
+ struct mlx5_vf_migration_tag_stop_copy_data data = {};
+ struct mlx5_vhca_data_buffer *header_buf = NULL;
+ struct mlx5_vf_migration_header header = {};
+ unsigned long flags;
+ struct page *page;
+ u8 *to_buff;
+ int ret;
+
+ header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
+ if (IS_ERR(header_buf))
+ return PTR_ERR(header_buf);
+
+ header.record_size = cpu_to_le64(sizeof(data));
+ header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
+ header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
+ page = mlx5vf_get_migration_page(header_buf, 0);
+ if (!page) {
+ ret = -EINVAL;
+ goto err;
+ }
+ to_buff = kmap_local_page(page);
+ memcpy(to_buff, &header, sizeof(header));
+ header_buf->length = sizeof(header);
+ data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
+ memcpy(to_buff + sizeof(header), &data, sizeof(data));
+ header_buf->length += sizeof(data);
+ kunmap_local(to_buff);
+ header_buf->start_pos = header_buf->migf->max_pos;
+ migf->max_pos += header_buf->length;
+ spin_lock_irqsave(&migf->list_lock, flags);
+ list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+ spin_unlock_irqrestore(&migf->list_lock, flags);
+ migf->pre_copy_initial_bytes = size;
+ return 0;
+err:
+ mlx5vf_put_data_buffer(header_buf);
+ return ret;
+}
+
+static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
+ size_t state_size)
+{
+ struct mlx5_vhca_data_buffer *buf;
+ size_t inc_state_size;
+ int ret;
+
+ /* let's be ready for stop_copy size that might grow by 10 percents */
+ if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
+ inc_state_size = state_size;
+
+ buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
+ migf->buf = buf;
+ buf = mlx5vf_get_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto err;
+ }
+
+ migf->buf_header = buf;
+ ret = mlx5vf_add_stop_copy_header(migf);
+ if (ret)
+ goto err_header;
+ return 0;
+
+err_header:
+ mlx5vf_put_data_buffer(migf->buf_header);
+ migf->buf_header = NULL;
+err:
+ mlx5vf_put_data_buffer(migf->buf);
+ migf->buf = NULL;
+ return ret;
+}
+
+static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mlx5_vf_migration_file *migf = filp->private_data;
+ struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+ struct mlx5_vhca_data_buffer *buf;
+ struct vfio_precopy_info info = {};
+ loff_t *pos = &filp->f_pos;
+ unsigned long minsz;
+ size_t inc_length = 0;
+ bool end_of_data = false;
+ int ret;
+
+ if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+ return -ENOTTY;
+
+ minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ mutex_lock(&mvdev->state_mutex);
+ if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+ mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+ ret = -EINVAL;
+ goto err_state_unlock;
+ }
+
+ /*
+ * We can't issue a SAVE command when the device is suspended, so as
+ * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
+ * bytes that can't be read.
+ */
+ if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
+ /*
+ * Once the query returns it's guaranteed that there is no
+ * active SAVE command.
+ * As so, the other code below is safe with the proper locks.
+ */
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
+ MLX5VF_QUERY_INC);
+ if (ret)
+ goto err_state_unlock;
+ }
+
+ mutex_lock(&migf->lock);
+ if (migf->state == MLX5_MIGF_STATE_ERROR) {
+ ret = -ENODEV;
+ goto err_migf_unlock;
+ }
+
+ if (migf->pre_copy_initial_bytes > *pos) {
+ info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
+ } else {
+ info.dirty_bytes = migf->max_pos - *pos;
+ if (!info.dirty_bytes)
+ end_of_data = true;
+ info.dirty_bytes += inc_length;
+ }
+
+ if (!end_of_data || !inc_length) {
+ mutex_unlock(&migf->lock);
+ goto done;
+ }
+
+ mutex_unlock(&migf->lock);
+ /*
+ * We finished transferring the current state and the device has a
+ * dirty state, save a new state to be ready for.
+ */
+ buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ mlx5vf_mark_err(migf);
+ goto err_state_unlock;
+ }
+
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
+ if (ret) {
+ mlx5vf_mark_err(migf);
+ mlx5vf_put_data_buffer(buf);
+ goto err_state_unlock;
+ }
+
+done:
+ mlx5vf_state_mutex_unlock(mvdev);
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+ return 0;
+
+err_migf_unlock:
+ mutex_unlock(&migf->lock);
+err_state_unlock:
+ mlx5vf_state_mutex_unlock(mvdev);
+ return ret;
+}
+
+static const struct file_operations mlx5vf_save_fops = {
+ .owner = THIS_MODULE,
+ .read = mlx5vf_save_read,
+ .poll = mlx5vf_save_poll,
+ .unlocked_ioctl = mlx5vf_precopy_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .release = mlx5vf_release_file,
+ .llseek = no_llseek,
+};
+
+static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
+{
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+ struct mlx5_vhca_data_buffer *buf;
+ size_t length;
+ int ret;
+
+ if (migf->state == MLX5_MIGF_STATE_ERROR)
+ return -ENODEV;
+
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
+ MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
+ if (ret)
+ goto err;
+
+ /* Checking whether we have a matching pre-allocated buffer that can fit */
+ if (migf->buf && migf->buf->allocated_length >= length) {
+ buf = migf->buf;
+ migf->buf = NULL;
+ } else {
+ buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto err;
+ }
+ }
+
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
+ if (ret)
+ goto err_save;
+
+ return 0;
+
+err_save:
+ mlx5vf_put_data_buffer(buf);
+err:
+ mlx5vf_mark_err(migf);
+ return ret;
+}
+
+static struct mlx5_vf_migration_file *
+mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
+{
+ struct mlx5_vf_migration_file *migf;
+ struct mlx5_vhca_data_buffer *buf;
+ size_t length;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
+ O_RDONLY);
+ if (IS_ERR(migf->filp)) {
+ ret = PTR_ERR(migf->filp);
+ goto end;
+ }
+
+ migf->mvdev = mvdev;
+ ret = mlx5vf_cmd_alloc_pd(migf);
+ if (ret)
+ goto out_free;
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ init_waitqueue_head(&migf->poll_wait);
+ init_completion(&migf->save_comp);
+ /*
+ * save_comp is being used as a binary semaphore built from
+ * a completion. A normal mutex cannot be used because the lock is
+ * passed between kernel threads and lockdep can't model this.
+ */
+ complete(&migf->save_comp);
+ mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
+ INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
+ if (ret)
+ goto out_pd;
+
+ if (track) {
+ ret = mlx5vf_prep_stop_copy(migf, length);
+ if (ret)
+ goto out_pd;
+ }
+
+ buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_pd;
+ }
+
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
+ if (ret)
+ goto out_save;
+ return migf;
+out_save:
+ mlx5vf_free_data_buffer(buf);
+out_pd:
+ mlx5fv_cmd_clean_migf_resources(migf);
+out_free:
+ fput(migf->filp);
+end:
+ kfree(migf);
+ return ERR_PTR(ret);
+}
+
+static int
+mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ unsigned long offset;
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *to_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+ page_offset = offset % PAGE_SIZE;
+
+ page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
+ if (!page)
+ return -EINVAL;
+ page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + page_offset, *buf, page_len);
+ kunmap_local(to_buff);
+ if (ret)
+ return -EFAULT;
+
+ *pos += page_len;
+ *done += page_len;
+ *buf += page_len;
+ *len -= page_len;
+ vhca_buf->length += page_len;
+ return 0;
+}
+
+static int
+mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
+ loff_t requested_length,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ int ret;
+
+ if (requested_length > MAX_LOAD_SIZE)
+ return -ENOMEM;
+
+ if (vhca_buf->allocated_length < requested_length) {
+ ret = mlx5vf_add_migration_pages(
+ vhca_buf,
+ DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
+ PAGE_SIZE));
+ if (ret)
+ return ret;
+ }
+
+ while (*len) {
+ ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
+ done);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static ssize_t
+mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *vhca_buf,
+ size_t image_size, const char __user **buf,
+ size_t *len, loff_t *pos, ssize_t *done,
+ bool *has_work)
+{
+ size_t copy_len, to_copy;
+ int ret;
+
+ to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
+ copy_len = to_copy;
+ while (to_copy) {
+ ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
+ done);
+ if (ret)
+ return ret;
+ }
+
+ *len -= copy_len;
+ if (vhca_buf->length == image_size) {
+ migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
+ migf->max_pos += image_size;
+ *has_work = true;
+ }
+
+ return 0;
+}
+
+static int
+mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *vhca_buf,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ size_t copy_len, to_copy;
+ size_t required_data;
+ u8 *to_buff;
+ int ret;
+
+ required_data = migf->record_size - vhca_buf->length;
+ to_copy = min_t(size_t, *len, required_data);
+ copy_len = to_copy;
+ while (to_copy) {
+ ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
+ done);
+ if (ret)
+ return ret;
+ }
+
+ *len -= copy_len;
+ if (vhca_buf->length == migf->record_size) {
+ switch (migf->record_tag) {
+ case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
+ {
+ struct page *page;
+
+ page = mlx5vf_get_migration_page(vhca_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ migf->stop_copy_prep_size = min_t(u64,
+ le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
+ kunmap_local(to_buff);
+ break;
+ }
+ default:
+ /* Optional tag */
+ break;
+ }
+
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+ migf->max_pos += migf->record_size;
+ vhca_buf->length = 0;
+ }
+
+ return 0;
+}
+
+static int
+mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *vhca_buf,
+ const char __user **buf,
+ size_t *len, loff_t *pos,
+ ssize_t *done, bool *has_work)
+{
+ struct page *page;
+ size_t copy_len;
+ u8 *to_buff;
+ int ret;
+
+ copy_len = min_t(size_t, *len,
+ sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
+ page = mlx5vf_get_migration_page(vhca_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto end;
+ }
+
+ *buf += copy_len;
+ *pos += copy_len;
+ *done += copy_len;
+ *len -= copy_len;
+ vhca_buf->length += copy_len;
+ if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
+ u64 record_size;
+ u32 flags;
+
+ record_size = le64_to_cpup((__le64 *)to_buff);
+ if (record_size > MAX_LOAD_SIZE) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ migf->record_size = record_size;
+ flags = le32_to_cpup((__le32 *)(to_buff +
+ offsetof(struct mlx5_vf_migration_header, flags)));
+ migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
+ offsetof(struct mlx5_vf_migration_header, tag)));
+ switch (migf->record_tag) {
+ case MLX5_MIGF_HEADER_TAG_FW_DATA:
+ migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
+ break;
+ case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
+ migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
+ break;
+ default:
+ if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
+ ret = -EOPNOTSUPP;
+ goto end;
+ }
+ /* We may read and skip this optional record data */
+ migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
+ }
+
+ migf->max_pos += vhca_buf->length;
+ vhca_buf->length = 0;
+ *has_work = true;
+ }
+end:
+ kunmap_local(to_buff);
+ return ret;
+}
+
+static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct mlx5_vf_migration_file *migf = filp->private_data;
+ struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
+ struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
+ loff_t requested_length;
+ bool has_work = false;
+ ssize_t done = 0;
+ int ret = 0;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ if (*pos < 0 ||
+ check_add_overflow((loff_t)len, *pos, &requested_length))
+ return -EINVAL;
+
+ mutex_lock(&migf->mvdev->state_mutex);
+ mutex_lock(&migf->lock);
+ if (migf->state == MLX5_MIGF_STATE_ERROR) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ while (len || has_work) {
+ has_work = false;
+ switch (migf->load_state) {
+ case MLX5_VF_LOAD_STATE_READ_HEADER:
+ ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
+ &buf, &len, pos,
+ &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
+ if (vhca_buf_header->allocated_length < migf->record_size) {
+ mlx5vf_free_data_buffer(vhca_buf_header);
+
+ migf->buf_header = mlx5vf_alloc_data_buffer(migf,
+ migf->record_size, DMA_NONE);
+ if (IS_ERR(migf->buf_header)) {
+ ret = PTR_ERR(migf->buf_header);
+ migf->buf_header = NULL;
+ goto out_unlock;
+ }
+
+ vhca_buf_header = migf->buf_header;
+ }
+
+ vhca_buf_header->start_pos = migf->max_pos;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
+ break;
+ case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
+ ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
+ &buf, &len, pos, &done);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_PREP_IMAGE:
+ {
+ u64 size = max(migf->record_size,
+ migf->stop_copy_prep_size);
+
+ if (vhca_buf->allocated_length < size) {
+ mlx5vf_free_data_buffer(vhca_buf);
+
+ migf->buf = mlx5vf_alloc_data_buffer(migf,
+ size, DMA_TO_DEVICE);
+ if (IS_ERR(migf->buf)) {
+ ret = PTR_ERR(migf->buf);
+ migf->buf = NULL;
+ goto out_unlock;
+ }
+
+ vhca_buf = migf->buf;
+ }
+
+ vhca_buf->start_pos = migf->max_pos;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
+ break;
+ }
+ case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
+ ret = mlx5vf_resume_read_image_no_header(vhca_buf,
+ requested_length,
+ &buf, &len, pos, &done);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_READ_IMAGE:
+ ret = mlx5vf_resume_read_image(migf, vhca_buf,
+ migf->record_size,
+ &buf, &len, pos, &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
+ ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
+ if (ret)
+ goto out_unlock;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+
+ /* prep header buf for next image */
+ vhca_buf_header->length = 0;
+ /* prep data buf for next image */
+ vhca_buf->length = 0;
+
+ break;
+ default:
+ break;
+ }
+ }
+
+out_unlock:
+ if (ret)
+ migf->state = MLX5_MIGF_STATE_ERROR;
+ mutex_unlock(&migf->lock);
+ mlx5vf_state_mutex_unlock(migf->mvdev);
+ return ret ? ret : done;
+}
+
+static const struct file_operations mlx5vf_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = mlx5vf_resume_write,
+ .release = mlx5vf_release_file,
+ .llseek = no_llseek,
+};
+
+static struct mlx5_vf_migration_file *
+mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
+{
+ struct mlx5_vf_migration_file *migf;
+ struct mlx5_vhca_data_buffer *buf;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
+ O_WRONLY);
+ if (IS_ERR(migf->filp)) {
+ ret = PTR_ERR(migf->filp);
+ goto end;
+ }
+
+ migf->mvdev = mvdev;
+ ret = mlx5vf_cmd_alloc_pd(migf);
+ if (ret)
+ goto out_free;
+
+ buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_pd;
+ }
+
+ migf->buf = buf;
+ if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ buf = mlx5vf_alloc_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_buf;
+ }
+
+ migf->buf_header = buf;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+ } else {
+ /* Initial state will be to read the image */
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+ return migf;
+out_buf:
+ mlx5vf_free_data_buffer(migf->buf);
+out_pd:
+ mlx5vf_cmd_dealloc_pd(migf);
+out_free:
+ fput(migf->filp);
+end:
+ kfree(migf);
+ return ERR_PTR(ret);
+}
+
+void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
+{
+ if (mvdev->resuming_migf) {
+ mlx5vf_disable_fd(mvdev->resuming_migf);
+ mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
+ fput(mvdev->resuming_migf->filp);
+ mvdev->resuming_migf = NULL;
+ }
+ if (mvdev->saving_migf) {
+ mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
+ cancel_work_sync(&mvdev->saving_migf->async_data.work);
+ mlx5vf_disable_fd(mvdev->saving_migf);
+ mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
+ fput(mvdev->saving_migf->filp);
+ mvdev->saving_migf = NULL;
+ }
+}
+
+static struct file *
+mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
+ u32 new)
+{
+ u32 cur = mvdev->mig_state;
+ int ret;
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
+ ret = mlx5vf_cmd_suspend_vhca(mvdev,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ ret = mlx5vf_cmd_resume_vhca(mvdev,
+ MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ ret = mlx5vf_cmd_suspend_vhca(mvdev,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
+ ret = mlx5vf_cmd_resume_vhca(mvdev,
+ MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct mlx5_vf_migration_file *migf;
+
+ migf = mlx5vf_pci_save_device_data(mvdev, false);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ mvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+ new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+ mlx5vf_disable_fds(mvdev);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+ struct mlx5_vf_migration_file *migf;
+
+ migf = mlx5vf_pci_resume_device_data(mvdev);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ mvdev->resuming_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+ if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ ret = mlx5vf_cmd_load_vhca_state(mvdev,
+ mvdev->resuming_migf,
+ mvdev->resuming_migf->buf);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ mlx5vf_disable_fds(mvdev);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ struct mlx5_vf_migration_file *migf;
+
+ migf = mlx5vf_pci_save_device_data(mvdev, true);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ mvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ ret = mlx5vf_cmd_suspend_vhca(mvdev,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
+ if (ret)
+ return ERR_PTR(ret);
+ ret = mlx5vf_pci_save_device_inc_data(mvdev);
+ return ret ? ERR_PTR(ret) : NULL;
+ }
+
+ /*
+ * vfio_mig_get_next_state() does not use arcs other than the above
+ */
+ WARN_ON(true);
+ return ERR_PTR(-EINVAL);
+}
+
+/*
+ * This function is called in all state_mutex unlock cases to
+ * handle a 'deferred_reset' if exists.
+ */
+void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
+{
+again:
+ spin_lock(&mvdev->reset_lock);
+ if (mvdev->deferred_reset) {
+ mvdev->deferred_reset = false;
+ spin_unlock(&mvdev->reset_lock);
+ mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ mlx5vf_disable_fds(mvdev);
+ goto again;
+ }
+ mutex_unlock(&mvdev->state_mutex);
+ spin_unlock(&mvdev->reset_lock);
+}
+
+static struct file *
+mlx5vf_pci_set_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+ enum vfio_device_mig_state next_state;
+ struct file *res = NULL;
+ int ret;
+
+ mutex_lock(&mvdev->state_mutex);
+ while (new_state != mvdev->mig_state) {
+ ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
+ new_state, &next_state);
+ if (ret) {
+ res = ERR_PTR(ret);
+ break;
+ }
+ res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
+ if (IS_ERR(res))
+ break;
+ mvdev->mig_state = next_state;
+ if (WARN_ON(res && new_state != mvdev->mig_state)) {
+ fput(res);
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ mlx5vf_state_mutex_unlock(mvdev);
+ return res;
+}
+
+static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+ size_t state_size;
+ int ret;
+
+ mutex_lock(&mvdev->state_mutex);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
+ &state_size, 0);
+ if (!ret)
+ *stop_copy_length = state_size;
+ mlx5vf_state_mutex_unlock(mvdev);
+ return ret;
+}
+
+static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *curr_state)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+
+ mutex_lock(&mvdev->state_mutex);
+ *curr_state = mvdev->mig_state;
+ mlx5vf_state_mutex_unlock(mvdev);
+ return 0;
+}
+
+static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
+
+ if (!mvdev->migrate_cap)
+ return;
+
+ /*
+ * As the higher VFIO layers are holding locks across reset and using
+ * those same locks with the mm_lock we need to prevent ABBA deadlock
+ * with the state_mutex and mm_lock.
+ * In case the state_mutex was taken already we defer the cleanup work
+ * to the unlock flow of the other running context.
+ */
+ spin_lock(&mvdev->reset_lock);
+ mvdev->deferred_reset = true;
+ if (!mutex_trylock(&mvdev->state_mutex)) {
+ spin_unlock(&mvdev->reset_lock);
+ return;
+ }
+ spin_unlock(&mvdev->reset_lock);
+ mlx5vf_state_mutex_unlock(mvdev);
+}
+
+static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+ struct vfio_pci_core_device *vdev = &mvdev->core_device;
+ int ret;
+
+ ret = vfio_pci_core_enable(vdev);
+ if (ret)
+ return ret;
+
+ if (mvdev->migrate_cap)
+ mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ vfio_pci_core_finish_enable(vdev);
+ return 0;
+}
+
+static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+
+ mlx5vf_cmd_close_migratable(mvdev);
+ vfio_pci_core_close_device(core_vdev);
+}
+
+static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
+ .migration_set_state = mlx5vf_pci_set_device_state,
+ .migration_get_state = mlx5vf_pci_get_device_state,
+ .migration_get_data_size = mlx5vf_pci_get_data_size,
+};
+
+static const struct vfio_log_ops mlx5vf_pci_log_ops = {
+ .log_start = mlx5vf_start_page_tracker,
+ .log_stop = mlx5vf_stop_page_tracker,
+ .log_read_and_clear = mlx5vf_tracker_read_and_clear,
+};
+
+static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
+ struct mlx5vf_pci_core_device, core_device.vdev);
+ int ret;
+
+ ret = vfio_pci_core_init_dev(core_vdev);
+ if (ret)
+ return ret;
+
+ mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
+ &mlx5vf_pci_log_ops);
+
+ return 0;
+}
+
+static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
+ struct mlx5vf_pci_core_device, core_device.vdev);
+
+ mlx5vf_cmd_remove_migratable(mvdev);
+ vfio_pci_core_release_dev(core_vdev);
+}
+
+static const struct vfio_device_ops mlx5vf_pci_ops = {
+ .name = "mlx5-vfio-pci",
+ .init = mlx5vf_pci_init_dev,
+ .release = mlx5vf_pci_release_dev,
+ .open_device = mlx5vf_pci_open_device,
+ .close_device = mlx5vf_pci_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static int mlx5vf_pci_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ struct mlx5vf_pci_core_device *mvdev;
+ int ret;
+
+ mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
+ &pdev->dev, &mlx5vf_pci_ops);
+ if (IS_ERR(mvdev))
+ return PTR_ERR(mvdev);
+
+ dev_set_drvdata(&pdev->dev, &mvdev->core_device);
+ ret = vfio_pci_core_register_device(&mvdev->core_device);
+ if (ret)
+ goto out_put_vdev;
+ return 0;
+
+out_put_vdev:
+ vfio_put_device(&mvdev->core_device.vdev);
+ return ret;
+}
+
+static void mlx5vf_pci_remove(struct pci_dev *pdev)
+{
+ struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
+
+ vfio_pci_core_unregister_device(&mvdev->core_device);
+ vfio_put_device(&mvdev->core_device.vdev);
+}
+
+static const struct pci_device_id mlx5vf_pci_table[] = {
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
+ {}
+};
+
+MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
+
+static const struct pci_error_handlers mlx5vf_err_handlers = {
+ .reset_done = mlx5vf_pci_aer_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static struct pci_driver mlx5vf_pci_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = mlx5vf_pci_table,
+ .probe = mlx5vf_pci_probe,
+ .remove = mlx5vf_pci_remove,
+ .err_handler = &mlx5vf_err_handlers,
+ .driver_managed_dma = true,
+};
+
+module_pci_driver(mlx5vf_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
+MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
+MODULE_DESCRIPTION(
+ "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");