author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-18 17:40:19 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-18 17:40:19 +0000
commit     9f0fc191371843c4fc000a226b0a26b6c059aacd (patch)
tree       35f8be3ef04506ac891ad001e8c41e535ae8d01d /drivers/vfio
parent     Releasing progress-linux version 6.6.15-2~progress7.99u1. (diff)
Merging upstream version 6.7.7.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/vfio')
-rw-r--r--  drivers/vfio/Makefile          |   3
-rw-r--r--  drivers/vfio/cdx/main.c        |  58
-rw-r--r--  drivers/vfio/cdx/private.h     |   2
-rw-r--r--  drivers/vfio/iova_bitmap.c     | 423
-rw-r--r--  drivers/vfio/pci/mlx5/Kconfig  |   1
-rw-r--r--  drivers/vfio/pci/mlx5/cmd.c    | 103
-rw-r--r--  drivers/vfio/pci/mlx5/cmd.h    |  28
-rw-r--r--  drivers/vfio/pci/mlx5/main.c   | 284
-rw-r--r--  drivers/vfio/pci/pds/Kconfig   |   1
-rw-r--r--  drivers/vfio/pci/pds/pci_drv.c |   1
-rw-r--r--  drivers/vfio/vfio_main.c       |  11
11 files changed, 386 insertions(+), 529 deletions(-)
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index c82ea032d3..68c0570520 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,8 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_VFIO) += vfio.o
-vfio-y += vfio_main.o \
- iova_bitmap.o
+vfio-y += vfio_main.o
vfio-$(CONFIG_VFIO_DEVICE_CDEV) += device_cdev.o
vfio-$(CONFIG_VFIO_GROUP) += group.o
vfio-$(CONFIG_IOMMUFD) += iommufd.o
diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c
index de56686581..9cff8d7578 100644
--- a/drivers/vfio/cdx/main.c
+++ b/drivers/vfio/cdx/main.c
@@ -14,7 +14,7 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev)
container_of(core_vdev, struct vfio_cdx_device, vdev);
struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev);
int count = cdx_dev->res_count;
- int i;
+ int i, ret;
vdev->regions = kcalloc(count, sizeof(struct vfio_cdx_region),
GFP_KERNEL_ACCOUNT);
@@ -39,6 +39,17 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev)
if (!(cdx_dev->res[i].flags & IORESOURCE_READONLY))
vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_WRITE;
}
+ ret = cdx_dev_reset(core_vdev->dev);
+ if (ret) {
+ kfree(vdev->regions);
+ vdev->regions = NULL;
+ return ret;
+ }
+ ret = cdx_clear_master(cdx_dev);
+ if (ret)
+ vdev->flags &= ~BME_SUPPORT;
+ else
+ vdev->flags |= BME_SUPPORT;
return 0;
}
@@ -52,6 +63,49 @@ static void vfio_cdx_close_device(struct vfio_device *core_vdev)
cdx_dev_reset(core_vdev->dev);
}
+static int vfio_cdx_bm_ctrl(struct vfio_device *core_vdev, u32 flags,
+ void __user *arg, size_t argsz)
+{
+ size_t minsz =
+ offsetofend(struct vfio_device_feature_bus_master, op);
+ struct vfio_cdx_device *vdev =
+ container_of(core_vdev, struct vfio_cdx_device, vdev);
+ struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev);
+ struct vfio_device_feature_bus_master ops;
+ int ret;
+
+ if (!(vdev->flags & BME_SUPPORT))
+ return -ENOTTY;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
+ sizeof(ops));
+ if (ret != 1)
+ return ret;
+
+ if (copy_from_user(&ops, arg, minsz))
+ return -EFAULT;
+
+ switch (ops.op) {
+ case VFIO_DEVICE_FEATURE_CLEAR_MASTER:
+ return cdx_clear_master(cdx_dev);
+ case VFIO_DEVICE_FEATURE_SET_MASTER:
+ return cdx_set_master(cdx_dev);
+ default:
+ return -EINVAL;
+ }
+}
+
+static int vfio_cdx_ioctl_feature(struct vfio_device *device, u32 flags,
+ void __user *arg, size_t argsz)
+{
+ switch (flags & VFIO_DEVICE_FEATURE_MASK) {
+ case VFIO_DEVICE_FEATURE_BUS_MASTER:
+ return vfio_cdx_bm_ctrl(device, flags, arg, argsz);
+ default:
+ return -ENOTTY;
+ }
+}
+
static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev,
struct vfio_device_info __user *arg)
{
@@ -169,6 +223,7 @@ static const struct vfio_device_ops vfio_cdx_ops = {
.open_device = vfio_cdx_open_device,
.close_device = vfio_cdx_close_device,
.ioctl = vfio_cdx_ioctl,
+ .device_feature = vfio_cdx_ioctl_feature,
.mmap = vfio_cdx_mmap,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
@@ -231,3 +286,4 @@ module_driver(vfio_cdx_driver, cdx_driver_register, cdx_driver_unregister);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("VFIO for CDX devices - User Level meta-driver");
+MODULE_IMPORT_NS(CDX_BUS);
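
The new device_feature hook above is reached through the generic VFIO_DEVICE_FEATURE ioctl. A minimal userspace sketch, not part of this commit, of how a caller might toggle bus mastering on an already-opened CDX device fd; it assumes the matching VFIO_DEVICE_FEATURE_BUS_MASTER uAPI definitions are available in <linux/vfio.h>:

#include <stdbool.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical helper, illustration only: enable/disable bus mastering. */
static int cdx_toggle_bus_master(int device_fd, bool enable)
{
        uint64_t buf[(sizeof(struct vfio_device_feature) +
                      sizeof(struct vfio_device_feature_bus_master) + 7) / 8] = {};
        struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
        struct vfio_device_feature_bus_master *bm =
                (struct vfio_device_feature_bus_master *)feature->data;

        feature->argsz = sizeof(*feature) + sizeof(*bm);
        feature->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_BUS_MASTER;
        bm->op = enable ? VFIO_DEVICE_FEATURE_SET_MASTER :
                          VFIO_DEVICE_FEATURE_CLEAR_MASTER;

        /* Returns 0 on success, -1 with errno set by the kernel otherwise. */
        return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}
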
diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h
index 8bdc117ea8..8e9d259137 100644
--- a/drivers/vfio/cdx/private.h
+++ b/drivers/vfio/cdx/private.h
@@ -23,6 +23,8 @@ struct vfio_cdx_region {
struct vfio_cdx_device {
struct vfio_device vdev;
struct vfio_cdx_region *regions;
+ u32 flags;
+#define BME_SUPPORT BIT(0)
};
#endif /* VFIO_CDX_PRIVATE_H */
diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c
deleted file mode 100644
index 0848f920ef..0000000000
--- a/drivers/vfio/iova_bitmap.c
+++ /dev/null
@@ -1,423 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2022, Oracle and/or its affiliates.
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
- */
-#include <linux/iova_bitmap.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-
-#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)
-
-/*
- * struct iova_bitmap_map - A bitmap representing an IOVA range
- *
- * Main data structure for tracking mapped user pages of bitmap data.
- *
- * For example, for something recording dirty IOVAs, it will be provided a
- * struct iova_bitmap structure, as a general structure for iterating the
- * total IOVA range. The struct iova_bitmap_map, though, represents the
- * subset of said IOVA space that is pinned by its parent structure (struct
- * iova_bitmap).
- *
- * The user does not need to exact location of the bits in the bitmap.
- * From user perspective the only API available is iova_bitmap_set() which
- * records the IOVA *range* in the bitmap by setting the corresponding
- * bits.
- *
- * The bitmap is an array of u64 whereas each bit represents an IOVA of
- * range of (1 << pgshift). Thus formula for the bitmap data to be set is:
- *
- * data[(iova / page_size) / 64] & (1ULL << (iova % 64))
- */
-struct iova_bitmap_map {
- /* base IOVA representing bit 0 of the first page */
- unsigned long iova;
-
- /* page size order that each bit granules to */
- unsigned long pgshift;
-
- /* page offset of the first user page pinned */
- unsigned long pgoff;
-
- /* number of pages pinned */
- unsigned long npages;
-
- /* pinned pages representing the bitmap data */
- struct page **pages;
-};
-
-/*
- * struct iova_bitmap - The IOVA bitmap object
- *
- * Main data structure for iterating over the bitmap data.
- *
- * Abstracts the pinning work and iterates in IOVA ranges.
- * It uses a windowing scheme and pins the bitmap in relatively
- * big ranges e.g.
- *
- * The bitmap object uses one base page to store all the pinned pages
- * pointers related to the bitmap. For sizeof(struct page*) == 8 it stores
- * 512 struct page pointers which, if the base page size is 4K, it means
- * 2M of bitmap data is pinned at a time. If the iova_bitmap page size is
- * also 4K then the range window to iterate is 64G.
- *
- * For example iterating on a total IOVA range of 4G..128G, it will walk
- * through this set of ranges:
- *
- * 4G - 68G-1 (64G)
- * 68G - 128G-1 (64G)
- *
- * An example of the APIs on how to use/iterate over the IOVA bitmap:
- *
- * bitmap = iova_bitmap_alloc(iova, length, page_size, data);
- * if (IS_ERR(bitmap))
- * return PTR_ERR(bitmap);
- *
- * ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn);
- *
- * iova_bitmap_free(bitmap);
- *
- * Each iteration of the @dirty_reporter_fn is called with a unique @iova
- * and @length argument, indicating the current range available through the
- * iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty
- * areas (@iova_length) within that provided range, as following:
- *
- * iova_bitmap_set(bitmap, iova, iova_length);
- *
- * The internals of the object uses an index @mapped_base_index that indexes
- * which u64 word of the bitmap is mapped, up to @mapped_total_index.
- * Those keep being incremented until @mapped_total_index is reached while
- * mapping up to PAGE_SIZE / sizeof(struct page*) maximum of pages.
- *
- * The IOVA bitmap is usually located on what tracks DMA mapped ranges or
- * some form of IOVA range tracking that co-relates to the user passed
- * bitmap.
- */
-struct iova_bitmap {
- /* IOVA range representing the currently mapped bitmap data */
- struct iova_bitmap_map mapped;
-
- /* userspace address of the bitmap */
- u64 __user *bitmap;
-
- /* u64 index that @mapped points to */
- unsigned long mapped_base_index;
-
- /* how many u64 can we walk in total */
- unsigned long mapped_total_index;
-
- /* base IOVA of the whole bitmap */
- unsigned long iova;
-
- /* length of the IOVA range for the whole bitmap */
- size_t length;
-};
-
-/*
- * Converts a relative IOVA to a bitmap index.
- * This function provides the index into the u64 array (bitmap::bitmap)
- * for a given IOVA offset.
- * Relative IOVA means relative to the bitmap::mapped base IOVA
- * (stored in mapped::iova). All computations in this file are done using
- * relative IOVAs and thus avoid an extra subtraction against mapped::iova.
- * The user API iova_bitmap_set() always uses a regular absolute IOVAs.
- */
-static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap,
- unsigned long iova)
-{
- unsigned long pgsize = 1 << bitmap->mapped.pgshift;
-
- return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize);
-}
-
-/*
- * Converts a bitmap index to a *relative* IOVA.
- */
-static unsigned long iova_bitmap_index_to_offset(struct iova_bitmap *bitmap,
- unsigned long index)
-{
- unsigned long pgshift = bitmap->mapped.pgshift;
-
- return (index * BITS_PER_TYPE(*bitmap->bitmap)) << pgshift;
-}
-
-/*
- * Returns the base IOVA of the mapped range.
- */
-static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap)
-{
- unsigned long skip = bitmap->mapped_base_index;
-
- return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip);
-}
-
-/*
- * Pins the bitmap user pages for the current range window.
- * This is internal to IOVA bitmap and called when advancing the
- * index (@mapped_base_index) or allocating the bitmap.
- */
-static int iova_bitmap_get(struct iova_bitmap *bitmap)
-{
- struct iova_bitmap_map *mapped = &bitmap->mapped;
- unsigned long npages;
- u64 __user *addr;
- long ret;
-
- /*
- * @mapped_base_index is the index of the currently mapped u64 words
- * that we have access. Anything before @mapped_base_index is not
- * mapped. The range @mapped_base_index .. @mapped_total_index-1 is
- * mapped but capped at a maximum number of pages.
- */
- npages = DIV_ROUND_UP((bitmap->mapped_total_index -
- bitmap->mapped_base_index) *
- sizeof(*bitmap->bitmap), PAGE_SIZE);
-
- /*
- * We always cap at max number of 'struct page' a base page can fit.
- * This is, for example, on x86 means 2M of bitmap data max.
- */
- npages = min(npages, PAGE_SIZE / sizeof(struct page *));
-
- /*
- * Bitmap address to be pinned is calculated via pointer arithmetic
- * with bitmap u64 word index.
- */
- addr = bitmap->bitmap + bitmap->mapped_base_index;
-
- ret = pin_user_pages_fast((unsigned long)addr, npages,
- FOLL_WRITE, mapped->pages);
- if (ret <= 0)
- return -EFAULT;
-
- mapped->npages = (unsigned long)ret;
- /* Base IOVA where @pages point to i.e. bit 0 of the first page */
- mapped->iova = iova_bitmap_mapped_iova(bitmap);
-
- /*
- * offset of the page where pinned pages bit 0 is located.
- * This handles the case where the bitmap is not PAGE_SIZE
- * aligned.
- */
- mapped->pgoff = offset_in_page(addr);
- return 0;
-}
-
-/*
- * Unpins the bitmap user pages and clears @npages
- * (un)pinning is abstracted from API user and it's done when advancing
- * the index or freeing the bitmap.
- */
-static void iova_bitmap_put(struct iova_bitmap *bitmap)
-{
- struct iova_bitmap_map *mapped = &bitmap->mapped;
-
- if (mapped->npages) {
- unpin_user_pages(mapped->pages, mapped->npages);
- mapped->npages = 0;
- }
-}
-
-/**
- * iova_bitmap_alloc() - Allocates an IOVA bitmap object
- * @iova: Start address of the IOVA range
- * @length: Length of the IOVA range
- * @page_size: Page size of the IOVA bitmap. It defines what each bit
- * granularity represents
- * @data: Userspace address of the bitmap
- *
- * Allocates an IOVA object and initializes all its fields including the
- * first user pages of @data.
- *
- * Return: A pointer to a newly allocated struct iova_bitmap
- * or ERR_PTR() on error.
- */
-struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
- unsigned long page_size, u64 __user *data)
-{
- struct iova_bitmap_map *mapped;
- struct iova_bitmap *bitmap;
- int rc;
-
- bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
- if (!bitmap)
- return ERR_PTR(-ENOMEM);
-
- mapped = &bitmap->mapped;
- mapped->pgshift = __ffs(page_size);
- bitmap->bitmap = data;
- bitmap->mapped_total_index =
- iova_bitmap_offset_to_index(bitmap, length - 1) + 1;
- bitmap->iova = iova;
- bitmap->length = length;
- mapped->iova = iova;
- mapped->pages = (struct page **)__get_free_page(GFP_KERNEL);
- if (!mapped->pages) {
- rc = -ENOMEM;
- goto err;
- }
-
- rc = iova_bitmap_get(bitmap);
- if (rc)
- goto err;
- return bitmap;
-
-err:
- iova_bitmap_free(bitmap);
- return ERR_PTR(rc);
-}
-
-/**
- * iova_bitmap_free() - Frees an IOVA bitmap object
- * @bitmap: IOVA bitmap to free
- *
- * It unpins and releases pages array memory and clears any leftover
- * state.
- */
-void iova_bitmap_free(struct iova_bitmap *bitmap)
-{
- struct iova_bitmap_map *mapped = &bitmap->mapped;
-
- iova_bitmap_put(bitmap);
-
- if (mapped->pages) {
- free_page((unsigned long)mapped->pages);
- mapped->pages = NULL;
- }
-
- kfree(bitmap);
-}
-
-/*
- * Returns the remaining bitmap indexes from mapped_total_index to process for
- * the currently pinned bitmap pages.
- */
-static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap)
-{
- unsigned long remaining, bytes;
-
- bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff;
-
- remaining = bitmap->mapped_total_index - bitmap->mapped_base_index;
- remaining = min_t(unsigned long, remaining,
- bytes / sizeof(*bitmap->bitmap));
-
- return remaining;
-}
-
-/*
- * Returns the length of the mapped IOVA range.
- */
-static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap)
-{
- unsigned long max_iova = bitmap->iova + bitmap->length - 1;
- unsigned long iova = iova_bitmap_mapped_iova(bitmap);
- unsigned long remaining;
-
- /*
- * iova_bitmap_mapped_remaining() returns a number of indexes which
- * when converted to IOVA gives us a max length that the bitmap
- * pinned data can cover. Afterwards, that is capped to
- * only cover the IOVA range in @bitmap::iova .. @bitmap::length.
- */
- remaining = iova_bitmap_index_to_offset(bitmap,
- iova_bitmap_mapped_remaining(bitmap));
-
- if (iova + remaining - 1 > max_iova)
- remaining -= ((iova + remaining - 1) - max_iova);
-
- return remaining;
-}
-
-/*
- * Returns true if there's not more data to iterate.
- */
-static bool iova_bitmap_done(struct iova_bitmap *bitmap)
-{
- return bitmap->mapped_base_index >= bitmap->mapped_total_index;
-}
-
-/*
- * Advances to the next range, releases the current pinned
- * pages and pins the next set of bitmap pages.
- * Returns 0 on success or otherwise errno.
- */
-static int iova_bitmap_advance(struct iova_bitmap *bitmap)
-{
- unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1;
- unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1;
-
- bitmap->mapped_base_index += count;
-
- iova_bitmap_put(bitmap);
- if (iova_bitmap_done(bitmap))
- return 0;
-
- /* When advancing the index we pin the next set of bitmap pages */
- return iova_bitmap_get(bitmap);
-}
-
-/**
- * iova_bitmap_for_each() - Iterates over the bitmap
- * @bitmap: IOVA bitmap to iterate
- * @opaque: Additional argument to pass to the callback
- * @fn: Function that gets called for each IOVA range
- *
- * Helper function to iterate over bitmap data representing a portion of IOVA
- * space. It hides the complexity of iterating bitmaps and translating the
- * mapped bitmap user pages into IOVA ranges to process.
- *
- * Return: 0 on success, and an error on failure either upon
- * iteration or when the callback returns an error.
- */
-int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
- iova_bitmap_fn_t fn)
-{
- int ret = 0;
-
- for (; !iova_bitmap_done(bitmap) && !ret;
- ret = iova_bitmap_advance(bitmap)) {
- ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap),
- iova_bitmap_mapped_length(bitmap), opaque);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-/**
- * iova_bitmap_set() - Records an IOVA range in bitmap
- * @bitmap: IOVA bitmap
- * @iova: IOVA to start
- * @length: IOVA range length
- *
- * Set the bits corresponding to the range [iova .. iova+length-1] in
- * the user bitmap.
- *
- */
-void iova_bitmap_set(struct iova_bitmap *bitmap,
- unsigned long iova, size_t length)
-{
- struct iova_bitmap_map *mapped = &bitmap->mapped;
- unsigned long cur_bit = ((iova - mapped->iova) >>
- mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
- unsigned long last_bit = (((iova + length - 1) - mapped->iova) >>
- mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
-
- do {
- unsigned int page_idx = cur_bit / BITS_PER_PAGE;
- unsigned int offset = cur_bit % BITS_PER_PAGE;
- unsigned int nbits = min(BITS_PER_PAGE - offset,
- last_bit - cur_bit + 1);
- void *kaddr;
-
- kaddr = kmap_local_page(mapped->pages[page_idx]);
- bitmap_set(kaddr, offset, nbits);
- kunmap_local(kaddr);
- cur_bit += nbits;
- } while (cur_bit <= last_bit);
-}
-EXPORT_SYMBOL_GPL(iova_bitmap_set);
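
iova_bitmap.c is removed from drivers/vfio rather than dropped outright: together with the Makefile change and the select IOMMUFD_DRIVER / MODULE_IMPORT_NS(IOMMUFD) additions further down, this reflects the helpers moving under the IOMMUFD driver code and into the IOMMUFD symbol namespace. For reference, a minimal sketch, assuming the prototypes documented in the removed comments stay the same after the move, of a reporter callback as consumed by iova_bitmap_for_each():

#include <linux/iova_bitmap.h>
#include <linux/minmax.h>
#include <linux/mm.h>

/* Illustrative only: mark the first page of each mapped window as dirty. */
static int demo_dirty_reporter(struct iova_bitmap *bitmap, unsigned long iova,
                               size_t length, void *opaque)
{
        iova_bitmap_set(bitmap, iova, min_t(size_t, length, PAGE_SIZE));
        return 0;
}

/* A driver would wire it up roughly as the removed comment block shows:
 *
 *      bitmap = iova_bitmap_alloc(iova, length, page_size, user_data);
 *      if (IS_ERR(bitmap))
 *              return PTR_ERR(bitmap);
 *      ret = iova_bitmap_for_each(bitmap, NULL, demo_dirty_reporter);
 *      iova_bitmap_free(bitmap);
 */
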
diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig
index 7088edc4fb..c3ced56b77 100644
--- a/drivers/vfio/pci/mlx5/Kconfig
+++ b/drivers/vfio/pci/mlx5/Kconfig
@@ -3,6 +3,7 @@ config MLX5_VFIO_PCI
tristate "VFIO support for MLX5 PCI devices"
depends on MLX5_CORE
select VFIO_PCI_CORE
+ select IOMMUFD_DRIVER
help
This provides migration support for MLX5 devices using the VFIO
framework.
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 33574b0447..efd1d252cd 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -86,7 +86,8 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
}
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
- size_t *state_size, u8 query_flags)
+ size_t *state_size, u64 *total_size,
+ u8 query_flags)
{
u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
@@ -128,6 +129,7 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
MLX5_SET(query_vhca_migration_state_in, in, incremental,
query_flags & MLX5VF_QUERY_INC);
+ MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);
ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
out);
@@ -139,6 +141,11 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
*state_size = MLX5_GET(query_vhca_migration_state_out, out,
required_umem_size);
+ if (total_size)
+ *total_size = mvdev->chunk_mode ?
+ MLX5_GET64(query_vhca_migration_state_out, out,
+ remaining_total_size) : *state_size;
+
return 0;
}
@@ -254,6 +261,9 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
mvdev->core_device.vdev.migration_flags |=
VFIO_MIGRATION_PRE_COPY;
+ if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
+ mvdev->chunk_mode = 1;
+
end:
mlx5_vf_put_core_dev(mvdev->mdev);
}
@@ -428,6 +438,7 @@ end:
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
spin_lock_irq(&buf->migf->list_lock);
+ buf->stop_copy_chunk_num = 0;
list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
spin_unlock_irq(&buf->migf->list_lock);
}
@@ -475,6 +486,15 @@ found:
return buf;
}
+static void
+mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
+ struct mlx5vf_async_data *async_data)
+{
+ kvfree(async_data->out);
+ complete(&migf->save_comp);
+ fput(migf->filp);
+}
+
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
struct mlx5vf_async_data *async_data = container_of(_work,
@@ -487,16 +507,15 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
mlx5vf_put_data_buffer(async_data->buf);
if (async_data->header_buf)
mlx5vf_put_data_buffer(async_data->header_buf);
- if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
+ if (!async_data->stop_copy_chunk &&
+ async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
else
migf->state = MLX5_MIGF_STATE_ERROR;
wake_up_interruptible(&migf->poll_wait);
}
mutex_unlock(&migf->lock);
- kvfree(async_data->out);
- complete(&migf->save_comp);
- fput(migf->filp);
+ mlx5vf_save_callback_complete(migf, async_data);
}
static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
@@ -536,13 +555,20 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
struct mlx5_vf_migration_file, async_data);
if (!status) {
+ size_t next_required_umem_size = 0;
+ bool stop_copy_last_chunk;
size_t image_size;
unsigned long flags;
bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
- !async_data->last_chunk;
+ !async_data->stop_copy_chunk;
image_size = MLX5_GET(save_vhca_state_out, async_data->out,
actual_image_size);
+ if (async_data->buf->stop_copy_chunk_num)
+ next_required_umem_size = MLX5_GET(save_vhca_state_out,
+ async_data->out, next_required_umem_size);
+ stop_copy_last_chunk = async_data->stop_copy_chunk &&
+ !next_required_umem_size;
if (async_data->header_buf) {
status = add_buf_header(async_data->header_buf, image_size,
initial_pre_copy);
@@ -554,19 +580,34 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
migf->max_pos += async_data->buf->length;
spin_lock_irqsave(&migf->list_lock, flags);
list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
+ if (async_data->buf->stop_copy_chunk_num) {
+ migf->num_ready_chunks++;
+ if (next_required_umem_size &&
+ migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
+ /* Delay the next SAVE till one chunk be consumed */
+ migf->next_required_umem_size = next_required_umem_size;
+ next_required_umem_size = 0;
+ }
+ }
spin_unlock_irqrestore(&migf->list_lock, flags);
- if (initial_pre_copy)
+ if (initial_pre_copy) {
migf->pre_copy_initial_bytes += image_size;
- migf->state = async_data->last_chunk ?
- MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
+ migf->state = MLX5_MIGF_STATE_PRE_COPY;
+ }
+ if (stop_copy_last_chunk)
+ migf->state = MLX5_MIGF_STATE_COMPLETE;
wake_up_interruptible(&migf->poll_wait);
+ if (next_required_umem_size)
+ mlx5vf_mig_file_set_save_work(migf,
+ /* Picking up the next chunk num */
+ (async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
+ next_required_umem_size);
+ mlx5vf_save_callback_complete(migf, async_data);
+ return;
}
err:
- /*
- * The error and the cleanup flows can't run from an
- * interrupt context
- */
+ /* The error flow can't run from an interrupt context */
if (status == -EREMOTEIO)
status = MLX5_GET(save_vhca_state_out, async_data->out, status);
async_data->status = status;
@@ -610,7 +651,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
async_data = &migf->async_data;
async_data->buf = buf;
- async_data->last_chunk = !track;
+ async_data->stop_copy_chunk = !track;
async_data->out = kvzalloc(out_size, GFP_KERNEL);
if (!async_data->out) {
err = -ENOMEM;
@@ -618,10 +659,15 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
}
if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
- if (async_data->last_chunk && migf->buf_header) {
- header_buf = migf->buf_header;
- migf->buf_header = NULL;
- } else {
+ if (async_data->stop_copy_chunk) {
+ u8 header_idx = buf->stop_copy_chunk_num ?
+ buf->stop_copy_chunk_num - 1 : 0;
+
+ header_buf = migf->buf_header[header_idx];
+ migf->buf_header[header_idx] = NULL;
+ }
+
+ if (!header_buf) {
header_buf = mlx5vf_get_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
if (IS_ERR(header_buf)) {
@@ -631,8 +677,8 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
}
}
- if (async_data->last_chunk)
- migf->state = MLX5_MIGF_STATE_SAVE_LAST;
+ if (async_data->stop_copy_chunk)
+ migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;
async_data->header_buf = header_buf;
get_file(migf->filp);
@@ -707,18 +753,21 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{
struct mlx5_vhca_data_buffer *entry;
+ int i;
lockdep_assert_held(&migf->mvdev->state_mutex);
WARN_ON(migf->mvdev->mdev_detach);
- if (migf->buf) {
- mlx5vf_free_data_buffer(migf->buf);
- migf->buf = NULL;
- }
+ for (i = 0; i < MAX_NUM_CHUNKS; i++) {
+ if (migf->buf[i]) {
+ mlx5vf_free_data_buffer(migf->buf[i]);
+ migf->buf[i] = NULL;
+ }
- if (migf->buf_header) {
- mlx5vf_free_data_buffer(migf->buf_header);
- migf->buf_header = NULL;
+ if (migf->buf_header[i]) {
+ mlx5vf_free_data_buffer(migf->buf_header[i]);
+ migf->buf_header[i] = NULL;
+ }
}
list_splice(&migf->avail_list, &migf->buf_list);
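
A note on the flow the cmd.c changes above implement: in chunk mode the save completion callback reads next_required_umem_size from the SAVE output and either queues the next chunk immediately or, when both pre-allocated chunk slots already hold ready data, parks the size in migf->next_required_umem_size so the read path can restart the SAVE once userspace consumes a chunk. A simplified sketch of that hand-off decision (locking omitted, names as in the patch):

static void example_schedule_next_chunk(struct mlx5_vf_migration_file *migf,
                                        u8 done_chunk_num,
                                        size_t next_required_umem_size)
{
        if (!next_required_umem_size)
                return; /* device reported nothing further to save */

        if (migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
                /* Both slots hold ready data: defer until one is consumed. */
                migf->next_required_umem_size = next_required_umem_size;
                return;
        }

        /* Ping-pong between the two slots: chunk 1 -> 2 -> 1 -> ... */
        mlx5vf_mig_file_set_save_work(migf,
                                      (done_chunk_num % MAX_NUM_CHUNKS) + 1,
                                      next_required_umem_size);
}
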
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index aec4c69dd6..f2c7227fa6 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -20,7 +20,7 @@ enum mlx5_vf_migf_state {
MLX5_MIGF_STATE_ERROR = 1,
MLX5_MIGF_STATE_PRE_COPY_ERROR,
MLX5_MIGF_STATE_PRE_COPY,
- MLX5_MIGF_STATE_SAVE_LAST,
+ MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK,
MLX5_MIGF_STATE_COMPLETE,
};
@@ -64,6 +64,7 @@ struct mlx5_vhca_data_buffer {
u32 mkey;
enum dma_data_direction dma_dir;
u8 dmaed:1;
+ u8 stop_copy_chunk_num;
struct list_head buf_elm;
struct mlx5_vf_migration_file *migf;
/* Optimize mlx5vf_get_migration_page() for sequential access */
@@ -78,10 +79,19 @@ struct mlx5vf_async_data {
struct mlx5_vhca_data_buffer *buf;
struct mlx5_vhca_data_buffer *header_buf;
int status;
- u8 last_chunk:1;
+ u8 stop_copy_chunk:1;
void *out;
};
+struct mlx5vf_save_work_data {
+ struct mlx5_vf_migration_file *migf;
+ size_t next_required_umem_size;
+ struct work_struct work;
+ u8 chunk_num;
+};
+
+#define MAX_NUM_CHUNKS 2
+
struct mlx5_vf_migration_file {
struct file *filp;
struct mutex lock;
@@ -94,8 +104,12 @@ struct mlx5_vf_migration_file {
u32 record_tag;
u64 stop_copy_prep_size;
u64 pre_copy_initial_bytes;
- struct mlx5_vhca_data_buffer *buf;
- struct mlx5_vhca_data_buffer *buf_header;
+ size_t next_required_umem_size;
+ u8 num_ready_chunks;
+ /* Upon chunk mode preserve another set of buffers for stop_copy phase */
+ struct mlx5_vhca_data_buffer *buf[MAX_NUM_CHUNKS];
+ struct mlx5_vhca_data_buffer *buf_header[MAX_NUM_CHUNKS];
+ struct mlx5vf_save_work_data save_data[MAX_NUM_CHUNKS];
spinlock_t list_lock;
struct list_head buf_list;
struct list_head avail_list;
@@ -164,6 +178,7 @@ struct mlx5vf_pci_core_device {
u8 deferred_reset:1;
u8 mdev_detach:1;
u8 log_active:1;
+ u8 chunk_mode:1;
struct completion tracker_comp;
/* protect migration state */
struct mutex state_mutex;
@@ -186,7 +201,8 @@ enum {
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
- size_t *state_size, u8 query_flags);
+ size_t *state_size, u64 *total_size,
+ u8 query_flags);
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
const struct vfio_migration_ops *mig_ops,
const struct vfio_log_ops *log_ops);
@@ -217,6 +233,8 @@ struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
+void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
+ u8 chunk_num, size_t next_required_umem_size);
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
struct rb_root_cached *ranges, u32 nnodes, u64 *page_size);
int mlx5vf_stop_page_tracker(struct vfio_device *vdev);
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 42ec574a86..fe09a8c8af 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -24,6 +24,8 @@
/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
+#define MAX_CHUNK_SIZE SZ_8M
+
static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
@@ -158,6 +160,41 @@ end:
return found ? buf : NULL;
}
+static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
+{
+ struct mlx5_vf_migration_file *migf = vhca_buf->migf;
+
+ if (vhca_buf->stop_copy_chunk_num) {
+ bool is_header = vhca_buf->dma_dir == DMA_NONE;
+ u8 chunk_num = vhca_buf->stop_copy_chunk_num;
+ size_t next_required_umem_size = 0;
+
+ if (is_header)
+ migf->buf_header[chunk_num - 1] = vhca_buf;
+ else
+ migf->buf[chunk_num - 1] = vhca_buf;
+
+ spin_lock_irq(&migf->list_lock);
+ list_del_init(&vhca_buf->buf_elm);
+ if (!is_header) {
+ next_required_umem_size =
+ migf->next_required_umem_size;
+ migf->next_required_umem_size = 0;
+ migf->num_ready_chunks--;
+ }
+ spin_unlock_irq(&migf->list_lock);
+ if (next_required_umem_size)
+ mlx5vf_mig_file_set_save_work(migf, chunk_num,
+ next_required_umem_size);
+ return;
+ }
+
+ spin_lock_irq(&migf->list_lock);
+ list_del_init(&vhca_buf->buf_elm);
+ list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
+ spin_unlock_irq(&migf->list_lock);
+}
+
static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
char __user **buf, size_t *len, loff_t *pos)
{
@@ -193,12 +230,8 @@ static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
copy_len -= page_len;
}
- if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
- spin_lock_irq(&vhca_buf->migf->list_lock);
- list_del_init(&vhca_buf->buf_elm);
- list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
- spin_unlock_irq(&vhca_buf->migf->list_lock);
- }
+ if (*pos >= vhca_buf->start_pos + vhca_buf->length)
+ mlx5vf_buf_read_done(vhca_buf);
return done;
}
@@ -304,7 +337,75 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
wake_up_interruptible(&migf->poll_wait);
}
-static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
+void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
+ u8 chunk_num, size_t next_required_umem_size)
+{
+ migf->save_data[chunk_num - 1].next_required_umem_size =
+ next_required_umem_size;
+ migf->save_data[chunk_num - 1].migf = migf;
+ get_file(migf->filp);
+ queue_work(migf->mvdev->cb_wq,
+ &migf->save_data[chunk_num - 1].work);
+}
+
+static struct mlx5_vhca_data_buffer *
+mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
+ u8 index, size_t required_length)
+{
+ struct mlx5_vhca_data_buffer *buf = migf->buf[index];
+ u8 chunk_num;
+
+ WARN_ON(!buf);
+ chunk_num = buf->stop_copy_chunk_num;
+ buf->migf->buf[index] = NULL;
+ /* Checking whether the pre-allocated buffer can fit */
+ if (buf->allocated_length >= required_length)
+ return buf;
+
+ mlx5vf_put_data_buffer(buf);
+ buf = mlx5vf_get_data_buffer(buf->migf, required_length,
+ DMA_FROM_DEVICE);
+ if (IS_ERR(buf))
+ return buf;
+
+ buf->stop_copy_chunk_num = chunk_num;
+ return buf;
+}
+
+static void mlx5vf_mig_file_save_work(struct work_struct *_work)
+{
+ struct mlx5vf_save_work_data *save_data = container_of(_work,
+ struct mlx5vf_save_work_data, work);
+ struct mlx5_vf_migration_file *migf = save_data->migf;
+ struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+ struct mlx5_vhca_data_buffer *buf;
+
+ mutex_lock(&mvdev->state_mutex);
+ if (migf->state == MLX5_MIGF_STATE_ERROR)
+ goto end;
+
+ buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
+ save_data->chunk_num - 1,
+ save_data->next_required_umem_size);
+ if (IS_ERR(buf))
+ goto err;
+
+ if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
+ goto err_save;
+
+ goto end;
+
+err_save:
+ mlx5vf_put_data_buffer(buf);
+err:
+ mlx5vf_mark_err(migf);
+end:
+ mlx5vf_state_mutex_unlock(mvdev);
+ fput(migf->filp);
+}
+
+static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
+ bool track)
{
size_t size = sizeof(struct mlx5_vf_migration_header) +
sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
@@ -331,7 +432,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
to_buff = kmap_local_page(page);
memcpy(to_buff, &header, sizeof(header));
header_buf->length = sizeof(header);
- data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
+ data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
memcpy(to_buff + sizeof(header), &data, sizeof(data));
header_buf->length += sizeof(data);
kunmap_local(to_buff);
@@ -340,48 +441,86 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
spin_lock_irqsave(&migf->list_lock, flags);
list_add_tail(&header_buf->buf_elm, &migf->buf_list);
spin_unlock_irqrestore(&migf->list_lock, flags);
- migf->pre_copy_initial_bytes = size;
+ if (track)
+ migf->pre_copy_initial_bytes = size;
return 0;
err:
mlx5vf_put_data_buffer(header_buf);
return ret;
}
-static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
- size_t state_size)
+static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
+ struct mlx5_vf_migration_file *migf,
+ size_t state_size, u64 full_size,
+ bool track)
{
struct mlx5_vhca_data_buffer *buf;
size_t inc_state_size;
+ int num_chunks;
int ret;
+ int i;
- /* let's be ready for stop_copy size that might grow by 10 percents */
- if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
- inc_state_size = state_size;
+ if (mvdev->chunk_mode) {
+ size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
- buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
+ /* from firmware perspective at least 'state_size' buffer should be set */
+ inc_state_size = max(state_size, chunk_size);
+ } else {
+ if (track) {
+ /* let's be ready for stop_copy size that might grow by 10 percents */
+ if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
+ inc_state_size = state_size;
+ } else {
+ inc_state_size = state_size;
+ }
+ }
- migf->buf = buf;
- buf = mlx5vf_get_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
- if (IS_ERR(buf)) {
- ret = PTR_ERR(buf);
- goto err;
+ /* let's not overflow the device specification max SAVE size */
+ inc_state_size = min_t(size_t, inc_state_size,
+ (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
+
+ num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
+ for (i = 0; i < num_chunks; i++) {
+ buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto err;
+ }
+
+ migf->buf[i] = buf;
+ buf = mlx5vf_get_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto err;
+ }
+ migf->buf_header[i] = buf;
+ if (mvdev->chunk_mode) {
+ migf->buf[i]->stop_copy_chunk_num = i + 1;
+ migf->buf_header[i]->stop_copy_chunk_num = i + 1;
+ INIT_WORK(&migf->save_data[i].work,
+ mlx5vf_mig_file_save_work);
+ migf->save_data[i].chunk_num = i + 1;
+ }
}
- migf->buf_header = buf;
- ret = mlx5vf_add_stop_copy_header(migf);
+ ret = mlx5vf_add_stop_copy_header(migf, track);
if (ret)
- goto err_header;
+ goto err;
return 0;
-err_header:
- mlx5vf_put_data_buffer(migf->buf_header);
- migf->buf_header = NULL;
err:
- mlx5vf_put_data_buffer(migf->buf);
- migf->buf = NULL;
+ for (i = 0; i < num_chunks; i++) {
+ if (migf->buf[i]) {
+ mlx5vf_put_data_buffer(migf->buf[i]);
+ migf->buf[i] = NULL;
+ }
+ if (migf->buf_header[i]) {
+ mlx5vf_put_data_buffer(migf->buf_header[i]);
+ migf->buf_header[i] = NULL;
+ }
+ }
+
return ret;
}
@@ -428,7 +567,7 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
* As so, the other code below is safe with the proper locks.
*/
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
- MLX5VF_QUERY_INC);
+ NULL, MLX5VF_QUERY_INC);
if (ret)
goto err_state_unlock;
}
@@ -505,21 +644,15 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
if (migf->state == MLX5_MIGF_STATE_ERROR)
return -ENODEV;
- ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
if (ret)
goto err;
- /* Checking whether we have a matching pre-allocated buffer that can fit */
- if (migf->buf && migf->buf->allocated_length >= length) {
- buf = migf->buf;
- migf->buf = NULL;
- } else {
- buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
- if (IS_ERR(buf)) {
- ret = PTR_ERR(buf);
- goto err;
- }
+ buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto err;
}
ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
@@ -541,6 +674,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
struct mlx5_vf_migration_file *migf;
struct mlx5_vhca_data_buffer *buf;
size_t length;
+ u64 full_size;
int ret;
migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
@@ -574,20 +708,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
INIT_LIST_HEAD(&migf->buf_list);
INIT_LIST_HEAD(&migf->avail_list);
spin_lock_init(&migf->list_lock);
- ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
+ if (ret)
+ goto out_pd;
+
+ ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
if (ret)
goto out_pd;
if (track) {
- ret = mlx5vf_prep_stop_copy(migf, length);
- if (ret)
+ /* leave the allocated buffer ready for the stop-copy phase */
+ buf = mlx5vf_alloc_data_buffer(migf,
+ migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
goto out_pd;
- }
-
- buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
- if (IS_ERR(buf)) {
- ret = PTR_ERR(buf);
- goto out_pd;
+ }
+ } else {
+ buf = migf->buf[0];
+ migf->buf[0] = NULL;
}
ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
@@ -820,8 +959,8 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
size_t len, loff_t *pos)
{
struct mlx5_vf_migration_file *migf = filp->private_data;
- struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
- struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
+ struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
+ struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
loff_t requested_length;
bool has_work = false;
ssize_t done = 0;
@@ -856,15 +995,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
if (vhca_buf_header->allocated_length < migf->record_size) {
mlx5vf_free_data_buffer(vhca_buf_header);
- migf->buf_header = mlx5vf_alloc_data_buffer(migf,
+ migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
migf->record_size, DMA_NONE);
- if (IS_ERR(migf->buf_header)) {
- ret = PTR_ERR(migf->buf_header);
- migf->buf_header = NULL;
+ if (IS_ERR(migf->buf_header[0])) {
+ ret = PTR_ERR(migf->buf_header[0]);
+ migf->buf_header[0] = NULL;
goto out_unlock;
}
- vhca_buf_header = migf->buf_header;
+ vhca_buf_header = migf->buf_header[0];
}
vhca_buf_header->start_pos = migf->max_pos;
@@ -884,15 +1023,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
if (vhca_buf->allocated_length < size) {
mlx5vf_free_data_buffer(vhca_buf);
- migf->buf = mlx5vf_alloc_data_buffer(migf,
+ migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
size, DMA_TO_DEVICE);
- if (IS_ERR(migf->buf)) {
- ret = PTR_ERR(migf->buf);
- migf->buf = NULL;
+ if (IS_ERR(migf->buf[0])) {
+ ret = PTR_ERR(migf->buf[0]);
+ migf->buf[0] = NULL;
goto out_unlock;
}
- vhca_buf = migf->buf;
+ vhca_buf = migf->buf[0];
}
vhca_buf->start_pos = migf->max_pos;
@@ -974,7 +1113,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
goto out_pd;
}
- migf->buf = buf;
+ migf->buf[0] = buf;
if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
buf = mlx5vf_alloc_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
@@ -983,7 +1122,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
goto out_buf;
}
- migf->buf_header = buf;
+ migf->buf_header[0] = buf;
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
} else {
/* Initial state will be to read the image */
@@ -997,7 +1136,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
spin_lock_init(&migf->list_lock);
return migf;
out_buf:
- mlx5vf_free_data_buffer(migf->buf);
+ mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
mlx5vf_cmd_dealloc_pd(migf);
out_free:
@@ -1019,6 +1158,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
cancel_work_sync(&mvdev->saving_migf->async_data.work);
mlx5vf_disable_fd(mvdev->saving_migf);
+ wake_up_interruptible(&mvdev->saving_migf->poll_wait);
mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
fput(mvdev->saving_migf->filp);
mvdev->saving_migf = NULL;
@@ -1100,7 +1240,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
ret = mlx5vf_cmd_load_vhca_state(mvdev,
mvdev->resuming_migf,
- mvdev->resuming_migf->buf);
+ mvdev->resuming_migf->buf[0]);
if (ret)
return ERR_PTR(ret);
}
@@ -1194,13 +1334,14 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
struct mlx5vf_pci_core_device *mvdev = container_of(
vdev, struct mlx5vf_pci_core_device, core_device.vdev);
size_t state_size;
+ u64 total_size;
int ret;
mutex_lock(&mvdev->state_mutex);
- ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
- &state_size, 0);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
+ &total_size, 0);
if (!ret)
- *stop_copy_length = state_size;
+ *stop_copy_length = total_size;
mlx5vf_state_mutex_unlock(mvdev);
return ret;
}
@@ -1376,6 +1517,7 @@ static struct pci_driver mlx5vf_pci_driver = {
module_pci_driver(mlx5vf_pci_driver);
+MODULE_IMPORT_NS(IOMMUFD);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
diff --git a/drivers/vfio/pci/pds/Kconfig b/drivers/vfio/pci/pds/Kconfig
index 6eceef7b02..fec9b167c7 100644
--- a/drivers/vfio/pci/pds/Kconfig
+++ b/drivers/vfio/pci/pds/Kconfig
@@ -5,6 +5,7 @@ config PDS_VFIO_PCI
tristate "VFIO support for PDS PCI devices"
depends on PDS_CORE && PCI_IOV
select VFIO_PCI_CORE
+ select IOMMUFD_DRIVER
help
This provides generic PCI support for PDS devices using the VFIO
framework.
diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
index caffa1a2cf..a34dda5166 100644
--- a/drivers/vfio/pci/pds/pci_drv.c
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -204,6 +204,7 @@ static struct pci_driver pds_vfio_pci_driver = {
module_pci_driver(pds_vfio_pci_driver);
+MODULE_IMPORT_NS(IOMMUFD);
MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION);
MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 40732e8ed4..8d4995ada7 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -946,6 +946,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
unsigned long last;
comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
+
+ /* Empty list */
+ if (WARN_ON_ONCE(!comb_start))
+ return;
+
curr = comb_start;
while (curr) {
last = curr->last;
@@ -975,6 +980,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
prev = curr;
curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
}
+
+ /* Empty list or no nodes to combine */
+ if (WARN_ON_ONCE(min_gap == ULONG_MAX))
+ break;
+
comb_start->last = comb_end->last;
interval_tree_remove(comb_end, root);
cur_nodes--;
@@ -1693,6 +1703,7 @@ static void __exit vfio_cleanup(void)
module_init(vfio_init);
module_exit(vfio_cleanup);
+MODULE_IMPORT_NS(IOMMUFD);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);