From dc50eab76b709d68175a358d6e23a5a3890764d3 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 18 May 2024 19:39:57 +0200 Subject: Merging upstream version 6.7.7. Signed-off-by: Daniel Baumann --- drivers/vfio/Makefile | 3 +- drivers/vfio/cdx/main.c | 58 +++++- drivers/vfio/cdx/private.h | 2 + drivers/vfio/iova_bitmap.c | 423 ----------------------------------------- drivers/vfio/pci/mlx5/Kconfig | 1 + drivers/vfio/pci/mlx5/cmd.c | 103 +++++++--- drivers/vfio/pci/mlx5/cmd.h | 28 ++- drivers/vfio/pci/mlx5/main.c | 284 ++++++++++++++++++++------- drivers/vfio/pci/pds/Kconfig | 1 + drivers/vfio/pci/pds/pci_drv.c | 1 + drivers/vfio/vfio_main.c | 11 ++ 11 files changed, 386 insertions(+), 529 deletions(-) delete mode 100644 drivers/vfio/iova_bitmap.c (limited to 'drivers/vfio') diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index c82ea032d3..68c0570520 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_VFIO) += vfio.o -vfio-y += vfio_main.o \ - iova_bitmap.o +vfio-y += vfio_main.o vfio-$(CONFIG_VFIO_DEVICE_CDEV) += device_cdev.o vfio-$(CONFIG_VFIO_GROUP) += group.o vfio-$(CONFIG_IOMMUFD) += iommufd.o diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index de56686581..9cff8d7578 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -14,7 +14,7 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev) container_of(core_vdev, struct vfio_cdx_device, vdev); struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev); int count = cdx_dev->res_count; - int i; + int i, ret; vdev->regions = kcalloc(count, sizeof(struct vfio_cdx_region), GFP_KERNEL_ACCOUNT); @@ -39,6 +39,17 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev) if (!(cdx_dev->res[i].flags & IORESOURCE_READONLY)) vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_WRITE; } + ret = cdx_dev_reset(core_vdev->dev); + if (ret) { + kfree(vdev->regions); + vdev->regions = NULL; + return ret; + } + ret = cdx_clear_master(cdx_dev); + if (ret) + vdev->flags &= ~BME_SUPPORT; + else + vdev->flags |= BME_SUPPORT; return 0; } @@ -52,6 +63,49 @@ static void vfio_cdx_close_device(struct vfio_device *core_vdev) cdx_dev_reset(core_vdev->dev); } +static int vfio_cdx_bm_ctrl(struct vfio_device *core_vdev, u32 flags, + void __user *arg, size_t argsz) +{ + size_t minsz = + offsetofend(struct vfio_device_feature_bus_master, op); + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev); + struct vfio_device_feature_bus_master ops; + int ret; + + if (!(vdev->flags & BME_SUPPORT)) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, + sizeof(ops)); + if (ret != 1) + return ret; + + if (copy_from_user(&ops, arg, minsz)) + return -EFAULT; + + switch (ops.op) { + case VFIO_DEVICE_FEATURE_CLEAR_MASTER: + return cdx_clear_master(cdx_dev); + case VFIO_DEVICE_FEATURE_SET_MASTER: + return cdx_set_master(cdx_dev); + default: + return -EINVAL; + } +} + +static int vfio_cdx_ioctl_feature(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz) +{ + switch (flags & VFIO_DEVICE_FEATURE_MASK) { + case VFIO_DEVICE_FEATURE_BUS_MASTER: + return vfio_cdx_bm_ctrl(device, flags, arg, argsz); + default: + return -ENOTTY; + } +} + static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, struct vfio_device_info __user *arg) { @@ -169,6 +223,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { 
.open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, + .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, @@ -231,3 +286,4 @@ module_driver(vfio_cdx_driver, cdx_driver_register, cdx_driver_unregister); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("VFIO for CDX devices - User Level meta-driver"); +MODULE_IMPORT_NS(CDX_BUS); diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h index 8bdc117ea8..8e9d259137 100644 --- a/drivers/vfio/cdx/private.h +++ b/drivers/vfio/cdx/private.h @@ -23,6 +23,8 @@ struct vfio_cdx_region { struct vfio_cdx_device { struct vfio_device vdev; struct vfio_cdx_region *regions; + u32 flags; +#define BME_SUPPORT BIT(0) }; #endif /* VFIO_CDX_PRIVATE_H */ diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c deleted file mode 100644 index 0848f920ef..0000000000 --- a/drivers/vfio/iova_bitmap.c +++ /dev/null @@ -1,423 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2022, Oracle and/or its affiliates. - * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved - */ -#include -#include -#include -#include - -#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) - -/* - * struct iova_bitmap_map - A bitmap representing an IOVA range - * - * Main data structure for tracking mapped user pages of bitmap data. - * - * For example, for something recording dirty IOVAs, it will be provided a - * struct iova_bitmap structure, as a general structure for iterating the - * total IOVA range. The struct iova_bitmap_map, though, represents the - * subset of said IOVA space that is pinned by its parent structure (struct - * iova_bitmap). - * - * The user does not need to exact location of the bits in the bitmap. - * From user perspective the only API available is iova_bitmap_set() which - * records the IOVA *range* in the bitmap by setting the corresponding - * bits. - * - * The bitmap is an array of u64 whereas each bit represents an IOVA of - * range of (1 << pgshift). Thus formula for the bitmap data to be set is: - * - * data[(iova / page_size) / 64] & (1ULL << (iova % 64)) - */ -struct iova_bitmap_map { - /* base IOVA representing bit 0 of the first page */ - unsigned long iova; - - /* page size order that each bit granules to */ - unsigned long pgshift; - - /* page offset of the first user page pinned */ - unsigned long pgoff; - - /* number of pages pinned */ - unsigned long npages; - - /* pinned pages representing the bitmap data */ - struct page **pages; -}; - -/* - * struct iova_bitmap - The IOVA bitmap object - * - * Main data structure for iterating over the bitmap data. - * - * Abstracts the pinning work and iterates in IOVA ranges. - * It uses a windowing scheme and pins the bitmap in relatively - * big ranges e.g. - * - * The bitmap object uses one base page to store all the pinned pages - * pointers related to the bitmap. For sizeof(struct page*) == 8 it stores - * 512 struct page pointers which, if the base page size is 4K, it means - * 2M of bitmap data is pinned at a time. If the iova_bitmap page size is - * also 4K then the range window to iterate is 64G. 
- * - * For example iterating on a total IOVA range of 4G..128G, it will walk - * through this set of ranges: - * - * 4G - 68G-1 (64G) - * 68G - 128G-1 (64G) - * - * An example of the APIs on how to use/iterate over the IOVA bitmap: - * - * bitmap = iova_bitmap_alloc(iova, length, page_size, data); - * if (IS_ERR(bitmap)) - * return PTR_ERR(bitmap); - * - * ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn); - * - * iova_bitmap_free(bitmap); - * - * Each iteration of the @dirty_reporter_fn is called with a unique @iova - * and @length argument, indicating the current range available through the - * iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty - * areas (@iova_length) within that provided range, as following: - * - * iova_bitmap_set(bitmap, iova, iova_length); - * - * The internals of the object uses an index @mapped_base_index that indexes - * which u64 word of the bitmap is mapped, up to @mapped_total_index. - * Those keep being incremented until @mapped_total_index is reached while - * mapping up to PAGE_SIZE / sizeof(struct page*) maximum of pages. - * - * The IOVA bitmap is usually located on what tracks DMA mapped ranges or - * some form of IOVA range tracking that co-relates to the user passed - * bitmap. - */ -struct iova_bitmap { - /* IOVA range representing the currently mapped bitmap data */ - struct iova_bitmap_map mapped; - - /* userspace address of the bitmap */ - u64 __user *bitmap; - - /* u64 index that @mapped points to */ - unsigned long mapped_base_index; - - /* how many u64 can we walk in total */ - unsigned long mapped_total_index; - - /* base IOVA of the whole bitmap */ - unsigned long iova; - - /* length of the IOVA range for the whole bitmap */ - size_t length; -}; - -/* - * Converts a relative IOVA to a bitmap index. - * This function provides the index into the u64 array (bitmap::bitmap) - * for a given IOVA offset. - * Relative IOVA means relative to the bitmap::mapped base IOVA - * (stored in mapped::iova). All computations in this file are done using - * relative IOVAs and thus avoid an extra subtraction against mapped::iova. - * The user API iova_bitmap_set() always uses a regular absolute IOVAs. - */ -static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap, - unsigned long iova) -{ - unsigned long pgsize = 1 << bitmap->mapped.pgshift; - - return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize); -} - -/* - * Converts a bitmap index to a *relative* IOVA. - */ -static unsigned long iova_bitmap_index_to_offset(struct iova_bitmap *bitmap, - unsigned long index) -{ - unsigned long pgshift = bitmap->mapped.pgshift; - - return (index * BITS_PER_TYPE(*bitmap->bitmap)) << pgshift; -} - -/* - * Returns the base IOVA of the mapped range. - */ -static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap) -{ - unsigned long skip = bitmap->mapped_base_index; - - return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip); -} - -/* - * Pins the bitmap user pages for the current range window. - * This is internal to IOVA bitmap and called when advancing the - * index (@mapped_base_index) or allocating the bitmap. - */ -static int iova_bitmap_get(struct iova_bitmap *bitmap) -{ - struct iova_bitmap_map *mapped = &bitmap->mapped; - unsigned long npages; - u64 __user *addr; - long ret; - - /* - * @mapped_base_index is the index of the currently mapped u64 words - * that we have access. Anything before @mapped_base_index is not - * mapped. The range @mapped_base_index .. 
@mapped_total_index-1 is - * mapped but capped at a maximum number of pages. - */ - npages = DIV_ROUND_UP((bitmap->mapped_total_index - - bitmap->mapped_base_index) * - sizeof(*bitmap->bitmap), PAGE_SIZE); - - /* - * We always cap at max number of 'struct page' a base page can fit. - * This is, for example, on x86 means 2M of bitmap data max. - */ - npages = min(npages, PAGE_SIZE / sizeof(struct page *)); - - /* - * Bitmap address to be pinned is calculated via pointer arithmetic - * with bitmap u64 word index. - */ - addr = bitmap->bitmap + bitmap->mapped_base_index; - - ret = pin_user_pages_fast((unsigned long)addr, npages, - FOLL_WRITE, mapped->pages); - if (ret <= 0) - return -EFAULT; - - mapped->npages = (unsigned long)ret; - /* Base IOVA where @pages point to i.e. bit 0 of the first page */ - mapped->iova = iova_bitmap_mapped_iova(bitmap); - - /* - * offset of the page where pinned pages bit 0 is located. - * This handles the case where the bitmap is not PAGE_SIZE - * aligned. - */ - mapped->pgoff = offset_in_page(addr); - return 0; -} - -/* - * Unpins the bitmap user pages and clears @npages - * (un)pinning is abstracted from API user and it's done when advancing - * the index or freeing the bitmap. - */ -static void iova_bitmap_put(struct iova_bitmap *bitmap) -{ - struct iova_bitmap_map *mapped = &bitmap->mapped; - - if (mapped->npages) { - unpin_user_pages(mapped->pages, mapped->npages); - mapped->npages = 0; - } -} - -/** - * iova_bitmap_alloc() - Allocates an IOVA bitmap object - * @iova: Start address of the IOVA range - * @length: Length of the IOVA range - * @page_size: Page size of the IOVA bitmap. It defines what each bit - * granularity represents - * @data: Userspace address of the bitmap - * - * Allocates an IOVA object and initializes all its fields including the - * first user pages of @data. - * - * Return: A pointer to a newly allocated struct iova_bitmap - * or ERR_PTR() on error. - */ -struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, - unsigned long page_size, u64 __user *data) -{ - struct iova_bitmap_map *mapped; - struct iova_bitmap *bitmap; - int rc; - - bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); - if (!bitmap) - return ERR_PTR(-ENOMEM); - - mapped = &bitmap->mapped; - mapped->pgshift = __ffs(page_size); - bitmap->bitmap = data; - bitmap->mapped_total_index = - iova_bitmap_offset_to_index(bitmap, length - 1) + 1; - bitmap->iova = iova; - bitmap->length = length; - mapped->iova = iova; - mapped->pages = (struct page **)__get_free_page(GFP_KERNEL); - if (!mapped->pages) { - rc = -ENOMEM; - goto err; - } - - rc = iova_bitmap_get(bitmap); - if (rc) - goto err; - return bitmap; - -err: - iova_bitmap_free(bitmap); - return ERR_PTR(rc); -} - -/** - * iova_bitmap_free() - Frees an IOVA bitmap object - * @bitmap: IOVA bitmap to free - * - * It unpins and releases pages array memory and clears any leftover - * state. - */ -void iova_bitmap_free(struct iova_bitmap *bitmap) -{ - struct iova_bitmap_map *mapped = &bitmap->mapped; - - iova_bitmap_put(bitmap); - - if (mapped->pages) { - free_page((unsigned long)mapped->pages); - mapped->pages = NULL; - } - - kfree(bitmap); -} - -/* - * Returns the remaining bitmap indexes from mapped_total_index to process for - * the currently pinned bitmap pages. 
- */ -static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) -{ - unsigned long remaining, bytes; - - bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff; - - remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; - remaining = min_t(unsigned long, remaining, - bytes / sizeof(*bitmap->bitmap)); - - return remaining; -} - -/* - * Returns the length of the mapped IOVA range. - */ -static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap) -{ - unsigned long max_iova = bitmap->iova + bitmap->length - 1; - unsigned long iova = iova_bitmap_mapped_iova(bitmap); - unsigned long remaining; - - /* - * iova_bitmap_mapped_remaining() returns a number of indexes which - * when converted to IOVA gives us a max length that the bitmap - * pinned data can cover. Afterwards, that is capped to - * only cover the IOVA range in @bitmap::iova .. @bitmap::length. - */ - remaining = iova_bitmap_index_to_offset(bitmap, - iova_bitmap_mapped_remaining(bitmap)); - - if (iova + remaining - 1 > max_iova) - remaining -= ((iova + remaining - 1) - max_iova); - - return remaining; -} - -/* - * Returns true if there's not more data to iterate. - */ -static bool iova_bitmap_done(struct iova_bitmap *bitmap) -{ - return bitmap->mapped_base_index >= bitmap->mapped_total_index; -} - -/* - * Advances to the next range, releases the current pinned - * pages and pins the next set of bitmap pages. - * Returns 0 on success or otherwise errno. - */ -static int iova_bitmap_advance(struct iova_bitmap *bitmap) -{ - unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1; - unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1; - - bitmap->mapped_base_index += count; - - iova_bitmap_put(bitmap); - if (iova_bitmap_done(bitmap)) - return 0; - - /* When advancing the index we pin the next set of bitmap pages */ - return iova_bitmap_get(bitmap); -} - -/** - * iova_bitmap_for_each() - Iterates over the bitmap - * @bitmap: IOVA bitmap to iterate - * @opaque: Additional argument to pass to the callback - * @fn: Function that gets called for each IOVA range - * - * Helper function to iterate over bitmap data representing a portion of IOVA - * space. It hides the complexity of iterating bitmaps and translating the - * mapped bitmap user pages into IOVA ranges to process. - * - * Return: 0 on success, and an error on failure either upon - * iteration or when the callback returns an error. - */ -int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, - iova_bitmap_fn_t fn) -{ - int ret = 0; - - for (; !iova_bitmap_done(bitmap) && !ret; - ret = iova_bitmap_advance(bitmap)) { - ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap), - iova_bitmap_mapped_length(bitmap), opaque); - if (ret) - break; - } - - return ret; -} - -/** - * iova_bitmap_set() - Records an IOVA range in bitmap - * @bitmap: IOVA bitmap - * @iova: IOVA to start - * @length: IOVA range length - * - * Set the bits corresponding to the range [iova .. iova+length-1] in - * the user bitmap. 
- * - */ -void iova_bitmap_set(struct iova_bitmap *bitmap, - unsigned long iova, size_t length) -{ - struct iova_bitmap_map *mapped = &bitmap->mapped; - unsigned long cur_bit = ((iova - mapped->iova) >> - mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; - unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> - mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; - - do { - unsigned int page_idx = cur_bit / BITS_PER_PAGE; - unsigned int offset = cur_bit % BITS_PER_PAGE; - unsigned int nbits = min(BITS_PER_PAGE - offset, - last_bit - cur_bit + 1); - void *kaddr; - - kaddr = kmap_local_page(mapped->pages[page_idx]); - bitmap_set(kaddr, offset, nbits); - kunmap_local(kaddr); - cur_bit += nbits; - } while (cur_bit <= last_bit); -} -EXPORT_SYMBOL_GPL(iova_bitmap_set); diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig index 7088edc4fb..c3ced56b77 100644 --- a/drivers/vfio/pci/mlx5/Kconfig +++ b/drivers/vfio/pci/mlx5/Kconfig @@ -3,6 +3,7 @@ config MLX5_VFIO_PCI tristate "VFIO support for MLX5 PCI devices" depends on MLX5_CORE select VFIO_PCI_CORE + select IOMMUFD_DRIVER help This provides migration support for MLX5 devices using the VFIO framework. diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 33574b0447..efd1d252cd 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -86,7 +86,8 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) } int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size, u8 query_flags) + size_t *state_size, u64 *total_size, + u8 query_flags) { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; @@ -128,6 +129,7 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); MLX5_SET(query_vhca_migration_state_in, in, incremental, query_flags & MLX5VF_QUERY_INC); + MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode); ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); @@ -139,6 +141,11 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, *state_size = MLX5_GET(query_vhca_migration_state_out, out, required_umem_size); + if (total_size) + *total_size = mvdev->chunk_mode ? 
+ MLX5_GET64(query_vhca_migration_state_out, out, + remaining_total_size) : *state_size; + return 0; } @@ -254,6 +261,9 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, mvdev->core_device.vdev.migration_flags |= VFIO_MIGRATION_PRE_COPY; + if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks)) + mvdev->chunk_mode = 1; + end: mlx5_vf_put_core_dev(mvdev->mdev); } @@ -428,6 +438,7 @@ end: void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) { spin_lock_irq(&buf->migf->list_lock); + buf->stop_copy_chunk_num = 0; list_add_tail(&buf->buf_elm, &buf->migf->avail_list); spin_unlock_irq(&buf->migf->list_lock); } @@ -475,6 +486,15 @@ found: return buf; } +static void +mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf, + struct mlx5vf_async_data *async_data) +{ + kvfree(async_data->out); + complete(&migf->save_comp); + fput(migf->filp); +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, @@ -487,16 +507,15 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5vf_put_data_buffer(async_data->buf); if (async_data->header_buf) mlx5vf_put_data_buffer(async_data->header_buf); - if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) + if (!async_data->stop_copy_chunk && + async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; else migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); - kvfree(async_data->out); - complete(&migf->save_comp); - fput(migf->filp); + mlx5vf_save_callback_complete(migf, async_data); } static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, @@ -536,13 +555,20 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { + size_t next_required_umem_size = 0; + bool stop_copy_last_chunk; size_t image_size; unsigned long flags; bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY && - !async_data->last_chunk; + !async_data->stop_copy_chunk; image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); + if (async_data->buf->stop_copy_chunk_num) + next_required_umem_size = MLX5_GET(save_vhca_state_out, + async_data->out, next_required_umem_size); + stop_copy_last_chunk = async_data->stop_copy_chunk && + !next_required_umem_size; if (async_data->header_buf) { status = add_buf_header(async_data->header_buf, image_size, initial_pre_copy); @@ -554,19 +580,34 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) migf->max_pos += async_data->buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); + if (async_data->buf->stop_copy_chunk_num) { + migf->num_ready_chunks++; + if (next_required_umem_size && + migf->num_ready_chunks >= MAX_NUM_CHUNKS) { + /* Delay the next SAVE till one chunk be consumed */ + migf->next_required_umem_size = next_required_umem_size; + next_required_umem_size = 0; + } + } spin_unlock_irqrestore(&migf->list_lock, flags); - if (initial_pre_copy) + if (initial_pre_copy) { migf->pre_copy_initial_bytes += image_size; - migf->state = async_data->last_chunk ? 
- MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; + migf->state = MLX5_MIGF_STATE_PRE_COPY; + } + if (stop_copy_last_chunk) + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); + if (next_required_umem_size) + mlx5vf_mig_file_set_save_work(migf, + /* Picking up the next chunk num */ + (async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1, + next_required_umem_size); + mlx5vf_save_callback_complete(migf, async_data); + return; } err: - /* - * The error and the cleanup flows can't run from an - * interrupt context - */ + /* The error flow can't run from an interrupt context */ if (status == -EREMOTEIO) status = MLX5_GET(save_vhca_state_out, async_data->out, status); async_data->status = status; @@ -610,7 +651,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, async_data = &migf->async_data; async_data->buf = buf; - async_data->last_chunk = !track; + async_data->stop_copy_chunk = !track; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -618,10 +659,15 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - if (async_data->last_chunk && migf->buf_header) { - header_buf = migf->buf_header; - migf->buf_header = NULL; - } else { + if (async_data->stop_copy_chunk) { + u8 header_idx = buf->stop_copy_chunk_num ? + buf->stop_copy_chunk_num - 1 : 0; + + header_buf = migf->buf_header[header_idx]; + migf->buf_header[header_idx] = NULL; + } + + if (!header_buf) { header_buf = mlx5vf_get_data_buffer(migf, sizeof(struct mlx5_vf_migration_header), DMA_NONE); if (IS_ERR(header_buf)) { @@ -631,8 +677,8 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } } - if (async_data->last_chunk) - migf->state = MLX5_MIGF_STATE_SAVE_LAST; + if (async_data->stop_copy_chunk) + migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK; async_data->header_buf = header_buf; get_file(migf->filp); @@ -707,18 +753,21 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) { struct mlx5_vhca_data_buffer *entry; + int i; lockdep_assert_held(&migf->mvdev->state_mutex); WARN_ON(migf->mvdev->mdev_detach); - if (migf->buf) { - mlx5vf_free_data_buffer(migf->buf); - migf->buf = NULL; - } + for (i = 0; i < MAX_NUM_CHUNKS; i++) { + if (migf->buf[i]) { + mlx5vf_free_data_buffer(migf->buf[i]); + migf->buf[i] = NULL; + } - if (migf->buf_header) { - mlx5vf_free_data_buffer(migf->buf_header); - migf->buf_header = NULL; + if (migf->buf_header[i]) { + mlx5vf_free_data_buffer(migf->buf_header[i]); + migf->buf_header[i] = NULL; + } } list_splice(&migf->avail_list, &migf->buf_list); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index aec4c69dd6..f2c7227fa6 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -20,7 +20,7 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, MLX5_MIGF_STATE_PRE_COPY_ERROR, MLX5_MIGF_STATE_PRE_COPY, - MLX5_MIGF_STATE_SAVE_LAST, + MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK, MLX5_MIGF_STATE_COMPLETE, }; @@ -64,6 +64,7 @@ struct mlx5_vhca_data_buffer { u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; + u8 stop_copy_chunk_num; struct list_head buf_elm; struct mlx5_vf_migration_file *migf; /* Optimize mlx5vf_get_migration_page() for sequential access */ @@ -78,10 +79,19 @@ struct mlx5vf_async_data { struct mlx5_vhca_data_buffer *buf; struct mlx5_vhca_data_buffer *header_buf; int status; - u8 last_chunk:1; + u8 
stop_copy_chunk:1; void *out; }; +struct mlx5vf_save_work_data { + struct mlx5_vf_migration_file *migf; + size_t next_required_umem_size; + struct work_struct work; + u8 chunk_num; +}; + +#define MAX_NUM_CHUNKS 2 + struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; @@ -94,8 +104,12 @@ struct mlx5_vf_migration_file { u32 record_tag; u64 stop_copy_prep_size; u64 pre_copy_initial_bytes; - struct mlx5_vhca_data_buffer *buf; - struct mlx5_vhca_data_buffer *buf_header; + size_t next_required_umem_size; + u8 num_ready_chunks; + /* Upon chunk mode preserve another set of buffers for stop_copy phase */ + struct mlx5_vhca_data_buffer *buf[MAX_NUM_CHUNKS]; + struct mlx5_vhca_data_buffer *buf_header[MAX_NUM_CHUNKS]; + struct mlx5vf_save_work_data save_data[MAX_NUM_CHUNKS]; spinlock_t list_lock; struct list_head buf_list; struct list_head avail_list; @@ -164,6 +178,7 @@ struct mlx5vf_pci_core_device { u8 deferred_reset:1; u8 mdev_detach:1; u8 log_active:1; + u8 chunk_mode:1; struct completion tracker_comp; /* protect migration state */ struct mutex state_mutex; @@ -186,7 +201,8 @@ enum { int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size, u8 query_flags); + size_t *state_size, u64 *total_size, + u8 query_flags); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops); @@ -217,6 +233,8 @@ struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); +void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf, + u8 chunk_num, size_t next_required_umem_size); int mlx5vf_start_page_tracker(struct vfio_device *vdev, struct rb_root_cached *ranges, u32 nnodes, u64 *page_size); int mlx5vf_stop_page_tracker(struct vfio_device *vdev); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 42ec574a86..fe09a8c8af 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -24,6 +24,8 @@ /* Device specification max LOAD size */ #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1) +#define MAX_CHUNK_SIZE SZ_8M + static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) { struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); @@ -158,6 +160,41 @@ end: return found ? 
buf : NULL; } +static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf) +{ + struct mlx5_vf_migration_file *migf = vhca_buf->migf; + + if (vhca_buf->stop_copy_chunk_num) { + bool is_header = vhca_buf->dma_dir == DMA_NONE; + u8 chunk_num = vhca_buf->stop_copy_chunk_num; + size_t next_required_umem_size = 0; + + if (is_header) + migf->buf_header[chunk_num - 1] = vhca_buf; + else + migf->buf[chunk_num - 1] = vhca_buf; + + spin_lock_irq(&migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + if (!is_header) { + next_required_umem_size = + migf->next_required_umem_size; + migf->next_required_umem_size = 0; + migf->num_ready_chunks--; + } + spin_unlock_irq(&migf->list_lock); + if (next_required_umem_size) + mlx5vf_mig_file_set_save_work(migf, chunk_num, + next_required_umem_size); + return; + } + + spin_lock_irq(&migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); + spin_unlock_irq(&migf->list_lock); +} + static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, char __user **buf, size_t *len, loff_t *pos) { @@ -193,12 +230,8 @@ static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, copy_len -= page_len; } - if (*pos >= vhca_buf->start_pos + vhca_buf->length) { - spin_lock_irq(&vhca_buf->migf->list_lock); - list_del_init(&vhca_buf->buf_elm); - list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); - spin_unlock_irq(&vhca_buf->migf->list_lock); - } + if (*pos >= vhca_buf->start_pos + vhca_buf->length) + mlx5vf_buf_read_done(vhca_buf); return done; } @@ -304,7 +337,75 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) wake_up_interruptible(&migf->poll_wait); } -static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) +void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf, + u8 chunk_num, size_t next_required_umem_size) +{ + migf->save_data[chunk_num - 1].next_required_umem_size = + next_required_umem_size; + migf->save_data[chunk_num - 1].migf = migf; + get_file(migf->filp); + queue_work(migf->mvdev->cb_wq, + &migf->save_data[chunk_num - 1].work); +} + +static struct mlx5_vhca_data_buffer * +mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, + u8 index, size_t required_length) +{ + struct mlx5_vhca_data_buffer *buf = migf->buf[index]; + u8 chunk_num; + + WARN_ON(!buf); + chunk_num = buf->stop_copy_chunk_num; + buf->migf->buf[index] = NULL; + /* Checking whether the pre-allocated buffer can fit */ + if (buf->allocated_length >= required_length) + return buf; + + mlx5vf_put_data_buffer(buf); + buf = mlx5vf_get_data_buffer(buf->migf, required_length, + DMA_FROM_DEVICE); + if (IS_ERR(buf)) + return buf; + + buf->stop_copy_chunk_num = chunk_num; + return buf; +} + +static void mlx5vf_mig_file_save_work(struct work_struct *_work) +{ + struct mlx5vf_save_work_data *save_data = container_of(_work, + struct mlx5vf_save_work_data, work); + struct mlx5_vf_migration_file *migf = save_data->migf; + struct mlx5vf_pci_core_device *mvdev = migf->mvdev; + struct mlx5_vhca_data_buffer *buf; + + mutex_lock(&mvdev->state_mutex); + if (migf->state == MLX5_MIGF_STATE_ERROR) + goto end; + + buf = mlx5vf_mig_file_get_stop_copy_buf(migf, + save_data->chunk_num - 1, + save_data->next_required_umem_size); + if (IS_ERR(buf)) + goto err; + + if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false)) + goto err_save; + + goto end; + +err_save: + mlx5vf_put_data_buffer(buf); +err: + mlx5vf_mark_err(migf); +end: + 
mlx5vf_state_mutex_unlock(mvdev); + fput(migf->filp); +} + +static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, + bool track) { size_t size = sizeof(struct mlx5_vf_migration_header) + sizeof(struct mlx5_vf_migration_tag_stop_copy_data); @@ -331,7 +432,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) to_buff = kmap_local_page(page); memcpy(to_buff, &header, sizeof(header)); header_buf->length = sizeof(header); - data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length); + data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length); memcpy(to_buff + sizeof(header), &data, sizeof(data)); header_buf->length += sizeof(data); kunmap_local(to_buff); @@ -340,48 +441,86 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&header_buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); - migf->pre_copy_initial_bytes = size; + if (track) + migf->pre_copy_initial_bytes = size; return 0; err: mlx5vf_put_data_buffer(header_buf); return ret; } -static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf, - size_t state_size) +static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev, + struct mlx5_vf_migration_file *migf, + size_t state_size, u64 full_size, + bool track) { struct mlx5_vhca_data_buffer *buf; size_t inc_state_size; + int num_chunks; int ret; + int i; - /* let's be ready for stop_copy size that might grow by 10 percents */ - if (check_add_overflow(state_size, state_size / 10, &inc_state_size)) - inc_state_size = state_size; + if (mvdev->chunk_mode) { + size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size); - buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); - if (IS_ERR(buf)) - return PTR_ERR(buf); + /* from firmware perspective at least 'state_size' buffer should be set */ + inc_state_size = max(state_size, chunk_size); + } else { + if (track) { + /* let's be ready for stop_copy size that might grow by 10 percents */ + if (check_add_overflow(state_size, state_size / 10, &inc_state_size)) + inc_state_size = state_size; + } else { + inc_state_size = state_size; + } + } - migf->buf = buf; - buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto err; + /* let's not overflow the device specification max SAVE size */ + inc_state_size = min_t(size_t, inc_state_size, + (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE)); + + num_chunks = mvdev->chunk_mode ? 
MAX_NUM_CHUNKS : 1; + for (i = 0; i < num_chunks; i++) { + buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + migf->buf[i] = buf; + buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + migf->buf_header[i] = buf; + if (mvdev->chunk_mode) { + migf->buf[i]->stop_copy_chunk_num = i + 1; + migf->buf_header[i]->stop_copy_chunk_num = i + 1; + INIT_WORK(&migf->save_data[i].work, + mlx5vf_mig_file_save_work); + migf->save_data[i].chunk_num = i + 1; + } } - migf->buf_header = buf; - ret = mlx5vf_add_stop_copy_header(migf); + ret = mlx5vf_add_stop_copy_header(migf, track); if (ret) - goto err_header; + goto err; return 0; -err_header: - mlx5vf_put_data_buffer(migf->buf_header); - migf->buf_header = NULL; err: - mlx5vf_put_data_buffer(migf->buf); - migf->buf = NULL; + for (i = 0; i < num_chunks; i++) { + if (migf->buf[i]) { + mlx5vf_put_data_buffer(migf->buf[i]); + migf->buf[i] = NULL; + } + if (migf->buf_header[i]) { + mlx5vf_put_data_buffer(migf->buf_header[i]); + migf->buf_header[i] = NULL; + } + } + return ret; } @@ -428,7 +567,7 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, * As so, the other code below is safe with the proper locks. */ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, - MLX5VF_QUERY_INC); + NULL, MLX5VF_QUERY_INC); if (ret) goto err_state_unlock; } @@ -505,21 +644,15 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) if (migf->state == MLX5_MIGF_STATE_ERROR) return -ENODEV; - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); if (ret) goto err; - /* Checking whether we have a matching pre-allocated buffer that can fit */ - if (migf->buf && migf->buf->allocated_length >= length) { - buf = migf->buf; - migf->buf = NULL; - } else { - buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto err; - } + buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; } ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); @@ -541,6 +674,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) struct mlx5_vf_migration_file *migf; struct mlx5_vhca_data_buffer *buf; size_t length; + u64 full_size; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); @@ -574,20 +708,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) INIT_LIST_HEAD(&migf->buf_list); INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0); + if (ret) + goto out_pd; + + ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track); if (ret) goto out_pd; if (track) { - ret = mlx5vf_prep_stop_copy(migf, length); - if (ret) + /* leave the allocated buffer ready for the stop-copy phase */ + buf = mlx5vf_alloc_data_buffer(migf, + migf->buf[0]->allocated_length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); goto out_pd; - } - - buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto out_pd; + } + } else { + buf = migf->buf[0]; + migf->buf[0] = NULL; } ret = 
mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); @@ -820,8 +959,8 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; - struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; - struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0]; + struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0]; loff_t requested_length; bool has_work = false; ssize_t done = 0; @@ -856,15 +995,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, if (vhca_buf_header->allocated_length < migf->record_size) { mlx5vf_free_data_buffer(vhca_buf_header); - migf->buf_header = mlx5vf_alloc_data_buffer(migf, + migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf, migf->record_size, DMA_NONE); - if (IS_ERR(migf->buf_header)) { - ret = PTR_ERR(migf->buf_header); - migf->buf_header = NULL; + if (IS_ERR(migf->buf_header[0])) { + ret = PTR_ERR(migf->buf_header[0]); + migf->buf_header[0] = NULL; goto out_unlock; } - vhca_buf_header = migf->buf_header; + vhca_buf_header = migf->buf_header[0]; } vhca_buf_header->start_pos = migf->max_pos; @@ -884,15 +1023,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, if (vhca_buf->allocated_length < size) { mlx5vf_free_data_buffer(vhca_buf); - migf->buf = mlx5vf_alloc_data_buffer(migf, + migf->buf[0] = mlx5vf_alloc_data_buffer(migf, size, DMA_TO_DEVICE); - if (IS_ERR(migf->buf)) { - ret = PTR_ERR(migf->buf); - migf->buf = NULL; + if (IS_ERR(migf->buf[0])) { + ret = PTR_ERR(migf->buf[0]); + migf->buf[0] = NULL; goto out_unlock; } - vhca_buf = migf->buf; + vhca_buf = migf->buf[0]; } vhca_buf->start_pos = migf->max_pos; @@ -974,7 +1113,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) goto out_pd; } - migf->buf = buf; + migf->buf[0] = buf; if (MLX5VF_PRE_COPY_SUPP(mvdev)) { buf = mlx5vf_alloc_data_buffer(migf, sizeof(struct mlx5_vf_migration_header), DMA_NONE); @@ -983,7 +1122,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) goto out_buf; } - migf->buf_header = buf; + migf->buf_header[0] = buf; migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; } else { /* Initial state will be to read the image */ @@ -997,7 +1136,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) spin_lock_init(&migf->list_lock); return migf; out_buf: - mlx5vf_free_data_buffer(migf->buf); + mlx5vf_free_data_buffer(migf->buf[0]); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: @@ -1019,6 +1158,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); + wake_up_interruptible(&mvdev->saving_migf->poll_wait); mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; @@ -1100,7 +1240,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { ret = mlx5vf_cmd_load_vhca_state(mvdev, mvdev->resuming_migf, - mvdev->resuming_migf->buf); + mvdev->resuming_migf->buf[0]); if (ret) return ERR_PTR(ret); } @@ -1194,13 +1334,14 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, struct mlx5vf_pci_core_device *mvdev = container_of( vdev, struct mlx5vf_pci_core_device, core_device.vdev); size_t state_size; + u64 total_size; int ret; 
mutex_lock(&mvdev->state_mutex); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &state_size, 0); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size, + &total_size, 0); if (!ret) - *stop_copy_length = state_size; + *stop_copy_length = total_size; mlx5vf_state_mutex_unlock(mvdev); return ret; } @@ -1376,6 +1517,7 @@ static struct pci_driver mlx5vf_pci_driver = { module_pci_driver(mlx5vf_pci_driver); +MODULE_IMPORT_NS(IOMMUFD); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy "); MODULE_AUTHOR("Yishai Hadas "); diff --git a/drivers/vfio/pci/pds/Kconfig b/drivers/vfio/pci/pds/Kconfig index 6eceef7b02..fec9b167c7 100644 --- a/drivers/vfio/pci/pds/Kconfig +++ b/drivers/vfio/pci/pds/Kconfig @@ -5,6 +5,7 @@ config PDS_VFIO_PCI tristate "VFIO support for PDS PCI devices" depends on PDS_CORE && PCI_IOV select VFIO_PCI_CORE + select IOMMUFD_DRIVER help This provides generic PCI support for PDS devices using the VFIO framework. diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c index caffa1a2cf..a34dda5166 100644 --- a/drivers/vfio/pci/pds/pci_drv.c +++ b/drivers/vfio/pci/pds/pci_drv.c @@ -204,6 +204,7 @@ static struct pci_driver pds_vfio_pci_driver = { module_pci_driver(pds_vfio_pci_driver); +MODULE_IMPORT_NS(IOMMUFD); MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION); MODULE_AUTHOR("Brett Creeley "); MODULE_LICENSE("GPL"); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 40732e8ed4..8d4995ada7 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -946,6 +946,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, unsigned long last; comb_start = interval_tree_iter_first(root, 0, ULONG_MAX); + + /* Empty list */ + if (WARN_ON_ONCE(!comb_start)) + return; + curr = comb_start; while (curr) { last = curr->last; @@ -975,6 +980,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, prev = curr; curr = interval_tree_iter_next(curr, 0, ULONG_MAX); } + + /* Empty list or no nodes to combine */ + if (WARN_ON_ONCE(min_gap == ULONG_MAX)) + break; + comb_start->last = comb_end->last; interval_tree_remove(comb_end, root); cur_nodes--; @@ -1693,6 +1703,7 @@ static void __exit vfio_cleanup(void) module_init(vfio_init); module_exit(vfio_cleanup); +MODULE_IMPORT_NS(IOMMUFD); MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); -- cgit v1.2.3
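Editor's illustrative sketch (not part of the patch above): the vfio-cdx changes in this series wire up the VFIO_DEVICE_FEATURE_BUS_MASTER device feature, which userspace drives through the generic VFIO_DEVICE_FEATURE ioctl. The following minimal C fragment shows one plausible way to exercise it, assuming a device fd already opened through the usual VFIO cdev/group flow and UAPI headers from a kernel that provides these definitions (>= 6.7); the helper name and error handling are hypothetical.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * Set or clear bus mastering on an already-open VFIO device fd by issuing
 * VFIO_DEVICE_FEATURE with a struct vfio_device_feature header followed by
 * the bus-master payload.  Returns the ioctl() result (0 on success,
 * -1 with errno set on failure, e.g. ENOTTY if the device lacks BME support).
 */
static int vfio_device_bus_master(int device_fd, uint32_t op)
{
	uint8_t buf[sizeof(struct vfio_device_feature) +
		    sizeof(struct vfio_device_feature_bus_master)]
		__attribute__((aligned(8))) = {0};
	struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
	struct vfio_device_feature_bus_master *bm = (void *)feature->data;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_BUS_MASTER;
	/* op is VFIO_DEVICE_FEATURE_SET_MASTER or VFIO_DEVICE_FEATURE_CLEAR_MASTER */
	bm->op = op;

	return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}

As the vfio_cdx_bm_ctrl() hunk above shows, the driver only honours VFIO_DEVICE_FEATURE_SET for this feature and returns -ENOTTY when bus-master control is not supported, so callers should treat that errno as "feature unavailable" rather than a hard failure.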