Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 20
-rw-r--r-- | block/bdev.c | 287
-rw-r--r-- | block/bio-integrity.c | 218
-rw-r--r-- | block/bio.c | 5
-rw-r--r-- | block/blk-cgroup.c | 18
-rw-r--r-- | block/blk-cgroup.h | 2
-rw-r--r-- | block/blk-core.c | 38
-rw-r--r-- | block/blk-iocost.c | 2
-rw-r--r-- | block/blk-merge.c | 6
-rw-r--r-- | block/blk-mq-debugfs.c | 18
-rw-r--r-- | block/blk-mq-sched.c | 2
-rw-r--r-- | block/blk-mq.c | 22
-rw-r--r-- | block/blk-rq-qos.h | 2
-rw-r--r-- | block/blk-settings.c | 107
-rw-r--r-- | block/blk-stat.c | 2
-rw-r--r-- | block/blk-sysfs.c | 11
-rw-r--r-- | block/blk-wbt.c | 13
-rw-r--r-- | block/blk-wbt.h | 5
-rw-r--r-- | block/blk-zoned.c | 21
-rw-r--r-- | block/blk.h | 2
-rw-r--r-- | block/fops.c | 23
-rw-r--r-- | block/ioctl.c | 3
-rw-r--r-- | block/ioprio.c | 26
-rw-r--r-- | block/partitions/core.c | 16
24 files changed, 524 insertions, 345 deletions
diff --git a/block/Kconfig b/block/Kconfig index 55ae2286a4..1de4682d48 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10 select CRC_T10DIF select CRC64_ROCKSOFT +config BLK_DEV_WRITE_MOUNTED + bool "Allow writing to mounted block devices" + default y + help + When a block device is mounted, writing to its buffer cache is very + likely going to cause filesystem corruption. It is also rather easy to + crash the kernel in this way since the filesystem has no practical way + of detecting these writes to buffer cache and verifying its metadata + integrity. However there are some setups that need this capability + like running fsck on read-only mounted root device, modifying some + features on mounted ext4 filesystem, and similar. If you say N, the + kernel will prevent processes from writing to block devices that are + mounted by filesystems which provides some more protection from runaway + privileged processes and generally makes it much harder to crash + filesystem drivers. Note however that this does not prevent + underlying device(s) from being modified by other means, e.g. by + directly submitting SCSI commands or through access to lower layers of + storage stack. If in doubt, say Y. The configuration can be overridden + with the bdev_allow_write_mounted boot option. + config BLK_DEV_ZONED bool "Zoned block device support" select MQ_IOSCHED_DEADLINE diff --git a/block/bdev.c b/block/bdev.c index 750aec178b..812872cdc0 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -30,6 +30,9 @@ #include "../fs/internal.h" #include "blk.h" +/* Should we allow writing to mounted block devices? */ +static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED); + struct bdev_inode { struct block_device bdev; struct inode vfs_inode; @@ -207,85 +210,88 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) EXPORT_SYMBOL(sync_blockdev_range); /** - * freeze_bdev - lock a filesystem and force it into a consistent state + * bdev_freeze - lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last * unfreeze process can unfreeze the frozen filesystem actually when multiple - * freeze requests arrive simultaneously. It counts up in freeze_bdev() and - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * freeze requests arrive simultaneously. It counts up in bdev_freeze() and + * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze * actually. + * + * Return: On success zero is returned, negative error code on failure. 
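To illustrate what the new option guards against: with CONFIG_BLK_DEV_WRITE_MOUNTED disabled (or bdev_allow_write_mounted=0 on the kernel command line), a plain write open of a device whose filesystem holds it with BLK_OPEN_RESTRICT_WRITES is expected to fail with EBUSY, as the open-path hunks further down arrange. A minimal userspace probe, with /dev/sda1 assumed to be such a mounted device, might look like this:

/* Hypothetical userspace probe: try to open a mounted block device for
 * writing. With write blocking in effect the open should fail with
 * EBUSY; the device path is an assumption for illustration only. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *dev = "/dev/sda1";	/* assumed to be mounted */
	int fd = open(dev, O_WRONLY);

	if (fd < 0) {
		int err = errno;

		fprintf(stderr, "%s: write open failed: %s\n",
			dev, strerror(err));
		return err == EBUSY ? 0 : 1;
	}
	printf("%s: write open succeeded (write blocking not in effect)\n",
	       dev);
	close(fd);
	return 0;
}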
*/ -int freeze_bdev(struct block_device *bdev) +int bdev_freeze(struct block_device *bdev) { - struct super_block *sb; int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (++bdev->bd_fsfreeze_count > 1) - goto done; - - sb = get_active_super(bdev); - if (!sb) - goto sync; - if (sb->s_op->freeze_super) - error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); - deactivate_super(sb); - if (error) { - bdev->bd_fsfreeze_count--; - goto done; + if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; } - bdev->bd_fsfreeze_sb = sb; -sync: - sync_blockdev(bdev); -done: + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { + error = bdev->bd_holder_ops->freeze(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + error = sync_blockdev(bdev); + } + + if (error) + atomic_dec(&bdev->bd_fsfreeze_count); + mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } -EXPORT_SYMBOL(freeze_bdev); +EXPORT_SYMBOL(bdev_freeze); /** - * thaw_bdev - unlock filesystem + * bdev_thaw - unlock filesystem * @bdev: blockdevice to unlock * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). + * Unlocks the filesystem and marks it writeable again after bdev_freeze(). + * + * Return: On success zero is returned, negative error code on failure. */ -int thaw_bdev(struct block_device *bdev) +int bdev_thaw(struct block_device *bdev) { - struct super_block *sb; - int error = -EINVAL; + int error = -EINVAL, nr_freeze; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) + + /* + * If this returns < 0 it means that @bd_fsfreeze_count was + * already 0 and no decrement was performed. 
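The reworked bdev_freeze() and bdev_thaw() above no longer look up the superblock themselves; they dispatch through bdev->bd_holder_ops. A minimal sketch of a holder supplying those callbacks follows; my_holder_freeze()/my_holder_thaw() are made-up names, and only the .freeze/.thaw members and the locking contract are taken from the hunk above.

/* Sketch of a holder wiring up the freeze/thaw callbacks that
 * bdev_freeze()/bdev_thaw() dispatch to. Callback names are
 * hypothetical. */
#include <linux/blkdev.h>

static int my_holder_freeze(struct block_device *bdev)
{
	/*
	 * Called with bdev->bd_holder_lock held; the lockdep assertion
	 * in bdev_freeze() above requires the callback to have dropped
	 * that lock by the time it returns, typically after pinning
	 * whatever holder state it needs.
	 */
	mutex_unlock(&bdev->bd_holder_lock);

	/* Quiesce the holder's view of the device here, e.g. freeze
	 * the filesystem mounted on top of it. */
	return 0;
}

static int my_holder_thaw(struct block_device *bdev)
{
	/* Same locking contract as ->freeze. */
	mutex_unlock(&bdev->bd_holder_lock);

	/* Undo whatever ->freeze did. */
	return 0;
}

static const struct blk_holder_ops my_holder_ops = {
	.freeze	= my_holder_freeze,
	.thaw	= my_holder_thaw,
	/* passed as the hops argument when the holder opens the device */
};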
+ */ + nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); + if (nr_freeze < 0) goto out; error = 0; - if (--bdev->bd_fsfreeze_count > 0) + if (nr_freeze > 0) goto out; - sb = bdev->bd_fsfreeze_sb; - if (!sb) - goto out; + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { + error = bdev->bd_holder_ops->thaw(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + } - if (sb->s_op->thaw_super) - error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); if (error) - bdev->bd_fsfreeze_count++; - else - bdev->bd_fsfreeze_sb = NULL; + atomic_inc(&bdev->bd_fsfreeze_count); out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } -EXPORT_SYMBOL(thaw_bdev); +EXPORT_SYMBOL(bdev_thaw); /* * pseudo-fs @@ -633,6 +639,14 @@ static void blkdev_flush_mapping(struct block_device *bdev) bdev_write_inode(bdev); } +static void blkdev_put_whole(struct block_device *bdev) +{ + if (atomic_dec_and_test(&bdev->bd_openers)) + blkdev_flush_mapping(bdev); + if (bdev->bd_disk->fops->release) + bdev->bd_disk->fops->release(bdev->bd_disk); +} + static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) { struct gendisk *disk = bdev->bd_disk; @@ -651,20 +665,21 @@ static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) if (!atomic_read(&bdev->bd_openers)) set_init_blocksize(bdev); - if (test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(disk, false); atomic_inc(&bdev->bd_openers); + if (test_bit(GD_NEED_PART_SCAN, &disk->state)) { + /* + * Only return scanning errors if we are called from contexts + * that explicitly want them, e.g. the BLKRRPART ioctl. + */ + ret = bdev_disk_changed(disk, false); + if (ret && (mode & BLK_OPEN_STRICT_SCAN)) { + blkdev_put_whole(bdev); + return ret; + } + } return 0; } -static void blkdev_put_whole(struct block_device *bdev) -{ - if (atomic_dec_and_test(&bdev->bd_openers)) - blkdev_flush_mapping(bdev); - if (bdev->bd_disk->fops->release) - bdev->bd_disk->fops->release(bdev->bd_disk); -} - static int blkdev_get_part(struct block_device *part, blk_mode_t mode) { struct gendisk *disk = part->bd_disk; @@ -729,9 +744,60 @@ void blkdev_put_no_open(struct block_device *bdev) { put_device(&bdev->bd_device); } - + +static bool bdev_writes_blocked(struct block_device *bdev) +{ + return bdev->bd_writers < 0; +} + +static void bdev_block_writes(struct block_device *bdev) +{ + bdev->bd_writers--; +} + +static void bdev_unblock_writes(struct block_device *bdev) +{ + bdev->bd_writers++; +} + +static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return true; + /* Writes blocked? */ + if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev)) + return false; + if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0) + return false; + return true; +} + +static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return; + + /* Claim exclusive or shared write access. */ + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_block_writes(bdev); + else if (mode & BLK_OPEN_WRITE) + bdev->bd_writers++; +} + +static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return; + + /* Yield exclusive or shared write access. 
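These write-access helpers feed the open and release paths that the following hunks convert to the handle-based API. A sketch of a kernel consumer of that API, with placeholder names and error handling trimmed, might look as follows:

/* Sketch of a consumer using the handle-based open API that the hunks
 * below finalize. The device number and holder are placeholders; note
 * that BLK_OPEN_RESTRICT_WRITES requires a non-NULL holder. */
#include <linux/blkdev.h>
#include <linux/err.h>

static struct bdev_handle *example_attach(dev_t devt, void *holder)
{
	struct bdev_handle *handle;

	handle = bdev_open_by_dev(devt,
				  BLK_OPEN_READ | BLK_OPEN_WRITE |
				  BLK_OPEN_RESTRICT_WRITES,
				  holder, NULL);
	if (IS_ERR(handle))
		return handle;

	/* handle->bdev is the struct block_device to issue I/O against. */
	return handle;
}

static void example_detach(struct bdev_handle *handle)
{
	bdev_release(handle);
}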
*/ + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_unblock_writes(bdev); + else if (mode & BLK_OPEN_WRITE) + bdev->bd_writers--; +} + /** - * blkdev_get_by_dev - open a block device by device number + * bdev_open_by_dev - open a block device by device number * @dev: device number of block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier @@ -743,32 +809,46 @@ void blkdev_put_no_open(struct block_device *bdev) * * Use this interface ONLY if you really do not have anything better - i.e. when * you are behind a truly sucky interface and all you are given is a device - * number. Everything else should use blkdev_get_by_path(). + * number. Everything else should use bdev_open_by_path(). * * CONTEXT: * Might sleep. * * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. + * Handle with a reference to the block_device on success, ERR_PTR(-errno) on + * failure. */ -struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops) +struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops) { - bool unblock_events = true; + struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle), + GFP_KERNEL); struct block_device *bdev; + bool unblock_events = true; struct gendisk *disk; int ret; + if (!handle) + return ERR_PTR(-ENOMEM); + ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, MAJOR(dev), MINOR(dev), ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) - return ERR_PTR(ret); + goto free_handle; + + /* Blocking writes requires exclusive opener */ + if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) { + ret = -EINVAL; + goto free_handle; + } bdev = blkdev_get_no_open(dev); - if (!bdev) - return ERR_PTR(-ENXIO); + if (!bdev) { + ret = -ENXIO; + goto free_handle; + } disk = bdev->bd_disk; if (holder) { @@ -791,12 +871,16 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, goto abort_claiming; if (!try_module_get(disk->fops->owner)) goto abort_claiming; + ret = -EBUSY; + if (!bdev_may_open(bdev, mode)) + goto put_module; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); else ret = blkdev_get_whole(bdev, mode); if (ret) goto put_module; + bdev_claim_write_access(bdev, mode); if (holder) { bd_finish_claiming(bdev, holder, hops); @@ -817,7 +901,10 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, if (unblock_events) disk_unblock_events(disk); - return bdev; + handle->bdev = bdev; + handle->holder = holder; + handle->mode = mode; + return handle; put_module: module_put(disk->fops->owner); abort_claiming: @@ -827,34 +914,14 @@ abort_claiming: disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); +free_handle: + kfree(handle); return ERR_PTR(ret); } -EXPORT_SYMBOL(blkdev_get_by_dev); - -struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops) -{ - struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL); - struct block_device *bdev; - - if (!handle) - return ERR_PTR(-ENOMEM); - bdev = blkdev_get_by_dev(dev, mode, holder, hops); - if (IS_ERR(bdev)) { - kfree(handle); - return ERR_CAST(bdev); - } - handle->bdev = bdev; - handle->holder = holder; - if (holder) - mode |= BLK_OPEN_EXCL; - handle->mode = mode; - return handle; -} EXPORT_SYMBOL(bdev_open_by_dev); /** - * blkdev_get_by_path - open a block device by name + * 
bdev_open_by_path - open a block device by name * @path: path to the block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier @@ -868,29 +935,9 @@ EXPORT_SYMBOL(bdev_open_by_dev); * Might sleep. * * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. + * Handle with a reference to the block_device on success, ERR_PTR(-errno) on + * failure. */ -struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops) -{ - struct block_device *bdev; - dev_t dev; - int error; - - error = lookup_bdev(path, &dev); - if (error) - return ERR_PTR(error); - - bdev = blkdev_get_by_dev(dev, mode, holder, hops); - if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - blkdev_put(bdev, holder); - return ERR_PTR(-EACCES); - } - - return bdev; -} -EXPORT_SYMBOL(blkdev_get_by_path); - struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { @@ -913,8 +960,9 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, } EXPORT_SYMBOL(bdev_open_by_path); -void blkdev_put(struct block_device *bdev, void *holder) +void bdev_release(struct bdev_handle *handle) { + struct block_device *bdev = handle->bdev; struct gendisk *disk = bdev->bd_disk; /* @@ -928,8 +976,10 @@ void blkdev_put(struct block_device *bdev, void *holder) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); - if (holder) - bd_end_claim(bdev, holder); + bdev_yield_write_access(bdev, handle->mode); + + if (handle->holder) + bd_end_claim(bdev, handle->holder); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE @@ -946,12 +996,6 @@ void blkdev_put(struct block_device *bdev, void *holder) module_put(disk->fops->owner); blkdev_put_no_open(bdev); -} -EXPORT_SYMBOL(blkdev_put); - -void bdev_release(struct bdev_handle *handle) -{ - blkdev_put(handle->bdev, handle->holder); kfree(handle); } EXPORT_SYMBOL(bdev_release); @@ -1102,3 +1146,12 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) blkdev_put_no_open(bdev); } + +static int __init setup_bdev_allow_write_mounted(char *str) +{ + if (kstrtobool(str, &bdev_allow_write_mounted)) + pr_warn("Invalid option string for bdev_allow_write_mounted:" + " '%s'\n", str); + return 1; +} +__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index ec8ac8cf6e..c9a16fba58 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -69,15 +69,15 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, memset(bip, 0, sizeof(*bip)); + /* always report as many vecs as asked explicitly, not inline vecs */ + bip->bip_max_vcnt = nr_vecs; if (nr_vecs > inline_vecs) { - bip->bip_max_vcnt = nr_vecs; bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; } else { bip->bip_vec = bip->bip_inline_vecs; - bip->bip_max_vcnt = inline_vecs; } bip->bip_bio = bio; @@ -91,6 +91,47 @@ err: } EXPORT_SYMBOL(bio_integrity_alloc); +static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs, + bool dirty) +{ + int i; + + for (i = 0; i < nr_vecs; i++) { + if (dirty && !PageCompound(bv[i].bv_page)) + set_page_dirty_lock(bv[i].bv_page); + unpin_user_page(bv[i].bv_page); + } +} + +static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) +{ + unsigned short nr_vecs = bip->bip_max_vcnt - 1; + struct bio_vec *copy = 
&bip->bip_vec[1]; + size_t bytes = bip->bip_iter.bi_size; + struct iov_iter iter; + int ret; + + iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); + ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter); + WARN_ON_ONCE(ret != bytes); + + bio_integrity_unpin_bvec(copy, nr_vecs, true); +} + +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip) +{ + bool dirty = bio_data_dir(bip->bip_bio) == READ; + + if (bip->bip_flags & BIP_COPY_USER) { + if (dirty) + bio_integrity_uncopy_user(bip); + kfree(bvec_virt(bip->bip_vec)); + return; + } + + bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty); +} + /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed @@ -105,6 +146,8 @@ void bio_integrity_free(struct bio *bio) if (bip->bip_flags & BIP_BLOCK_INTEGRITY) kfree(bvec_virt(bip->bip_vec)); + else if (bip->bip_flags & BIP_INTEGRITY_USER) + bio_integrity_unmap_user(bip); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; @@ -160,6 +203,177 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_integrity_add_page); +static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, + int nr_vecs, unsigned int len, + unsigned int direction, u32 seed) +{ + bool write = direction == ITER_SOURCE; + struct bio_integrity_payload *bip; + struct iov_iter iter; + void *buf; + int ret; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (write) { + iov_iter_bvec(&iter, direction, bvec, nr_vecs, len); + if (!copy_from_iter_full(buf, len, &iter)) { + ret = -EFAULT; + goto free_buf; + } + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + } else { + memset(buf, 0, len); + + /* + * We need to preserve the original bvec and the number of vecs + * in it for completion handling + */ + bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1); + } + + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto free_buf; + } + + if (write) + bio_integrity_unpin_bvec(bvec, nr_vecs, false); + else + memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec)); + + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret != len) { + ret = -ENOMEM; + goto free_bip; + } + + bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER; + bip->bip_iter.bi_sector = seed; + return 0; +free_bip: + bio_integrity_free(bio); +free_buf: + kfree(buf); + return ret; +} + +static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, + int nr_vecs, unsigned int len, u32 seed) +{ + struct bio_integrity_payload *bip; + + bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs); + if (IS_ERR(bip)) + return PTR_ERR(bip); + + memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec)); + bip->bip_flags |= BIP_INTEGRITY_USER; + bip->bip_iter.bi_sector = seed; + bip->bip_iter.bi_size = len; + return 0; +} + +static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, + int nr_vecs, ssize_t bytes, ssize_t offset) +{ + unsigned int nr_bvecs = 0; + int i, j; + + for (i = 0; i < nr_vecs; i = j) { + size_t size = min_t(size_t, bytes, PAGE_SIZE - offset); + struct folio *folio = page_folio(pages[i]); + + bytes -= size; + for (j = i + 1; j < nr_vecs; j++) { + size_t next = min_t(size_t, PAGE_SIZE, bytes); + + if (page_folio(pages[j]) != folio || + pages[j] != pages[j - 1] + 1) + break; + unpin_user_page(pages[j]); + size += next; + bytes -= next; + } + + bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset); + offset = 0; + nr_bvecs++; + } + + return nr_bvecs; +} + +int 
bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, + u32 seed) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + unsigned int align = q->dma_pad_mask | queue_dma_alignment(q); + struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; + struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + unsigned int direction, nr_bvecs; + struct iov_iter iter; + int ret, nr_vecs; + size_t offset; + bool copy; + + if (bio_integrity(bio)) + return -EINVAL; + if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q)) + return -E2BIG; + + if (bio_data_dir(bio) == READ) + direction = ITER_DEST; + else + direction = ITER_SOURCE; + + iov_iter_ubuf(&iter, direction, ubuf, bytes); + nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1); + if (nr_vecs > BIO_MAX_VECS) + return -E2BIG; + if (nr_vecs > UIO_FASTIOV) { + bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL); + if (!bvec) + return -ENOMEM; + pages = NULL; + } + + copy = !iov_iter_is_aligned(&iter, align, align); + ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset); + if (unlikely(ret < 0)) + goto free_bvec; + + nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset); + if (pages != stack_pages) + kvfree(pages); + if (nr_bvecs > queue_max_integrity_segments(q)) + copy = true; + + if (copy) + ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes, + direction, seed); + else + ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed); + if (ret) + goto release_pages; + if (bvec != stack_vec) + kfree(bvec); + + return 0; + +release_pages: + bio_integrity_unpin_bvec(bvec, nr_bvecs, false); +free_bvec: + if (bvec != stack_vec) + kfree(bvec); + return ret; +} +EXPORT_SYMBOL_GPL(bio_integrity_map_user); + /** * bio_integrity_process - Process integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for diff --git a/block/bio.c b/block/bio.c index 62419aa09d..b52b56067e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -966,10 +966,13 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page) { + unsigned int max_size = max_sectors << SECTOR_SHIFT; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors) + len = min3(len, max_size, queue_max_segment_size(q)); + if (len > max_size - bio->bi_iter.bi_size) return 0; if (bio->bi_vcnt > 0) { diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4b48c2c440..4529122e0c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -300,7 +300,7 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * - * Allocate a new blkg assocating @blkcg and @q. + * Allocate a new blkg associating @blkcg and @disk. 
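Referring back to bio_integrity_map_user() added in block/bio-integrity.c above: it lets a passthrough submitter attach user-space protection information to a bio. A sketch of a caller, assuming the bio's data payload is already mapped and that ubuf, meta_len and seed come from the caller's ioctl arguments, might be:

/* Sketch of attaching user-provided integrity metadata to a bio with
 * the helper added above; function name and parameters here are
 * illustrative. */
#include <linux/bio.h>
#include <linux/blkdev.h>

static int example_attach_user_pi(struct bio *bio, void __user *ubuf,
				  ssize_t meta_len, u32 seed)
{
	int ret;

	/* Map (or bounce-copy) the user metadata buffer onto the bio. */
	ret = bio_integrity_map_user(bio, ubuf, meta_len, seed);
	if (ret)
		return ret;

	/*
	 * Nothing further to do here: bio_integrity_free() tears the
	 * payload down on completion, including the copy-back to user
	 * space for reads when the bounce path was taken.
	 */
	return 0;
}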
*/ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) @@ -575,13 +575,13 @@ static void blkg_destroy(struct blkcg_gq *blkg) static void blkg_destroy_all(struct gendisk *disk) { struct request_queue *q = disk->queue; - struct blkcg_gq *blkg, *n; + struct blkcg_gq *blkg; int count = BLKG_DESTROY_BATCH_SIZE; int i; restart: spin_lock_irq(&q->queue_lock); - list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; if (hlist_unhashed(&blkg->blkcg_node)) @@ -1409,6 +1409,12 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) return 0; } +void blkg_init_queue(struct request_queue *q) +{ + INIT_LIST_HEAD(&q->blkg_list); + mutex_init(&q->blkcg_mutex); +} + int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -1416,9 +1422,6 @@ int blkcg_init_disk(struct gendisk *disk) bool preloaded; int ret; - INIT_LIST_HEAD(&q->blkg_list); - mutex_init(&q->blkcg_mutex); - new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; @@ -2064,6 +2067,9 @@ void bio_associate_blkg(struct bio *bio) { struct cgroup_subsys_state *css; + if (blk_op_is_passthrough(bio->bi_opf)) + return; + rcu_read_lock(); if (bio->bi_blkg) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b927a4a0ad..5b0bdc268a 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -188,6 +188,7 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; +void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); @@ -481,6 +482,7 @@ struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } diff --git a/block/blk-core.c b/block/blk-core.c index 2eca76ccf4..99d6840857 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -49,6 +49,7 @@ #include "blk-pm.h" #include "blk-cgroup.h" #include "blk-throttle.h" +#include "blk-ioprio.h" struct dentry *blk_debugfs_root; @@ -430,6 +431,8 @@ struct request_queue *blk_alloc_queue(int node_id) init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); + blkg_init_queue(q); + /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. @@ -772,6 +775,15 @@ void submit_bio_noacct(struct bio *bio) bio_clear_polled(bio); switch (bio_op(bio)) { + case REQ_OP_READ: + case REQ_OP_WRITE: + break; + case REQ_OP_FLUSH: + /* + * REQ_OP_FLUSH can't be submitted through bios, it is only + * synthetized in struct request by the flush state machine. 
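Since submit_bio_noacct() above now rejects REQ_OP_FLUSH bios outright, bio-based callers request a cache flush with an empty REQ_OP_WRITE bio carrying REQ_PREFLUSH, which is also what blkdev_issue_flush() does. A minimal sketch:

/* Sketch: a zero-length write with the preflush flag set, which the
 * flush state machine later turns into the synthetic REQ_OP_FLUSH
 * request mentioned in the comment above. */
#include <linux/bio.h>
#include <linux/blkdev.h>

static int example_flush_cache(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
	return submit_bio_wait(&bio);
}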
+ */ + goto not_supported; case REQ_OP_DISCARD: if (!bdev_max_discard_sectors(bdev)) goto not_supported; @@ -785,6 +797,10 @@ void submit_bio_noacct(struct bio *bio) if (status != BLK_STS_OK) goto end_io; break; + case REQ_OP_WRITE_ZEROES: + if (!q->limits.max_write_zeroes_sectors) + goto not_supported; + break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: @@ -796,12 +812,15 @@ void submit_bio_noacct(struct bio *bio) if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q)) goto not_supported; break; - case REQ_OP_WRITE_ZEROES: - if (!q->limits.max_write_zeroes_sectors) - goto not_supported; - break; + case REQ_OP_DRV_IN: + case REQ_OP_DRV_OUT: + /* + * Driver private operations are only used with passthrough + * requests. + */ + fallthrough; default: - break; + goto not_supported; } if (blk_throtl_bio(bio)) @@ -817,6 +836,14 @@ end_io: } EXPORT_SYMBOL(submit_bio_noacct); +static void bio_set_ioprio(struct bio *bio) +{ + /* Nobody set ioprio so far? Initialize it based on task's nice value */ + if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) + bio->bi_ioprio = get_current_ioprio(); + blkcg_set_ioprio(bio); +} + /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O @@ -839,6 +866,7 @@ void submit_bio(struct bio *bio) count_vm_events(PGPGOUT, bio_sectors(bio)); } + bio_set_ioprio(bio); submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 7ee8d85c2c..04d44f0bcb 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1261,7 +1261,7 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - u64 last_period, cur_period; + u64 __maybe_unused last_period, cur_period; u64 vtime, vtarget; int i; diff --git a/block/blk-merge.c b/block/blk-merge.c index 65e75efa9b..2d470cf217 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -115,17 +115,13 @@ static struct bio *bio_split_discard(struct bio *bio, *nsegs = 1; - /* Zero-sector (unknown) and one-sector granularities are the same. 
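With bio_set_ioprio() moved into submit_bio() above, every bio gets a task-derived default priority unless the submitter fills in bi_ioprio first. A submitter overriding the default might look like this (illustrative sketch):

/* Sketch: override the I/O priority before submission. Anything other
 * than IOPRIO_CLASS_NONE is left alone by bio_set_ioprio() above. */
#include <linux/bio.h>
#include <linux/ioprio.h>

static void example_submit_idle_prio(struct bio *bio)
{
	bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
	submit_bio(bio);
}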
*/ granularity = max(lim->discard_granularity >> 9, 1U); max_discard_sectors = min(lim->max_discard_sectors, bio_allowed_max_sectors(lim)); max_discard_sectors -= max_discard_sectors % granularity; - - if (unlikely(!max_discard_sectors)) { - /* XXX: warn */ + if (unlikely(!max_discard_sectors)) return NULL; - } if (bio_sectors(bio) <= max_discard_sectors) return NULL; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 5cbeb9344f..94668e72ab 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -479,23 +479,6 @@ out: return res; } -static int hctx_run_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "%lu\n", hctx->run); - return 0; -} - -static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count, - loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->run = 0; - return count; -} - static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -624,7 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, - {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"type", 0400, hctx_type_show}, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 67c95f31b1..451a2c1f1f 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -324,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; - hctx->run++; - /* * A return of -EAGAIN is an indication that hctx->dispatch is not * empty and we must run again in order to avoid starving flushes. diff --git a/block/blk-mq.c b/block/blk-mq.c index a71974a5e5..25d2f3239e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -40,7 +40,6 @@ #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" -#include "blk-ioprio.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); @@ -1248,7 +1247,8 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(rq); - if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && + !blk_rq_is_passthrough(rq)) { rq->io_start_time_ns = ktime_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; @@ -2906,8 +2906,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, return NULL; } -/* return true if this @rq can be used for @bio */ -static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, +/* + * Check if we can use the passed on request for submitting the passed in bio, + * and remove it from the request list if it can be used. + */ +static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); @@ -2935,14 +2938,6 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, return true; } -static void bio_set_ioprio(struct bio *bio) -{ - /* Nobody set ioprio so far? Initialize it based on task's nice value */ - if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) - bio->bi_ioprio = get_current_ioprio(); - blkcg_set_ioprio(bio); -} - /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. 
@@ -2967,7 +2962,6 @@ void blk_mq_submit_bio(struct bio *bio) blk_status_t ret; bio = blk_queue_bounce(bio, q); - bio_set_ioprio(bio); if (plug) { rq = rq_list_peek(&plug->cached_rq); @@ -2984,7 +2978,7 @@ void blk_mq_submit_bio(struct bio *bio) return; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; - if (blk_mq_can_use_cached_rq(rq, plug, bio)) + if (blk_mq_use_cached_rq(rq, plug, bio)) goto done; percpu_ref_get(&q->q_usage_counter); } else { diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index f48ee150d6..37245c97ee 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -118,7 +118,7 @@ static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) static inline void rq_qos_done(struct request_queue *q, struct request *rq) { - if (q->rq_qos) + if (q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } diff --git a/block/blk-settings.c b/block/blk-settings.c index 7019b8e204..5adadce084 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -48,7 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; lim->max_secure_erase_sectors = 0; - lim->discard_granularity = 0; + lim->discard_granularity = 512; lim->discard_alignment = 0; lim->discard_misaligned = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; @@ -56,7 +56,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->alignment_offset = 0; lim->io_opt = 0; lim->misaligned = 0; - lim->zoned = BLK_ZONED_NONE; + lim->zoned = false; lim->zone_write_granularity = 0; lim->dma_alignment = 511; } @@ -127,8 +127,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto if ((max_hw_sectors << 9) < PAGE_SIZE) { max_hw_sectors = 1 << (PAGE_SHIFT - 9); - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_hw_sectors); + pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors); } max_hw_sectors = round_down(max_hw_sectors, @@ -140,7 +139,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto if (limits->max_user_sectors) max_sectors = min(max_sectors, limits->max_user_sectors); else - max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS); + max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP); max_sectors = round_down(max_sectors, limits->logical_block_size >> SECTOR_SHIFT); @@ -248,8 +247,7 @@ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments { if (!max_segments) { max_segments = 1; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_segments); + pr_info("%s: set to minimum %u\n", __func__, max_segments); } q->limits.max_segments = max_segments; @@ -285,8 +283,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) { if (max_size < PAGE_SIZE) { max_size = PAGE_SIZE; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_size); + pr_info("%s: set to minimum %u\n", __func__, max_size); } /* see blk_queue_virt_boundary() for the explanation */ @@ -312,6 +309,9 @@ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) limits->logical_block_size = size; + if (limits->discard_granularity < limits->logical_block_size) + limits->discard_granularity = limits->logical_block_size; + if (limits->physical_block_size < size) limits->physical_block_size = size; @@ -342,6 +342,9 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) if (q->limits.physical_block_size < q->limits.logical_block_size) 
q->limits.physical_block_size = q->limits.logical_block_size; + if (q->limits.discard_granularity < q->limits.physical_block_size) + q->limits.discard_granularity = q->limits.physical_block_size; + if (q->limits.io_min < q->limits.physical_block_size) q->limits.io_min = q->limits.physical_block_size; } @@ -744,8 +747,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) { if (mask < PAGE_SIZE - 1) { mask = PAGE_SIZE - 1; - printk(KERN_INFO "%s: set to minimum %lx\n", - __func__, mask); + pr_info("%s: set to minimum %lx\n", __func__, mask); } q->limits.seg_boundary_mask = mask; @@ -845,8 +847,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else blk_queue_flag_clear(QUEUE_FLAG_FUA, q); - - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); @@ -888,81 +888,22 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); -static bool disk_has_partitions(struct gendisk *disk) -{ - unsigned long idx; - struct block_device *part; - bool ret = false; - - rcu_read_lock(); - xa_for_each(&disk->part_tbl, idx, part) { - if (bdev_is_partition(part)) { - ret = true; - break; - } - } - rcu_read_unlock(); - - return ret; -} - /** - * disk_set_zoned - configure the zoned model for a disk - * @disk: the gendisk of the queue to configure - * @model: the zoned model to set - * - * Set the zoned model of @disk to @model. - * - * When @model is BLK_ZONED_HM (host managed), this should be called only - * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option). - * If @model specifies BLK_ZONED_HA (host aware), the effective model used - * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions - * on the disk. + * disk_set_zoned - inidicate a zoned device + * @disk: gendisk to configure */ -void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model) +void disk_set_zoned(struct gendisk *disk) { struct request_queue *q = disk->queue; - unsigned int old_model = q->limits.zoned; - - switch (model) { - case BLK_ZONED_HM: - /* - * Host managed devices are supported only if - * CONFIG_BLK_DEV_ZONED is enabled. - */ - WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); - break; - case BLK_ZONED_HA: - /* - * Host aware devices can be treated either as regular block - * devices (similar to drive managed devices) or as zoned block - * devices to take advantage of the zone command set, similarly - * to host managed devices. We try the latter if there are no - * partitions and zoned block device support is enabled, else - * we do nothing special as far as the block layer is concerned. - */ - if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || - disk_has_partitions(disk)) - model = BLK_ZONED_NONE; - break; - case BLK_ZONED_NONE: - default: - if (WARN_ON_ONCE(model != BLK_ZONED_NONE)) - model = BLK_ZONED_NONE; - break; - } - q->limits.zoned = model; - if (model != BLK_ZONED_NONE) { - /* - * Set the zone write granularity to the device logical block - * size by default. The driver can change this value if needed. - */ - blk_queue_zone_write_granularity(q, - queue_logical_block_size(q)); - } else if (old_model != BLK_ZONED_NONE) { - disk_clear_zone_settings(disk); - } + WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); + + /* + * Set the zone write granularity to the device logical block + * size by default. The driver can change this value if needed. 
+ */ + q->limits.zoned = true; + blk_queue_zone_write_granularity(q, queue_logical_block_size(q)); } EXPORT_SYMBOL_GPL(disk_set_zoned); diff --git a/block/blk-stat.c b/block/blk-stat.c index 7ff76ae6c7..e42c263e53 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -27,7 +27,7 @@ void blk_rq_stat_init(struct blk_rq_stat *stat) /* src is a per-cpu stat, mean isn't initialized */ void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { - if (!src->nr_samples) + if (dst->nr_samples + src->nr_samples <= dst->nr_samples) return; dst->min = min(dst->min, src->min); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0b2d047663..6b2429cad8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -241,7 +241,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (max_sectors_kb == 0) { q->limits.max_user_sectors = 0; max_sectors_kb = min(max_hw_sectors_kb, - BLK_DEF_MAX_SECTORS >> 1); + BLK_DEF_MAX_SECTORS_CAP >> 1); } else { if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) @@ -309,14 +309,9 @@ QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); static ssize_t queue_zoned_show(struct request_queue *q, char *page) { - switch (blk_queue_zoned_model(q)) { - case BLK_ZONED_HA: - return sprintf(page, "host-aware\n"); - case BLK_ZONED_HM: + if (blk_queue_is_zoned(q)) return sprintf(page, "host-managed\n"); - default: - return sprintf(page, "none\n"); - } + return sprintf(page, "none\n"); } static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index f8fda9cf58..0c0e270a82 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -84,8 +84,6 @@ struct rq_wb { u64 sync_issue; void *sync_cookie; - unsigned int wc; - unsigned long last_issue; /* last non-throttled issue */ unsigned long last_comp; /* last non-throttled comp */ unsigned long min_lat_nsec; @@ -207,7 +205,8 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, */ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; - else if (rwb->wc && !wb_recent_wait(rwb)) + else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) && + !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; @@ -699,13 +698,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq) } } -void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) -{ - struct rq_qos *rqos = wbt_rq_qos(q); - if (rqos) - RQWB(rqos)->wc = write_cache_on; -} - /* * Enable wbt if defaults are configured that way */ @@ -918,7 +910,6 @@ int wbt_init(struct gendisk *disk) rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; - rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags); rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q); rwb->rq_depth.queue_depth = blk_queue_depth(q); diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 8a029e138f..e5fc653b9b 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -12,8 +12,6 @@ u64 wbt_get_min_lat(struct request_queue *q); void wbt_set_min_lat(struct request_queue *q, u64 val); bool wbt_disabled(struct request_queue *); -void wbt_set_write_cache(struct request_queue *, bool); - u64 wbt_default_latency_nsec(struct request_queue *); #else @@ -24,9 +22,6 @@ static inline void wbt_disable_default(struct gendisk *disk) static inline void wbt_enable_default(struct gendisk *disk) { } -static inline void wbt_set_write_cache(struct request_queue *q, bool wc) -{ -} #endif 
/* CONFIG_BLK_WBT */ diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 619ee41a51..d343e5756a 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -498,7 +498,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, set_bit(idx, args->conv_zones_bitmap); break; case BLK_ZONE_TYPE_SEQWRITE_REQ: - case BLK_ZONE_TYPE_SEQWRITE_PREF: if (!args->seq_zones_wlock) { args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, args->nr_zones); @@ -506,6 +505,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENOMEM; } break; + case BLK_ZONE_TYPE_SEQWRITE_PREF: default: pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", disk->disk_name, (int)zone->type, zone->start); @@ -615,22 +615,3 @@ int blk_revalidate_disk_zones(struct gendisk *disk, return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); - -void disk_clear_zone_settings(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - - blk_mq_freeze_queue(q); - - disk_free_zone_bitmaps(disk); - blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q); - q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE; - disk->nr_zones = 0; - disk->max_open_zones = 0; - disk->max_active_zones = 0; - q->limits.chunk_sectors = 0; - q->limits.zone_write_granularity = 0; - q->limits.max_zone_append_sectors = 0; - - blk_mq_unfreeze_queue(q); -} diff --git a/block/blk.h b/block/blk.h index 08a358bc09..1ef920f72e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -395,14 +395,12 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, #ifdef CONFIG_BLK_DEV_ZONED void disk_free_zone_bitmaps(struct gendisk *disk); -void disk_clear_zone_settings(struct gendisk *disk); int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} -static inline void disk_clear_zone_settings(struct gendisk *disk) {} static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { diff --git a/block/fops.c b/block/fops.c index 0abaac705d..0cf8cf72cd 100644 --- a/block/fops.c +++ b/block/fops.c @@ -410,9 +410,24 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } -static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +/* + * We cannot call mpage_writepages() as it does not take the buffer lock. + * We must use block_write_full_folio() directly which holds the buffer + * lock. The buffer lock provides the synchronisation with writeback + * that filesystems rely on when they use the blockdev's mapping. 
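With the zoned-model enum and disk_clear_zone_settings() removed in the blk-settings.c and blk-zoned.c hunks above, a host-managed driver simply calls the single-argument disk_set_zoned() while configuring its queue. A rough sketch, with the zone geometry assumed to come from the driver's probe path:

/* Sketch of a zoned driver signalling zone support with the simplified
 * disk_set_zoned(); the helper and limit calls around it are the usual
 * queue-limit setters, shown here only for context. */
#include <linux/blkdev.h>

static void example_setup_zoned(struct gendisk *disk,
				unsigned int zone_sectors)
{
	disk_set_zoned(disk);
	blk_queue_chunk_sectors(disk->queue, zone_sectors);
	blk_queue_max_zone_append_sectors(disk->queue, zone_sectors);
}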
+ */ +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page, blkdev_get_block, wbc); + struct blk_plug plug; + int err; + + blk_start_plug(&plug); + err = write_cache_pages(mapping, wbc, block_write_full_folio, + blkdev_get_block); + blk_finish_plug(&plug); + + return err; } static int blkdev_read_folio(struct file *file, struct folio *folio) @@ -449,7 +464,7 @@ const struct address_space_operations def_blk_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, - .writepage = blkdev_writepage, + .writepages = blkdev_writepages, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .migrate_folio = buffer_migrate_folio_norefs, @@ -500,7 +515,7 @@ const struct address_space_operations def_blk_aops = { .readahead = blkdev_readahead, .writepages = blkdev_writepages, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .migrate_folio = filemap_migrate_folio, }; #endif /* CONFIG_BUFFER_HEAD */ diff --git a/block/ioctl.c b/block/ioctl.c index 438f79c564..5f8c988239 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -556,7 +556,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; - return disk_scan_partitions(bdev->bd_disk, mode); + return disk_scan_partitions(bdev->bd_disk, + mode | BLK_OPEN_STRICT_SCAN); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: diff --git a/block/ioprio.c b/block/ioprio.c index b5a942519a..73301a2614 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -139,32 +139,6 @@ out: return ret; } -/* - * If the task has set an I/O priority, use that. Otherwise, return - * the default I/O priority. - * - * Expected to be called for current task or with task_lock() held to keep - * io_context stable. - */ -int __get_task_ioprio(struct task_struct *p) -{ - struct io_context *ioc = p->io_context; - int prio; - - if (p != current) - lockdep_assert_held(&p->alloc_lock); - if (ioc) - prio = ioc->ioprio; - else - prio = IOPRIO_DEFAULT; - - if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) - prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), - task_nice_ioprio(p)); - return prio; -} -EXPORT_SYMBOL_GPL(__get_task_ioprio); - static int get_task_ioprio(struct task_struct *p) { int ret; diff --git a/block/partitions/core.c b/block/partitions/core.c index f14602022c..5f5ed5c75f 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -305,18 +305,10 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, * Partitions are not supported on zoned block devices that are used as * such. 
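The BLK_OPEN_STRICT_SCAN change to BLKRRPART above means a failed partition rescan is now reported to the ioctl caller instead of being silently dropped. A userspace check, with the device path purely illustrative, could be:

/* Userspace sketch: trigger a partition rescan and report any error
 * the kernel now propagates for BLKRRPART. */
#include <errno.h>
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/sda", O_RDONLY);	/* illustrative device */

	if (fd < 0)
		return 1;
	if (ioctl(fd, BLKRRPART) < 0)
		fprintf(stderr, "BLKRRPART: %s\n", strerror(errno));
	close(fd);
	return 0;
}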
*/ - switch (disk->queue->limits.zoned) { - case BLK_ZONED_HM: + if (bdev_is_zoned(disk->part0)) { pr_warn("%s: partitions not supported on host managed zoned block device\n", disk->disk_name); return ERR_PTR(-ENXIO); - case BLK_ZONED_HA: - pr_info("%s: disabling host aware zoned block device support due to partitions\n", - disk->disk_name); - disk_set_zoned(disk, BLK_ZONED_NONE); - break; - case BLK_ZONED_NONE: - break; } if (xa_load(&disk->part_tbl, partno)) @@ -575,8 +567,8 @@ static bool blk_add_partition(struct gendisk *disk, part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); + printk(KERN_ERR " %s: p%d could not be added: %pe\n", + disk->disk_name, p, part); return true; } @@ -618,7 +610,7 @@ static int blk_add_partitions(struct gendisk *disk) /* * Partitions are not supported on host managed zoned block devices. */ - if (disk->queue->limits.zoned == BLK_ZONED_HM) { + if (bdev_is_zoned(disk->part0)) { pr_warn("%s: ignoring partition table on host managed zoned block device\n", disk->disk_name); ret = 0; |