Diffstat (limited to 'block')
38 files changed, 1378 insertions, 786 deletions
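The first hunk below adds CONFIG_BLK_DEV_WRITE_MOUNTED together with a bdev_allow_write_mounted boot parameter. As a hypothetical usage example (the parser added at the end of the block/bdev.c changes uses kstrtobool(), so 0/1, y/n and on/off are all accepted), a kernel built with the option enabled can still be made to block writes to mounted block devices by booting with:

	bdev_allow_write_mounted=0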
diff --git a/block/Kconfig b/block/Kconfig
index 55ae2286a..1de4682d4 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10
 	select CRC_T10DIF
 	select CRC64_ROCKSOFT
 
+config BLK_DEV_WRITE_MOUNTED
+	bool "Allow writing to mounted block devices"
+	default y
+	help
+	  When a block device is mounted, writing to its buffer cache is very
+	  likely going to cause filesystem corruption. It is also rather easy to
+	  crash the kernel in this way since the filesystem has no practical way
+	  of detecting these writes to buffer cache and verifying its metadata
+	  integrity. However there are some setups that need this capability
+	  like running fsck on read-only mounted root device, modifying some
+	  features on mounted ext4 filesystem, and similar. If you say N, the
+	  kernel will prevent processes from writing to block devices that are
+	  mounted by filesystems which provides some more protection from runaway
+	  privileged processes and generally makes it much harder to crash
+	  filesystem drivers. Note however that this does not prevent
+	  underlying device(s) from being modified by other means, e.g. by
+	  directly submitting SCSI commands or through access to lower layers of
+	  storage stack. If in doubt, say Y. The configuration can be overridden
+	  with the bdev_allow_write_mounted boot option.
+
 config BLK_DEV_ZONED
 	bool "Zoned block device support"
 	select MQ_IOSCHED_DEADLINE
diff --git a/block/bdev.c b/block/bdev.c
index 750aec178..da2a167a4 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -30,6 +30,9 @@
 #include "../fs/internal.h"
 #include "blk.h"
 
+/* Should we allow writing to mounted block devices? */
+static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
+
 struct bdev_inode {
 	struct block_device bdev;
 	struct inode vfs_inode;
@@ -46,6 +49,12 @@ struct block_device *I_BDEV(struct inode *inode)
 }
 EXPORT_SYMBOL(I_BDEV);
 
+struct block_device *file_bdev(struct file *bdev_file)
+{
+	return I_BDEV(bdev_file->f_mapping->host);
+}
+EXPORT_SYMBOL(file_bdev);
+
 static void bdev_write_inode(struct block_device *bdev)
 {
 	struct inode *inode = bdev->bd_inode;
@@ -207,85 +216,88 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
 EXPORT_SYMBOL(sync_blockdev_range);
 
 /**
- * freeze_bdev - lock a filesystem and force it into a consistent state
+ * bdev_freeze - lock a filesystem and force it into a consistent state
  * @bdev:	blockdevice to lock
  *
  * If a superblock is found on this device, we take the s_umount semaphore
  * on it to make sure nobody unmounts until the snapshot creation is done.
  * The reference counter (bd_fsfreeze_count) guarantees that only the last
  * unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
+ * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
+ * count down in bdev_thaw(). When it becomes 0, bdev_thaw() will unfreeze
  * actually.
+ *
+ * Return: On success zero is returned, negative error code on failure.
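+ *
+ * A minimal usage sketch (illustrative only, not part of this patch;
+ * take_snapshot() is a hypothetical helper):
+ *
+ *	error = bdev_freeze(bdev);
+ *	if (error)
+ *		return error;
+ *	error = take_snapshot(bdev);
+ *	bdev_thaw(bdev);	/* the last thaw does the actual unfreeze */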
*/ -int freeze_bdev(struct block_device *bdev) +int bdev_freeze(struct block_device *bdev) { - struct super_block *sb; int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (++bdev->bd_fsfreeze_count > 1) - goto done; - - sb = get_active_super(bdev); - if (!sb) - goto sync; - if (sb->s_op->freeze_super) - error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); - deactivate_super(sb); - if (error) { - bdev->bd_fsfreeze_count--; - goto done; + if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; } - bdev->bd_fsfreeze_sb = sb; -sync: - sync_blockdev(bdev); -done: + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { + error = bdev->bd_holder_ops->freeze(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + error = sync_blockdev(bdev); + } + + if (error) + atomic_dec(&bdev->bd_fsfreeze_count); + mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } -EXPORT_SYMBOL(freeze_bdev); +EXPORT_SYMBOL(bdev_freeze); /** - * thaw_bdev - unlock filesystem + * bdev_thaw - unlock filesystem * @bdev: blockdevice to unlock * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). + * Unlocks the filesystem and marks it writeable again after bdev_freeze(). + * + * Return: On success zero is returned, negative error code on failure. */ -int thaw_bdev(struct block_device *bdev) +int bdev_thaw(struct block_device *bdev) { - struct super_block *sb; - int error = -EINVAL; + int error = -EINVAL, nr_freeze; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) + + /* + * If this returns < 0 it means that @bd_fsfreeze_count was + * already 0 and no decrement was performed. 
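+ * A return of 0 means we were the last freezer and the actual thaw
+ * happens below; a positive return means other freezes are still
+ * outstanding and the device stays frozen.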
+ */ + nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); + if (nr_freeze < 0) goto out; error = 0; - if (--bdev->bd_fsfreeze_count > 0) + if (nr_freeze > 0) goto out; - sb = bdev->bd_fsfreeze_sb; - if (!sb) - goto out; + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { + error = bdev->bd_holder_ops->thaw(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + } - if (sb->s_op->thaw_super) - error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); if (error) - bdev->bd_fsfreeze_count++; - else - bdev->bd_fsfreeze_sb = NULL; + atomic_inc(&bdev->bd_fsfreeze_count); out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } -EXPORT_SYMBOL(thaw_bdev); +EXPORT_SYMBOL(bdev_thaw); /* * pseudo-fs @@ -362,24 +374,24 @@ static struct file_system_type bd_type = { }; struct super_block *blockdev_superblock __ro_after_init; +struct vfsmount *blockdev_mnt __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; - static struct vfsmount *bd_mnt __ro_after_init; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), + SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) panic("Cannot register bdev pseudo-fs"); - bd_mnt = kern_mount(&bd_type); - if (IS_ERR(bd_mnt)) + blockdev_mnt = kern_mount(&bd_type); + if (IS_ERR(blockdev_mnt)) panic("Cannot create bdev pseudo-fs"); - blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ + blockdev_superblock = blockdev_mnt->mnt_sb; /* For writeback */ } struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) @@ -633,6 +645,14 @@ static void blkdev_flush_mapping(struct block_device *bdev) bdev_write_inode(bdev); } +static void blkdev_put_whole(struct block_device *bdev) +{ + if (atomic_dec_and_test(&bdev->bd_openers)) + blkdev_flush_mapping(bdev); + if (bdev->bd_disk->fops->release) + bdev->bd_disk->fops->release(bdev->bd_disk); +} + static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) { struct gendisk *disk = bdev->bd_disk; @@ -651,20 +671,21 @@ static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) if (!atomic_read(&bdev->bd_openers)) set_init_blocksize(bdev); - if (test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(disk, false); atomic_inc(&bdev->bd_openers); + if (test_bit(GD_NEED_PART_SCAN, &disk->state)) { + /* + * Only return scanning errors if we are called from contexts + * that explicitly want them, e.g. the BLKRRPART ioctl. + */ + ret = bdev_disk_changed(disk, false); + if (ret && (mode & BLK_OPEN_STRICT_SCAN)) { + blkdev_put_whole(bdev); + return ret; + } + } return 0; } -static void blkdev_put_whole(struct block_device *bdev) -{ - if (atomic_dec_and_test(&bdev->bd_openers)) - blkdev_flush_mapping(bdev); - if (bdev->bd_disk->fops->release) - bdev->bd_disk->fops->release(bdev->bd_disk); -} - static int blkdev_get_part(struct block_device *part, blk_mode_t mode) { struct gendisk *disk = part->bd_disk; @@ -690,6 +711,31 @@ out_blkdev_put: return ret; } +int bdev_permission(dev_t dev, blk_mode_t mode, void *holder) +{ + int ret; + + ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, + MAJOR(dev), MINOR(dev), + ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | + ((mode & BLK_OPEN_WRITE) ? 
DEVCG_ACC_WRITE : 0)); + if (ret) + return ret; + + /* Blocking writes requires exclusive opener */ + if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) + return -EINVAL; + + /* + * We're using error pointers to indicate to ->release() when we + * failed to open that block device. Also this doesn't make sense. + */ + if (WARN_ON_ONCE(IS_ERR(holder))) + return -EINVAL; + + return 0; +} + static void blkdev_put_part(struct block_device *part) { struct block_device *whole = bdev_whole(part); @@ -729,58 +775,101 @@ void blkdev_put_no_open(struct block_device *bdev) { put_device(&bdev->bd_device); } - + +static bool bdev_writes_blocked(struct block_device *bdev) +{ + return bdev->bd_writers < 0; +} + +static void bdev_block_writes(struct block_device *bdev) +{ + bdev->bd_writers--; +} + +static void bdev_unblock_writes(struct block_device *bdev) +{ + bdev->bd_writers++; +} + +static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return true; + /* Writes blocked? */ + if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev)) + return false; + if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0) + return false; + return true; +} + +static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return; + + /* Claim exclusive or shared write access. */ + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_block_writes(bdev); + else if (mode & BLK_OPEN_WRITE) + bdev->bd_writers++; +} + +static inline bool bdev_unclaimed(const struct file *bdev_file) +{ + return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host); +} + +static void bdev_yield_write_access(struct file *bdev_file) +{ + struct block_device *bdev; + + if (bdev_allow_write_mounted) + return; + + if (bdev_unclaimed(bdev_file)) + return; + + bdev = file_bdev(bdev_file); + + if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED) + bdev_unblock_writes(bdev); + else if (bdev_file->f_mode & FMODE_WRITE) + bdev->bd_writers--; +} + /** - * blkdev_get_by_dev - open a block device by device number - * @dev: device number of block device to open + * bdev_open - open a block device + * @bdev: block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier * @hops: holder operations + * @bdev_file: file for the block device * - * Open the block device described by device number @dev. If @holder is not - * %NULL, the block device is opened with exclusive access. Exclusive opens may - * nest for the same @holder. - * - * Use this interface ONLY if you really do not have anything better - i.e. when - * you are behind a truly sucky interface and all you are given is a device - * number. Everything else should use blkdev_get_by_path(). + * Open the block device. If @holder is not %NULL, the block device is opened + * with exclusive access. Exclusive opens may nest for the same @holder. * * CONTEXT: * Might sleep. * * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. + * zero on success, -errno on failure. */ -struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops) +int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops, struct file *bdev_file) { bool unblock_events = true; - struct block_device *bdev; - struct gendisk *disk; + struct gendisk *disk = bdev->bd_disk; int ret; - ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, - MAJOR(dev), MINOR(dev), - ((mode & BLK_OPEN_READ) ? 
DEVCG_ACC_READ : 0) | - ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); - if (ret) - return ERR_PTR(ret); - - bdev = blkdev_get_no_open(dev); - if (!bdev) - return ERR_PTR(-ENXIO); - disk = bdev->bd_disk; - if (holder) { mode |= BLK_OPEN_EXCL; ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) - goto put_blkdev; + return ret; } else { - if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) { - ret = -EIO; - goto put_blkdev; - } + if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) + return -EIO; } disk_block_events(disk); @@ -791,12 +880,16 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, goto abort_claiming; if (!try_module_get(disk->fops->owner)) goto abort_claiming; + ret = -EBUSY; + if (!bdev_may_open(bdev, mode)) + goto put_module; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); else ret = blkdev_get_whole(bdev, mode); if (ret) goto put_module; + bdev_claim_write_access(bdev, mode); if (holder) { bd_finish_claiming(bdev, holder, hops); @@ -817,7 +910,18 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, if (unblock_events) disk_unblock_events(disk); - return bdev; + + bdev_file->f_flags |= O_LARGEFILE; + bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; + if (bdev_nowait(bdev)) + bdev_file->f_mode |= FMODE_NOWAIT; + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_file->f_mode |= FMODE_WRITE_RESTRICTED; + bdev_file->f_mapping = bdev->bd_inode->i_mapping; + bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); + bdev_file->private_data = holder; + + return 0; put_module: module_put(disk->fops->owner); abort_claiming: @@ -825,76 +929,80 @@ abort_claiming: bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); -put_blkdev: - blkdev_put_no_open(bdev); - return ERR_PTR(ret); + return ret; } -EXPORT_SYMBOL(blkdev_get_by_dev); -struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops) +/* + * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk + * associated with the floppy driver where it has allowed ioctls if the + * file was opened for writing, but does not allow reads or writes. + * Make sure that this quirk is reflected in @f_flags. + * + * It can also happen if a block device is opened as O_RDWR | O_WRONLY. + */ +static unsigned blk_to_file_flags(blk_mode_t mode) { - struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL); - struct block_device *bdev; + unsigned int flags = 0; + + if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) == + (BLK_OPEN_READ | BLK_OPEN_WRITE)) + flags |= O_RDWR; + else if (mode & BLK_OPEN_WRITE_IOCTL) + flags |= O_RDWR | O_WRONLY; + else if (mode & BLK_OPEN_WRITE) + flags |= O_WRONLY; + else if (mode & BLK_OPEN_READ) + flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */ + else + WARN_ON_ONCE(true); - if (!handle) - return ERR_PTR(-ENOMEM); - bdev = blkdev_get_by_dev(dev, mode, holder, hops); - if (IS_ERR(bdev)) { - kfree(handle); - return ERR_CAST(bdev); - } - handle->bdev = bdev; - handle->holder = holder; - if (holder) - mode |= BLK_OPEN_EXCL; - handle->mode = mode; - return handle; + if (mode & BLK_OPEN_NDELAY) + flags |= O_NDELAY; + + return flags; } -EXPORT_SYMBOL(bdev_open_by_dev); -/** - * blkdev_get_by_path - open a block device by name - * @path: path to the block device to open - * @mode: open mode (BLK_OPEN_*) - * @holder: exclusive holder identifier - * @hops: holder operations - * - * Open the block device described by the device file at @path. 
If @holder is - * not %NULL, the block device is opened with exclusive access. Exclusive opens - * may nest for the same @holder. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. - */ -struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops) +struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops) { + struct file *bdev_file; struct block_device *bdev; - dev_t dev; - int error; + unsigned int flags; + int ret; - error = lookup_bdev(path, &dev); - if (error) - return ERR_PTR(error); + ret = bdev_permission(dev, mode, holder); + if (ret) + return ERR_PTR(ret); + + bdev = blkdev_get_no_open(dev); + if (!bdev) + return ERR_PTR(-ENXIO); - bdev = blkdev_get_by_dev(dev, mode, holder, hops); - if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - blkdev_put(bdev, holder); - return ERR_PTR(-EACCES); + flags = blk_to_file_flags(mode); + bdev_file = alloc_file_pseudo_noaccount(bdev->bd_inode, + blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops); + if (IS_ERR(bdev_file)) { + blkdev_put_no_open(bdev); + return bdev_file; } + ihold(bdev->bd_inode); - return bdev; + ret = bdev_open(bdev, mode, holder, hops, bdev_file); + if (ret) { + /* We failed to open the block device. Let ->release() know. */ + bdev_file->private_data = ERR_PTR(ret); + fput(bdev_file); + return ERR_PTR(ret); + } + return bdev_file; } -EXPORT_SYMBOL(blkdev_get_by_path); +EXPORT_SYMBOL(bdev_file_open_by_dev); -struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops) +struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, + void *holder, + const struct blk_holder_ops *hops) { - struct bdev_handle *handle; + struct file *file; dev_t dev; int error; @@ -902,21 +1010,42 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, if (error) return ERR_PTR(error); - handle = bdev_open_by_dev(dev, mode, holder, hops); - if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) && - bdev_read_only(handle->bdev)) { - bdev_release(handle); - return ERR_PTR(-EACCES); + file = bdev_file_open_by_dev(dev, mode, holder, hops); + if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) { + if (bdev_read_only(file_bdev(file))) { + fput(file); + file = ERR_PTR(-EACCES); + } } - return handle; + return file; +} +EXPORT_SYMBOL(bdev_file_open_by_path); + +static inline void bd_yield_claim(struct file *bdev_file) +{ + struct block_device *bdev = file_bdev(bdev_file); + void *holder = bdev_file->private_data; + + lockdep_assert_held(&bdev->bd_disk->open_mutex); + + if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder))) + return; + + if (!bdev_unclaimed(bdev_file)) + bd_end_claim(bdev, holder); } -EXPORT_SYMBOL(bdev_open_by_path); -void blkdev_put(struct block_device *bdev, void *holder) +void bdev_release(struct file *bdev_file) { + struct block_device *bdev = file_bdev(bdev_file); + void *holder = bdev_file->private_data; struct gendisk *disk = bdev->bd_disk; + /* We failed to open that block device. */ + if (IS_ERR(holder)) + goto put_no_open; + /* * Sync early if it looks like we're the last one. 
If someone else * opens the block device between now and the decrement of bd_openers @@ -928,8 +1057,10 @@ void blkdev_put(struct block_device *bdev, void *holder) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); + bdev_yield_write_access(bdev_file); + if (holder) - bd_end_claim(bdev, holder); + bd_yield_claim(bdev_file); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE @@ -945,16 +1076,42 @@ void blkdev_put(struct block_device *bdev, void *holder) mutex_unlock(&disk->open_mutex); module_put(disk->fops->owner); +put_no_open: blkdev_put_no_open(bdev); } -EXPORT_SYMBOL(blkdev_put); -void bdev_release(struct bdev_handle *handle) +/** + * bdev_fput - yield claim to the block device and put the file + * @bdev_file: open block device + * + * Yield claim on the block device and put the file. Ensure that the + * block device can be reclaimed before the file is closed which is a + * deferred operation. + */ +void bdev_fput(struct file *bdev_file) { - blkdev_put(handle->bdev, handle->holder); - kfree(handle); + if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops)) + return; + + if (bdev_file->private_data) { + struct block_device *bdev = file_bdev(bdev_file); + struct gendisk *disk = bdev->bd_disk; + + mutex_lock(&disk->open_mutex); + bdev_yield_write_access(bdev_file); + bd_yield_claim(bdev_file); + /* + * Tell release we already gave up our hold on the + * device and if write restrictions are available that + * we already gave up write access to the device. + */ + bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host); + mutex_unlock(&disk->open_mutex); + } + + fput(bdev_file); } -EXPORT_SYMBOL(bdev_release); +EXPORT_SYMBOL(bdev_fput); /** * lookup_bdev() - Look up a struct block_device by name. @@ -1102,3 +1259,12 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) blkdev_put_no_open(bdev); } + +static int __init setup_bdev_allow_write_mounted(char *str) +{ + if (kstrtobool(str, &bdev_allow_write_mounted)) + pr_warn("Invalid option string for bdev_allow_write_mounted:" + " '%s'\n", str); + return 1; +} +__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted); diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 2c90e5de0..d442ee358 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -127,7 +127,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) if (!bfqg_stats_waiting(stats)) return; - now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_group_wait_time) bfq_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); @@ -144,7 +144,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, return; if (bfqg == curr_bfqg) return; - stats->start_group_wait_time = ktime_get_ns(); + stats->start_group_wait_time = blk_time_get_ns(); bfqg_stats_mark_waiting(stats); } @@ -156,7 +156,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) if (!bfqg_stats_empty(stats)) return; - now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_empty_time) bfq_stat_add(&stats->empty_time, now - stats->start_empty_time); @@ -183,7 +183,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) if (bfqg_stats_empty(stats)) return; - stats->start_empty_time = ktime_get_ns(); + stats->start_empty_time = blk_time_get_ns(); bfqg_stats_mark_empty(stats); } @@ -192,7 +192,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_idling(stats)) { - u64 now = ktime_get_ns(); + u64 now = 
blk_time_get_ns(); if (now > stats->start_idle_time) bfq_stat_add(&stats->idle_time, @@ -205,7 +205,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; - stats->start_idle_time = ktime_get_ns(); + stats->start_idle_time = blk_time_get_ns(); bfqg_stats_mark_idling(stats); } @@ -242,7 +242,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { struct bfqg_stats *stats = &bfqg->stats; - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns(); if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, opf, diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3cce6de46..4b88a54a9 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1005,7 +1005,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, rq = rq_entry_fifo(bfqq->fifo.next); - if (rq == last || ktime_get_ns() < rq->fifo_time) + if (rq == last || blk_time_get_ns() < rq->fifo_time) return NULL; bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); @@ -1829,7 +1829,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * bfq_bfqq_update_budg_for_activation for * details on the usage of the next variable. */ - arrived_in_time = ktime_get_ns() <= + arrived_in_time = blk_time_get_ns() <= bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); @@ -2208,7 +2208,7 @@ static void bfq_add_request(struct request *rq) struct request *next_rq, *prev; unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; @@ -2262,7 +2262,7 @@ static void bfq_add_request(struct request *rq) bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + msecs_to_jiffies(10))) { - bfqd->last_empty_occupied_ns = ktime_get_ns(); + bfqd->last_empty_occupied_ns = blk_time_get_ns(); /* * Start the state machine for measuring the * total service time of rq: setting @@ -3294,7 +3294,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, else timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - bfqd->last_budget_start = ktime_get(); + bfqd->last_budget_start = blk_time_get(); bfqq->budget_timeout = jiffies + bfqd->bfq_timeout * timeout_coeff; @@ -3394,7 +3394,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) else if (bfqq->wr_coeff > 1) sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC); - bfqd->last_idling_start = ktime_get(); + bfqd->last_idling_start = blk_time_get(); bfqd->last_idling_start_jiffies = jiffies; hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), @@ -3433,7 +3433,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) { if (rq != NULL) { /* new rq dispatch now, reset accordingly */ - bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); + bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns(); bfqd->peak_rate_samples = 1; bfqd->sequential_samples = 0; bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = @@ -3590,7 +3590,7 @@ reset_computation: */ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); if (bfqd->peak_rate_samples == 0) { /* first dispatch */ bfq_log(bfqd, "update_peak_rate: goto reset, samples %d", @@ -4162,7 +4162,7 @@ static bool 
bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (compensate) delta_ktime = bfqd->last_idling_start; else - delta_ktime = ktime_get(); + delta_ktime = blk_time_get(); delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); delta_usecs = ktime_to_us(delta_ktime); @@ -5591,7 +5591,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, pid_t pid, int is_sync, unsigned int act_idx) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); bfqq->actuator_idx = act_idx; RB_CLEAR_NODE(&bfqq->entity.rb_node); @@ -5903,7 +5903,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, */ if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) return; - elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; @@ -6194,7 +6194,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bfq_add_request(rq); idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); - rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + rq->fifo_time = blk_time_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); bfq_rq_enqueued(bfqd, bfqq, rq); @@ -6370,7 +6370,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_weights_tree_remove(bfqq); } - now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns(); bfqq->ttime.last_end_request = now_ns; @@ -6585,7 +6585,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) static void bfq_update_inject_limit(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; + u64 tot_time_ns = blk_time_get_ns() - bfqd->last_empty_occupied_ns; unsigned int old_limit = bfqq->inject_limit; if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { diff --git a/block/bio-integrity.c b/block/bio-integrity.c index ec8ac8cf6..2e3e8e049 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -69,15 +69,15 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, memset(bip, 0, sizeof(*bip)); + /* always report as many vecs as asked explicitly, not inline vecs */ + bip->bip_max_vcnt = nr_vecs; if (nr_vecs > inline_vecs) { - bip->bip_max_vcnt = nr_vecs; bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; } else { bip->bip_vec = bip->bip_inline_vecs; - bip->bip_max_vcnt = inline_vecs; } bip->bip_bio = bio; @@ -91,6 +91,47 @@ err: } EXPORT_SYMBOL(bio_integrity_alloc); +static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs, + bool dirty) +{ + int i; + + for (i = 0; i < nr_vecs; i++) { + if (dirty && !PageCompound(bv[i].bv_page)) + set_page_dirty_lock(bv[i].bv_page); + unpin_user_page(bv[i].bv_page); + } +} + +static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) +{ + unsigned short nr_vecs = bip->bip_max_vcnt - 1; + struct bio_vec *copy = &bip->bip_vec[1]; + size_t bytes = bip->bip_iter.bi_size; + struct iov_iter iter; + int ret; + + iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); + ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter); + WARN_ON_ONCE(ret != bytes); + + bio_integrity_unpin_bvec(copy, nr_vecs, true); +} + +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip) +{ + bool dirty = 
bio_data_dir(bip->bip_bio) == READ; + + if (bip->bip_flags & BIP_COPY_USER) { + if (dirty) + bio_integrity_uncopy_user(bip); + kfree(bvec_virt(bip->bip_vec)); + return; + } + + bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty); +} + /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed @@ -105,6 +146,8 @@ void bio_integrity_free(struct bio *bio) if (bip->bip_flags & BIP_BLOCK_INTEGRITY) kfree(bvec_virt(bip->bip_vec)); + else if (bip->bip_flags & BIP_INTEGRITY_USER) + bio_integrity_unmap_user(bip); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; @@ -160,6 +203,177 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_integrity_add_page); +static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, + int nr_vecs, unsigned int len, + unsigned int direction, u32 seed) +{ + bool write = direction == ITER_SOURCE; + struct bio_integrity_payload *bip; + struct iov_iter iter; + void *buf; + int ret; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (write) { + iov_iter_bvec(&iter, direction, bvec, nr_vecs, len); + if (!copy_from_iter_full(buf, len, &iter)) { + ret = -EFAULT; + goto free_buf; + } + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + } else { + memset(buf, 0, len); + + /* + * We need to preserve the original bvec and the number of vecs + * in it for completion handling + */ + bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1); + } + + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto free_buf; + } + + if (write) + bio_integrity_unpin_bvec(bvec, nr_vecs, false); + else + memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec)); + + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret != len) { + ret = -ENOMEM; + goto free_bip; + } + + bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER; + bip->bip_iter.bi_sector = seed; + return 0; +free_bip: + bio_integrity_free(bio); +free_buf: + kfree(buf); + return ret; +} + +static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, + int nr_vecs, unsigned int len, u32 seed) +{ + struct bio_integrity_payload *bip; + + bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs); + if (IS_ERR(bip)) + return PTR_ERR(bip); + + memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec)); + bip->bip_flags |= BIP_INTEGRITY_USER; + bip->bip_iter.bi_sector = seed; + bip->bip_iter.bi_size = len; + return 0; +} + +static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, + int nr_vecs, ssize_t bytes, ssize_t offset) +{ + unsigned int nr_bvecs = 0; + int i, j; + + for (i = 0; i < nr_vecs; i = j) { + size_t size = min_t(size_t, bytes, PAGE_SIZE - offset); + struct folio *folio = page_folio(pages[i]); + + bytes -= size; + for (j = i + 1; j < nr_vecs; j++) { + size_t next = min_t(size_t, PAGE_SIZE, bytes); + + if (page_folio(pages[j]) != folio || + pages[j] != pages[j - 1] + 1) + break; + unpin_user_page(pages[j]); + size += next; + bytes -= next; + } + + bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset); + offset = 0; + nr_bvecs++; + } + + return nr_bvecs; +} + +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, + u32 seed) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + unsigned int align = q->dma_pad_mask | queue_dma_alignment(q); + struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; + struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + unsigned int direction, nr_bvecs; + struct iov_iter iter; + int ret, 
nr_vecs; + size_t offset; + bool copy; + + if (bio_integrity(bio)) + return -EINVAL; + if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q)) + return -E2BIG; + + if (bio_data_dir(bio) == READ) + direction = ITER_DEST; + else + direction = ITER_SOURCE; + + iov_iter_ubuf(&iter, direction, ubuf, bytes); + nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1); + if (nr_vecs > BIO_MAX_VECS) + return -E2BIG; + if (nr_vecs > UIO_FASTIOV) { + bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL); + if (!bvec) + return -ENOMEM; + pages = NULL; + } + + copy = !iov_iter_is_aligned(&iter, align, align); + ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset); + if (unlikely(ret < 0)) + goto free_bvec; + + nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset); + if (pages != stack_pages) + kvfree(pages); + if (nr_bvecs > queue_max_integrity_segments(q)) + copy = true; + + if (copy) + ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes, + direction, seed); + else + ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed); + if (ret) + goto release_pages; + if (bvec != stack_vec) + kfree(bvec); + + return 0; + +release_pages: + bio_integrity_unpin_bvec(bvec, nr_bvecs, false); +free_bvec: + if (bvec != stack_vec) + kfree(bvec); + return ret; +} +EXPORT_SYMBOL_GPL(bio_integrity_map_user); + /** * bio_integrity_process - Process integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for @@ -181,6 +395,7 @@ static blk_status_t bio_integrity_process(struct bio *bio, iter.tuple_size = bi->tuple_size; iter.seed = proc_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); + iter.pi_offset = bi->pi_offset; __bio_for_each_segment(bv, bio, bviter, *proc_iter) { void *kaddr = bvec_kmap_local(&bv); diff --git a/block/bio.c b/block/bio.c index 62419aa09..d24420ed1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -16,7 +16,6 @@ #include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/highmem.h> -#include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> #include <linux/xarray.h> @@ -251,6 +250,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_opf = opf; bio->bi_flags = 0; bio->bi_ioprio = 0; + bio->bi_write_hint = 0; bio->bi_status = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; @@ -762,29 +762,31 @@ static inline void bio_put_percpu_cache(struct bio *bio) struct bio_alloc_cache *cache; cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); - if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) { - put_cpu(); - bio_free(bio); - return; - } + if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) + goto out_free; - bio_uninit(bio); - - if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) { + if (in_task()) { + bio_uninit(bio); bio->bi_next = cache->free_list; + /* Not necessary but helps not to iopoll already freed bios */ bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; - } else { - unsigned long flags; + } else if (in_hardirq()) { + lockdep_assert_irqs_disabled(); - local_irq_save(flags); + bio_uninit(bio); bio->bi_next = cache->free_list_irq; cache->free_list_irq = bio; cache->nr_irq++; - local_irq_restore(flags); + } else { + goto out_free; } put_cpu(); + return; +out_free: + put_cpu(); + bio_free(bio); } /** @@ -813,6 +815,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; if 
(bio->bi_bdev) { @@ -966,10 +969,13 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page) { + unsigned int max_size = max_sectors << SECTOR_SHIFT; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors) + len = min3(len, max_size, queue_max_segment_size(q)); + if (len > max_size - bio->bi_iter.bi_size) return 0; if (bio->bi_vcnt > 0) { @@ -1367,21 +1373,12 @@ int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); - unsigned long hang_check; bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); - - /* Prevent hang_check timer from firing at us during very long I/O */ - hang_check = sysctl_hung_task_timeout_secs; - if (hang_check) - while (!wait_for_completion_io_timeout(&done, - hang_check * (HZ/2))) - ; - else - wait_for_completion_io(&done); + blk_wait_io(&done); return blk_status_to_errno(bio->bi_status); } diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4b48c2c44..059467086 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -300,7 +300,7 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * - * Allocate a new blkg assocating @blkcg and @q. + * Allocate a new blkg associating @blkcg and @disk. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) @@ -575,13 +575,13 @@ static void blkg_destroy(struct blkcg_gq *blkg) static void blkg_destroy_all(struct gendisk *disk) { struct request_queue *q = disk->queue; - struct blkcg_gq *blkg, *n; + struct blkcg_gq *blkg; int count = BLKG_DESTROY_BATCH_SIZE; int i; restart: spin_lock_irq(&q->queue_lock); - list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; if (hlist_unhashed(&blkg->blkcg_node)) @@ -1409,6 +1409,12 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) return 0; } +void blkg_init_queue(struct request_queue *q) +{ + INIT_LIST_HEAD(&q->blkg_list); + mutex_init(&q->blkcg_mutex); +} + int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -1416,9 +1422,6 @@ int blkcg_init_disk(struct gendisk *disk) bool preloaded; int ret; - INIT_LIST_HEAD(&q->blkg_list); - mutex_init(&q->blkcg_mutex); - new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; @@ -1846,7 +1849,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; @@ -2064,6 +2067,9 @@ void bio_associate_blkg(struct bio *bio) { struct cgroup_subsys_state *css; + if (blk_op_is_passthrough(bio->bi_opf)) + return; + rcu_read_lock(); if (bio->bi_blkg) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b927a4a0a..90b3959d8 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -19,6 +19,7 @@ #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> +#include "blk.h" struct blkcg_gq; struct blkg_policy_data; @@ -188,6 +189,7 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; +void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct 
gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); @@ -481,6 +483,7 @@ struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } diff --git a/block/blk-core.c b/block/blk-core.c index 2eca76ccf..b795ac177 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -49,6 +49,7 @@ #include "blk-pm.h" #include "blk-cgroup.h" #include "blk-throttle.h" +#include "blk-ioprio.h" struct dentry *blk_debugfs_root; @@ -393,24 +394,34 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) { struct request_queue *q; + int error; q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); if (!q) - return NULL; + return ERR_PTR(-ENOMEM); q->last_merge = NULL; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); - if (q->id < 0) + if (q->id < 0) { + error = q->id; goto fail_q; + } q->stats = blk_alloc_queue_stats(); - if (!q->stats) + if (!q->stats) { + error = -ENOMEM; goto fail_id; + } + + error = blk_set_default_limits(lim); + if (error) + goto fail_stats; + q->limits = *lim; q->node = node_id; @@ -424,22 +435,25 @@ struct request_queue *blk_alloc_queue(int node_id) mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); + mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); + blkg_init_queue(q); + /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ - if (percpu_ref_init(&q->q_usage_counter, + error = percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, - PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); + if (error) goto fail_stats; - blk_set_default_limits(&q->limits); q->nr_requests = BLKDEV_DEFAULT_RQ; return q; @@ -450,7 +464,7 @@ fail_id: ida_free(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); - return NULL; + return ERR_PTR(error); } /** @@ -772,6 +786,15 @@ void submit_bio_noacct(struct bio *bio) bio_clear_polled(bio); switch (bio_op(bio)) { + case REQ_OP_READ: + case REQ_OP_WRITE: + break; + case REQ_OP_FLUSH: + /* + * REQ_OP_FLUSH can't be submitted through bios, it is only + * synthetized in struct request by the flush state machine. + */ + goto not_supported; case REQ_OP_DISCARD: if (!bdev_max_discard_sectors(bdev)) goto not_supported; @@ -785,6 +808,10 @@ void submit_bio_noacct(struct bio *bio) if (status != BLK_STS_OK) goto end_io; break; + case REQ_OP_WRITE_ZEROES: + if (!q->limits.max_write_zeroes_sectors) + goto not_supported; + break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: @@ -796,12 +823,15 @@ void submit_bio_noacct(struct bio *bio) if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q)) goto not_supported; break; - case REQ_OP_WRITE_ZEROES: - if (!q->limits.max_write_zeroes_sectors) - goto not_supported; - break; + case REQ_OP_DRV_IN: + case REQ_OP_DRV_OUT: + /* + * Driver private operations are only used with passthrough + * requests. 
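+ * (Passthrough requests are allocated directly via
+ * blk_mq_alloc_request() and never travel through here as bios, so
+ * they are rejected.)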
+ */ + fallthrough; default: - break; + goto not_supported; } if (blk_throtl_bio(bio)) @@ -817,6 +847,14 @@ end_io: } EXPORT_SYMBOL(submit_bio_noacct); +static void bio_set_ioprio(struct bio *bio) +{ + /* Nobody set ioprio so far? Initialize it based on task's nice value */ + if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) + bio->bi_ioprio = get_current_ioprio(); + blkcg_set_ioprio(bio); +} + /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O @@ -839,6 +877,7 @@ void submit_bio(struct bio *bio) count_vm_events(PGPGOUT, bio_sectors(bio)); } + bio_set_ioprio(bio); submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); @@ -1057,6 +1096,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) if (tsk->plug) return; + plug->cur_ktime = 0; plug->mq_list = NULL; plug->cached_rq = NULL; plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); @@ -1156,6 +1196,9 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) */ if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); + + plug->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; } /** @@ -1203,8 +1246,7 @@ int __init blk_dev_init(void) if (!kblockd_workqueue) panic("Failed to create kblockd\n"); - blk_requestq_cachep = kmem_cache_create("request_queue", - sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC); blk_debugfs_root = debugfs_create_dir("block", NULL); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index e6468eab2..b1e7415f8 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -172,6 +172,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; diff --git a/block/blk-flush.c b/block/blk-flush.c index 3f4d41952..b0f314f4b 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -143,7 +143,7 @@ static void blk_account_io_flush(struct request *rq) part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); part_stat_add(part, nsecs[STAT_FLUSH], - ktime_get_ns() - rq->start_time_ns); + blk_time_get_ns() - rq->start_time_ns); part_stat_unlock(); } diff --git a/block/blk-integrity.c b/block/blk-integrity.c index d4e9b4556..ccbeb6dfa 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -370,6 +370,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->profile = template->profile ? 
template->profile : &nop_profile; bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; + bi->pi_offset = template->pi_offset; blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 7ee8d85c2..690ca99df 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -829,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) /* step up/down based on the vrate */ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); - now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { if (!ioc->autop_too_fast_at) @@ -1044,7 +1044,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now) unsigned seq; u64 vrate; - now->now_ns = ktime_get(); + now->now_ns = blk_time_get_ns(); now->now = ktime_to_us(now->now_ns); vrate = atomic64_read(&ioc->vtime_rate); @@ -1261,7 +1261,7 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - u64 last_period, cur_period; + u64 __maybe_unused last_period, cur_period; u64 vtime, vtarget; int i; @@ -1347,7 +1347,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); - u64 tdelta, delay, new_delay; + u64 tdelta, delay, new_delay, shift; s64 vover, vover_pct; u32 hwa; @@ -1362,8 +1362,9 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; - if (iocg->delay) - delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); + shift = div64_u64(tdelta, USEC_PER_SEC); + if (iocg->delay && shift < BITS_PER_LONG) + delay = iocg->delay >> shift; else delay = 0; @@ -1438,8 +1439,11 @@ static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, lockdep_assert_held(&iocg->ioc->lock); lockdep_assert_held(&iocg->waitq.lock); - /* make sure that nobody messed with @iocg */ - WARN_ON_ONCE(list_empty(&iocg->active_list)); + /* + * make sure that nobody messed with @iocg. Check iocg->pd.online + * to avoid warn when removing blkcg or disk. 
+ */ + WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online); WARN_ON_ONCE(iocg->inuse > 1); iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); @@ -2817,7 +2821,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) return; } - on_q_ns = ktime_get_ns() - rq->alloc_time_ns; + on_q_ns = blk_time_get_ns() - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); @@ -2900,7 +2904,7 @@ static int blk_iocost_init(struct gendisk *disk) ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); - ioc->period_at = ktime_to_us(ktime_get()); + ioc->period_at = ktime_to_us(blk_time_get()); atomic64_set(&ioc->cur_period, 0); atomic_set(&ioc->hweight_gen, 0); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c1a6aba1d..ebb522788 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -609,7 +609,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) if (!iolat->blkiolat->enabled) return; - now = ktime_to_ns(ktime_get()); + now = blk_time_get_ns(); while (blkg && blkg->parent) { iolat = blkg_to_lat(blkg); if (!iolat) { @@ -661,7 +661,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); rcu_read_lock(); blkg_for_each_descendant_pre(blkg, pos_css, @@ -985,7 +985,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) struct blkcg_gq *blkg = lat_to_blkg(iolat); struct rq_qos *rqos = iolat_rq_qos(blkg->q); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); int cpu; if (blk_queue_nonrot(blkg->q)) diff --git a/block/blk-lib.c b/block/blk-lib.c index e59c3069e..a6954eafb 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -120,31 +120,28 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, struct bio **biop, unsigned flags) { struct bio *bio = *biop; - unsigned int max_write_zeroes_sectors; + unsigned int max_sectors; if (bdev_read_only(bdev)) return -EPERM; - /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ - max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); + /* Ensure that max_sectors doesn't overflow bi_size */ + max_sectors = bdev_write_zeroes_sectors(bdev); - if (max_write_zeroes_sectors == 0) + if (max_sectors == 0) return -EOPNOTSUPP; while (nr_sects) { + unsigned int len = min_t(sector_t, nr_sects, max_sectors); + bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio->bi_iter.bi_sector = sector; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; - if (nr_sects > max_write_zeroes_sectors) { - bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; - nr_sects -= max_write_zeroes_sectors; - sector += max_write_zeroes_sectors; - } else { - bio->bi_iter.bi_size = nr_sects << 9; - nr_sects = 0; - } + bio->bi_iter.bi_size = len << SECTOR_SHIFT; + nr_sects -= len; + sector += len; cond_resched(); } @@ -322,7 +319,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, return -EPERM; blk_start_plug(&plug); - for (;;) { + while (nr_sects) { unsigned int len = min_t(sector_t, nr_sects, max_sectors); bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp); @@ -331,13 +328,12 @@ int 
blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector += len; nr_sects -= len; - if (!nr_sects) { - ret = submit_bio_wait(bio); - bio_put(bio); - break; - } cond_resched(); } + if (bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } blk_finish_plug(&plug); return ret; diff --git a/block/blk-merge.c b/block/blk-merge.c index 65e75efa9..4e3483a16 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -115,17 +115,13 @@ static struct bio *bio_split_discard(struct bio *bio, *nsegs = 1; - /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(lim->discard_granularity >> 9, 1U); max_discard_sectors = min(lim->max_discard_sectors, bio_allowed_max_sectors(lim)); max_discard_sectors -= max_discard_sectors % granularity; - - if (unlikely(!max_discard_sectors)) { - /* XXX: warn */ + if (unlikely(!max_discard_sectors)) return NULL; - } if (bio_sectors(bio) <= max_discard_sectors) return NULL; @@ -730,7 +726,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, * which can be mixed are set in each bio and mark @rq as mixed * merged. */ -void blk_rq_set_mixed_merge(struct request *rq) +static void blk_rq_set_mixed_merge(struct request *rq) { blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK; struct bio *bio; @@ -814,6 +810,10 @@ static struct request *attempt_merge(struct request_queue *q, if (rq_data_dir(req) != rq_data_dir(next)) return NULL; + /* Don't merge requests with different write hints. */ + if (req->write_hint != next->write_hint) + return NULL; + if (req->ioprio != next->ioprio) return NULL; @@ -941,6 +941,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (!bio_crypt_rq_ctx_compatible(rq, bio)) return false; + /* Don't merge requests with different write hints. */ + if (rq->write_hint != bio->bi_write_hint) + return false; + if (rq->ioprio != bio_prio(bio)) return false; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 5cbeb9344..94668e72a 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -479,23 +479,6 @@ out: return res; } -static int hctx_run_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "%lu\n", hctx->run); - return 0; -} - -static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count, - loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->run = 0; - return count; -} - static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -624,7 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, - {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"type", 0400, hctx_type_show}, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 67c95f31b..451a2c1f1 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -324,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; - hctx->run++; - /* * A return of -EAGAIN is an indication that hctx->dispatch is not * empty and we must run again in order to avoid starving flushes. 
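Many of the hunks above and below replace ktime_get_ns() with blk_time_get_ns(). The point of the new helper, suggested by the plug->cur_ktime resets and the PF_BLOCK_TS handling added to blk-core.c above, is to cache one timestamp per plugged section instead of reading the clock for every request. A sketch of that idea, with the caveat that the real helper (presumably in block/blk.h, which is not shown in this diff) may differ in detail:

	static inline u64 blk_time_get_ns(void)
	{
		struct blk_plug *plug = current->plug;

		if (!plug || !in_task())
			return ktime_get_ns();

		/*
		 * Cache the first stamp taken inside this plugged section;
		 * __blk_flush_plug() resets plug->cur_ktime and clears
		 * PF_BLOCK_TS when the plug is flushed.
		 */
		if (!plug->cur_ktime) {
			plug->cur_ktime = ktime_get_ns();
			current->flags |= PF_BLOCK_TS;
		}
		return plug->cur_ktime;
	}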
diff --git a/block/blk-mq.c b/block/blk-mq.c index a71974a5e..32afb87ef 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -21,7 +21,6 @@ #include <linux/llist.h> #include <linux/cpu.h> #include <linux/cache.h> -#include <linux/sched/sysctl.h> #include <linux/sched/topology.h> #include <linux/sched/signal.h> #include <linux/delay.h> @@ -40,7 +39,6 @@ #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" -#include "blk-ioprio.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); @@ -323,7 +321,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); rq->part = NULL; blk_crypto_rq_set_defaults(rq); } @@ -333,7 +331,7 @@ EXPORT_SYMBOL(blk_rq_init); static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); else rq->start_time_ns = 0; @@ -444,7 +442,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) - alloc_time_ns = ktime_get_ns(); + alloc_time_ns = blk_time_get_ns(); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; @@ -629,7 +627,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) - alloc_time_ns = ktime_get_ns(); + alloc_time_ns = blk_time_get_ns(); /* * If the tag allocator sleeps we could get an allocation for a @@ -1037,7 +1035,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_mq_need_time_stamp(rq)) - __blk_mq_end_request_acct(rq, ktime_get_ns()); + __blk_mq_end_request_acct(rq, blk_time_get_ns()); blk_mq_finish_request(rq); @@ -1080,7 +1078,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) u64 now = 0; if (iob->need_ts) - now = ktime_get_ns(); + now = blk_time_get_ns(); while ((rq = rq_list_pop(&iob->req_list)) != NULL) { prefetch(rq->bio); @@ -1163,10 +1161,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) if (force_irqthreads()) return false; - /* same CPU or cache domain? Complete locally */ + /* same CPU or cache domain and capacity? Complete locally */ if (cpu == rq->mq_ctx->cpu || (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && - cpus_share_cache(cpu, rq->mq_ctx->cpu))) + cpus_share_cache(cpu, rq->mq_ctx->cpu) && + cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) return false; /* don't try to IPI to an offline CPU */ @@ -1248,8 +1247,9 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(rq); - if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - rq->io_start_time_ns = ktime_get_ns(); + if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && + !blk_rq_is_passthrough(rq)) { + rq->io_start_time_ns = blk_time_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); @@ -1404,22 +1404,10 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head) blk_mq_insert_request(rq, at_head ? 
BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, false); - if (blk_rq_is_poll(rq)) { + if (blk_rq_is_poll(rq)) blk_rq_poll_completion(rq, &wait.done); - } else { - /* - * Prevent hang_check timer from firing at us during very long - * I/O - */ - unsigned long hang_check = sysctl_hung_task_timeout_secs; - - if (hang_check) - while (!wait_for_completion_io_timeout(&wait.done, - hang_check * (HZ/2))) - ; - else - wait_for_completion_io(&wait.done); - } + else + blk_wait_io(&wait.done); return wait.ret; } @@ -2579,6 +2567,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->cmd_flags |= REQ_FAILFAST_MASK; rq->__sector = bio->bi_iter.bi_sector; + rq->write_hint = bio->bi_write_hint; blk_rq_bio_prep(rq, bio, nr_segs); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ @@ -2886,9 +2875,6 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) - return NULL; - rq_qos_throttle(q, bio); if (plug) { @@ -2906,20 +2892,32 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, return NULL; } -/* return true if this @rq can be used for @bio */ -static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, - struct bio *bio) +/* + * Check if there is a suitable cached request and return it. + */ +static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, + struct request_queue *q, blk_opf_t opf) { - enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); - enum hctx_type hctx_type = rq->mq_hctx->type; + enum hctx_type type = blk_mq_get_hctx_type(opf); + struct request *rq; - WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); + if (!plug) + return NULL; + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; + if (type != rq->mq_hctx->type && + (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) + return NULL; + return rq; +} - if (type != hctx_type && - !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT)) - return false; - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) - return false; +static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, + struct bio *bio) +{ + WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); /* * If any qos ->throttle() end up blocking, we will have flushed the @@ -2932,15 +2930,6 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, blk_mq_rq_time_init(rq, 0); rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); - return true; -} - -static void bio_set_ioprio(struct bio *bio) -{ - /* Nobody set ioprio so far? 
Initialize it based on task's nice value */ - if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) - bio->bi_ioprio = get_current_ioprio(); - blkcg_set_ioprio(bio); } /** @@ -2962,51 +2951,43 @@ void blk_mq_submit_bio(struct bio *bio) struct blk_plug *plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - struct request *rq = NULL; unsigned int nr_segs = 1; + struct request *rq; blk_status_t ret; bio = blk_queue_bounce(bio, q); - bio_set_ioprio(bio); - if (plug) { - rq = rq_list_peek(&plug->cached_rq); - if (rq && rq->q != q) - rq = NULL; - } - if (rq) { - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - return; - } - if (!bio_integrity_prep(bio)) - return; - if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) - return; - if (blk_mq_can_use_cached_rq(rq, plug, bio)) - goto done; - percpu_ref_get(&q->q_usage_counter); - } else { + /* + * If the plug has a cached request for this queue, try to use it. + * + * The cached request already holds a q_usage_counter reference and we + * don't have to acquire a new one if we use it. + */ + rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); + if (!rq) { if (unlikely(bio_queue_enter(bio))) return; - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - goto fail; - } - if (!bio_integrity_prep(bio)) - goto fail; } - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); - if (unlikely(!rq)) { -fail: - blk_queue_exit(q); - return; + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto queue_exit; + } + if (!bio_integrity_prep(bio)) + goto queue_exit; + + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) + goto queue_exit; + + if (!rq) { + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + if (unlikely(!rq)) + goto queue_exit; + } else { + blk_mq_use_cached_rq(rq, plug, bio); } -done: trace_block_getrq(bio); rq_qos_track(q, rq, bio); @@ -3037,6 +3018,15 @@ done: } else { blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); } + return; + +queue_exit: + /* + * Don't drop the queue reference if we were trying to use a cached + * request and thus didn't acquire one. + */ + if (!rq) + blk_queue_exit(q); } #ifdef CONFIG_BLK_MQ_STACKING @@ -3098,7 +3088,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq) blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) - blk_account_io_done(rq, ktime_get_ns()); + blk_account_io_done(rq, blk_time_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); @@ -3176,6 +3166,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; + rq->write_hint = rq_src->write_hint; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; @@ -4077,15 +4068,16 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, - void *queuedata) +struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata) { + struct queue_limits default_lim = { }; struct request_queue *q; int ret; - q = blk_alloc_queue(set->numa_node); - if (!q) - return ERR_PTR(-ENOMEM); + q = blk_alloc_queue(lim ?
lim : &default_lim, set->numa_node); + if (IS_ERR(q)) + return q; q->queuedata = queuedata; ret = blk_mq_init_allocated_queue(set, q); if (ret) { @@ -4094,20 +4086,15 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, } return q; } - -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) -{ - return blk_mq_init_queue_data(set, NULL); -} -EXPORT_SYMBOL(blk_mq_init_queue); +EXPORT_SYMBOL(blk_mq_alloc_queue); /** * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * - * This shuts down a request queue allocated by blk_mq_init_queue(). All future + * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future * requests will be failed with -ENODEV. The caller is responsible for dropping - * the reference from blk_mq_init_queue() by calling blk_put_queue(). + * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). * * Context: can sleep */ @@ -4128,13 +4115,14 @@ void blk_mq_destroy_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_destroy_queue); -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; - q = blk_mq_init_queue_data(set, queuedata); + q = blk_mq_alloc_queue(set, lim, queuedata); if (IS_ERR(q)) return ERR_CAST(q); @@ -4388,7 +4376,7 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) if (set->nr_maps == 1) set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; - if (set->ops->map_queues && !is_kdump_kernel()) { + if (set->ops->map_queues) { int i; /* @@ -4487,14 +4475,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) /* * If a crashdump is active, then we are potentially in a very - * memory constrained environment. Limit us to 1 queue and - * 64 tags to prevent using too much memory. + * memory constrained environment. Limit us to 64 tags to prevent + * using too much memory. */ - if (is_kdump_kernel()) { - set->nr_hw_queues = 1; - set->nr_maps = 1; + if (is_kdump_kernel()) set->queue_depth = min(64U, set->queue_depth); - } + /* * There is no use for more h/w queues than cpus if we just have * a single map @@ -4524,7 +4510,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) GFP_KERNEL, set->numa_node); if (!set->map[i].mq_map) goto out_free_mq_map; - set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; + set->map[i].nr_queues = set->nr_hw_queues; } blk_mq_update_queue_map(set); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index f48ee150d..37245c97e 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -118,7 +118,7 @@ static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) static inline void rq_qos_done(struct request_queue *q, struct request *rq) { - if (q->rq_qos) + if (q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } diff --git a/block/blk-settings.c b/block/blk-settings.c index 7019b8e20..9d6033e01 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -26,52 +26,21 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); /** - * blk_set_default_limits - reset limits to default values - * @lim: the queue_limits structure to reset - * - * Description: - * Returns a queue_limit struct to its default state. 
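With limits now passed at allocation time, a driver describes its hardware up front instead of patching a live queue afterwards. A minimal sketch of the calling convention (the demo_* names and numbers are hypothetical; blk_mq_alloc_disk() is the disk-allocating wrapper around __blk_mq_alloc_disk() shown above):

    struct queue_limits lim = {
            .logical_block_size = 4096,
            .max_hw_sectors     = 1024,     /* 512 KiB per command */
            .max_segments       = 64,
    };
    struct gendisk *disk;

    disk = blk_mq_alloc_disk(&demo->tag_set, &lim, demo);
    if (IS_ERR(disk))
            return PTR_ERR(disk);           /* limits failed validation */

Fields left zero are filled in with defaults by blk_validate_limits() below.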
- */ -void blk_set_default_limits(struct queue_limits *lim) -{ - lim->max_segments = BLK_MAX_SEGMENTS; - lim->max_discard_segments = 1; - lim->max_integrity_segments = 0; - lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - lim->virt_boundary_mask = 0; - lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; - lim->max_user_sectors = lim->max_dev_sectors = 0; - lim->chunk_sectors = 0; - lim->max_write_zeroes_sectors = 0; - lim->max_zone_append_sectors = 0; - lim->max_discard_sectors = 0; - lim->max_hw_discard_sectors = 0; - lim->max_secure_erase_sectors = 0; - lim->discard_granularity = 0; - lim->discard_alignment = 0; - lim->discard_misaligned = 0; - lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; - lim->bounce = BLK_BOUNCE_NONE; - lim->alignment_offset = 0; - lim->io_opt = 0; - lim->misaligned = 0; - lim->zoned = BLK_ZONED_NONE; - lim->zone_write_granularity = 0; - lim->dma_alignment = 511; -} - -/** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset * - * Description: - * Returns a queue_limit struct to its default state. Should be used - * by stacking drivers like DM that have no internal limits. + * Prepare queue limits for applying limits from underlying devices using + * blk_stack_limits(). */ void blk_set_stacking_limits(struct queue_limits *lim) { - blk_set_default_limits(lim); + memset(lim, 0, sizeof(*lim)); + lim->logical_block_size = SECTOR_SIZE; + lim->physical_block_size = SECTOR_SIZE; + lim->io_min = SECTOR_SIZE; + lim->discard_granularity = SECTOR_SIZE; + lim->dma_alignment = SECTOR_SIZE - 1; + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; /* Inherit limits from component devices */ lim->max_segments = USHRT_MAX; @@ -82,9 +51,237 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_dev_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; lim->max_zone_append_sectors = UINT_MAX; + lim->max_user_discard_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); +static void blk_apply_bdi_limits(struct backing_dev_info *bdi, + struct queue_limits *lim) +{ + /* + * For read-ahead of large files to be effective, we need to read ahead + * at least twice the optimal I/O size. + */ + bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); + bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; +} + +static int blk_validate_zoned_limits(struct queue_limits *lim) +{ + if (!lim->zoned) { + if (WARN_ON_ONCE(lim->max_open_zones) || + WARN_ON_ONCE(lim->max_active_zones) || + WARN_ON_ONCE(lim->zone_write_granularity) || + WARN_ON_ONCE(lim->max_zone_append_sectors)) + return -EINVAL; + return 0; + } + + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED))) + return -EINVAL; + + if (lim->zone_write_granularity < lim->logical_block_size) + lim->zone_write_granularity = lim->logical_block_size; + + if (lim->max_zone_append_sectors) { + /* + * The Zone Append size is limited by the maximum I/O size + * and the zone size given that it can't span zones. + */ + lim->max_zone_append_sectors = + min3(lim->max_hw_sectors, + lim->max_zone_append_sectors, + lim->chunk_sectors); + } + + return 0; +} + +/* + * Check that the limits in lim are valid, initialize defaults for unset + * values, and cap values based on others where needed. 
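The zone-append clamp in blk_validate_zoned_limits() above is worth a worked example: an append cannot cross a zone boundary, so the effective limit is the smallest of the transfer limit, the driver's requested value, and the zone size. With made-up numbers:

    /* max_hw_sectors = 2048, driver asked for 4096, zone size
     * (chunk_sectors) = 524288 sectors: */
    min3(2048, 4096, 524288) == 2048    /* effective zone-append limit */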
+ */ +static int blk_validate_limits(struct queue_limits *lim) +{ + unsigned int max_hw_sectors; + + /* + * Unless otherwise specified, default to 512 byte logical blocks and a + * physical block size equal to the logical block size. + */ + if (!lim->logical_block_size) + lim->logical_block_size = SECTOR_SIZE; + if (lim->physical_block_size < lim->logical_block_size) + lim->physical_block_size = lim->logical_block_size; + + /* + * The minimum I/O size defaults to the physical block size unless + * explicitly overridden. + */ + if (lim->io_min < lim->physical_block_size) + lim->io_min = lim->physical_block_size; + + /* + * max_hw_sectors has a somewhat weird default for historical reasons, + * but drivers really should set their own instead of relying on this + * value. + * + * The block layer relies on the fact that every driver can + * handle at least a page worth of data per I/O, and needs the value + * aligned to the logical block size. + */ + if (!lim->max_hw_sectors) + lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS)) + return -EINVAL; + lim->max_hw_sectors = round_down(lim->max_hw_sectors, + lim->logical_block_size >> SECTOR_SHIFT); + + /* + * The actual max_sectors value is a complex beast and also takes the + * max_dev_sectors value (set by SCSI ULPs) and a user configurable + * value into account. The ->max_sectors value is always calculated + * from these, so directly setting it won't have any effect. + */ + max_hw_sectors = min_not_zero(lim->max_hw_sectors, + lim->max_dev_sectors); + if (lim->max_user_sectors) { + if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) + return -EINVAL; + lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); + } else { + lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); + } + lim->max_sectors = round_down(lim->max_sectors, + lim->logical_block_size >> SECTOR_SHIFT); + + /* + * Random default for the maximum number of segments. Drivers should not + * rely on this and should set their own. + */ + if (!lim->max_segments) + lim->max_segments = BLK_MAX_SEGMENTS; + + lim->max_discard_sectors = + min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); + + if (!lim->max_discard_segments) + lim->max_discard_segments = 1; + + if (lim->discard_granularity < lim->physical_block_size) + lim->discard_granularity = lim->physical_block_size; + + /* + * By default there is no limit on the segment boundary alignment, + * but if there is one it can't be smaller than the page size as + * that would break all the normal I/O patterns. + */ + if (!lim->seg_boundary_mask) + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1)) + return -EINVAL; + + /* + * A stacking device may have both a virtual boundary and a max segment + * size limit, so allow this setting now; long-term the two + * might need to move out of the stacking limits since we have immutable + * bvecs and lower-layer bio splitting is supposed to handle the two + * correctly. + */ + if (lim->virt_boundary_mask) { + if (!lim->max_segment_size) + lim->max_segment_size = UINT_MAX; + } else { + /* + * The maximum segment size has an odd historic 64k default that + * drivers probably should override. Just like the I/O size we + * require drivers to at least handle a full page per segment.
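Stepping back to the max_sectors computation earlier in blk_validate_limits(), a worked example (numbers invented): a controller reporting max_hw_sectors = 65535 on a 4096-byte logical block device first gets that value rounded down to a multiple of 8 (4096 >> 9), giving 65528; max_sectors then becomes min(65528, BLK_DEF_MAX_SECTORS_CAP) rounded down to a multiple of 8 again, so the kernel default cap wins unless the user raises max_user_sectors via sysfs.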
+ */ + if (!lim->max_segment_size) + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; + if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) + return -EINVAL; + } + + /* + * We require drivers to at least do logical block aligned I/O, but + * historically could not check for that due to the separate calls + * to set the limits. Once the transition is finished the check + * below should be narrowed down to check the logical block size. + */ + if (!lim->dma_alignment) + lim->dma_alignment = SECTOR_SIZE - 1; + if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE)) + return -EINVAL; + + if (lim->alignment_offset) { + lim->alignment_offset &= (lim->physical_block_size - 1); + lim->misaligned = 0; + } + + return blk_validate_zoned_limits(lim); +} + +/* + * Set the default limits for a newly allocated queue. @lim contains the + * initial limits set by the driver, which could be no limit in which case + * all fields are cleared to zero. + */ +int blk_set_default_limits(struct queue_limits *lim) +{ + /* + * Most defaults are set by capping the bounds in blk_validate_limits, + * but max_user_discard_sectors is special and needs an explicit + * initialization to the max value here. + */ + lim->max_user_discard_sectors = UINT_MAX; + return blk_validate_limits(lim); +} + +/** + * queue_limits_commit_update - commit an atomic update of queue limits + * @q: queue to update + * @lim: limits to apply + * + * Apply the limits in @lim that were obtained from queue_limits_start_update() + * and updated by the caller to @q. + * + * Returns 0 if successful, else a negative error code. + */ +int queue_limits_commit_update(struct request_queue *q, + struct queue_limits *lim) + __releases(q->limits_lock) +{ + int error = blk_validate_limits(lim); + + if (!error) { + q->limits = *lim; + if (q->disk) + blk_apply_bdi_limits(q->disk->bdi, lim); + } + mutex_unlock(&q->limits_lock); + return error; +} +EXPORT_SYMBOL_GPL(queue_limits_commit_update); + +/** + * queue_limits_set - apply queue limits to queue + * @q: queue to update + * @lim: limits to apply + * + * Apply the limits in @lim that were freshly initialized to @q. + * To update existing limits use queue_limits_start_update() and + * queue_limits_commit_update() instead. + * + * Returns 0 if successful, else a negative error code. 
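queue_limits_start_update() and queue_limits_commit_update() bracket an atomic read-modify-write of q->limits. The expected caller pattern, as a sketch (the field written here is only an example; the reworked queue_discard_max_store() further down follows exactly this shape):

    struct queue_limits lim;
    int err;

    blk_mq_freeze_queue(q);
    lim = queue_limits_start_update(q);        /* takes q->limits_lock */
    lim.max_user_discard_sectors = new_value;
    err = queue_limits_commit_update(q, &lim); /* validates, applies, unlocks */
    blk_mq_unfreeze_queue(q);
    if (err)
            return err;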
+ */ +int queue_limits_set(struct request_queue *q, struct queue_limits *lim) +{ + mutex_lock(&q->limits_lock); + return queue_limits_commit_update(q, lim); +} +EXPORT_SYMBOL_GPL(queue_limits_set); + /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device @@ -127,8 +324,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto if ((max_hw_sectors << 9) < PAGE_SIZE) { max_hw_sectors = 1 << (PAGE_SHIFT - 9); - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_hw_sectors); + pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors); } max_hw_sectors = round_down(max_hw_sectors, @@ -140,7 +336,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto if (limits->max_user_sectors) max_sectors = min(max_sectors, limits->max_user_sectors); else - max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS); + max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP); max_sectors = round_down(max_sectors, limits->logical_block_size >> SECTOR_SHIFT); @@ -178,8 +374,11 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors); void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors) { - q->limits.max_hw_discard_sectors = max_discard_sectors; - q->limits.max_discard_sectors = max_discard_sectors; + struct queue_limits *lim = &q->limits; + + lim->max_hw_discard_sectors = max_discard_sectors; + lim->max_discard_sectors = + min(max_discard_sectors, lim->max_user_discard_sectors); } EXPORT_SYMBOL(blk_queue_max_discard_sectors); @@ -248,8 +447,7 @@ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments { if (!max_segments) { max_segments = 1; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_segments); + pr_info("%s: set to minimum %u\n", __func__, max_segments); } q->limits.max_segments = max_segments; @@ -285,8 +483,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) { if (max_size < PAGE_SIZE) { max_size = PAGE_SIZE; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_size); + pr_info("%s: set to minimum %u\n", __func__, max_size); } /* see blk_queue_virt_boundary() for the explanation */ @@ -312,6 +509,9 @@ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) limits->logical_block_size = size; + if (limits->discard_granularity < limits->logical_block_size) + limits->discard_granularity = limits->logical_block_size; + if (limits->physical_block_size < size) limits->physical_block_size = size; @@ -342,6 +542,9 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) if (q->limits.physical_block_size < q->limits.logical_block_size) q->limits.physical_block_size = q->limits.logical_block_size; + if (q->limits.discard_granularity < q->limits.physical_block_size) + q->limits.discard_granularity = q->limits.physical_block_size; + if (q->limits.io_min < q->limits.physical_block_size) q->limits.io_min = q->limits.physical_block_size; } @@ -390,15 +593,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset); void disk_update_readahead(struct gendisk *disk) { - struct request_queue *q = disk->queue; - - /* - * For read-ahead of large files to be effective, we need to read ahead - * at least twice the optimal I/O size. 
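The read-ahead derivation in blk_apply_bdi_limits() is plain arithmetic; with a 1 MiB io_opt and 4 KiB pages (sample values):

    ra_pages = max(1048576 * 2 / 4096, VM_READAHEAD_PAGES)  /* = 512 pages */
    io_pages = max_sectors >> PAGE_SECTORS_SHIFT

so a device advertising a 1 MiB optimal I/O size gets a 2 MiB read-ahead window instead of the global default.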
- */ - disk->bdi->ra_pages = - max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); - disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); + blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); } EXPORT_SYMBOL_GPL(disk_update_readahead); @@ -695,28 +890,29 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, EXPORT_SYMBOL(blk_stack_limits); /** - * disk_stack_limits - adjust queue limits for stacked drivers - * @disk: MD/DM gendisk (top) + * queue_limits_stack_bdev - adjust queue_limits for stacked devices + * @t: the stacking driver limits (top device) * @bdev: the underlying block device (bottom) * @offset: offset to beginning of data within component device + * @pfx: prefix to use for warnings logged * * Description: - * Merges the limits for a top level gendisk and a bottom level - * block_device. + * This function is used by stacking drivers like MD and DM to ensure + * that all component devices have compatible block sizes and + * alignments. The stacking driver must provide a queue_limits + * struct (top) and then iteratively call the stacking function for + * all component (bottom) devices. The stacking function will + * attempt to combine the values and ensure proper alignment. */ -void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, - sector_t offset) +void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, + sector_t offset, const char *pfx) { - struct request_queue *t = disk->queue; - - if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, - get_start_sect(bdev) + (offset >> 9)) < 0) + if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits, + get_start_sect(bdev) + offset)) pr_notice("%s: Warning: Device %pg is misaligned\n", - disk->disk_name, bdev); - - disk_update_readahead(disk); + pfx, bdev); } -EXPORT_SYMBOL(disk_stack_limits); +EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); /** * blk_queue_update_dma_pad - update pad mask @@ -744,8 +940,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) { if (mask < PAGE_SIZE - 1) { mask = PAGE_SIZE - 1; - printk(KERN_INFO "%s: set to minimum %lx\n", - __func__, mask); + pr_info("%s: set to minimum %lx\n", __func__, mask); } q->limits.seg_boundary_mask = mask; @@ -845,8 +1040,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else blk_queue_flag_clear(QUEUE_FLAG_FUA, q); - - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); @@ -888,81 +1081,22 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); -static bool disk_has_partitions(struct gendisk *disk) -{ - unsigned long idx; - struct block_device *part; - bool ret = false; - - rcu_read_lock(); - xa_for_each(&disk->part_tbl, idx, part) { - if (bdev_is_partition(part)) { - ret = true; - break; - } - } - rcu_read_unlock(); - - return ret; -} - /** - * disk_set_zoned - configure the zoned model for a disk - * @disk: the gendisk of the queue to configure - * @model: the zoned model to set - * - * Set the zoned model of @disk to @model. - * - * When @model is BLK_ZONED_HM (host managed), this should be called only - * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option). - * If @model specifies BLK_ZONED_HA (host aware), the effective model used - * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions - * on the disk. 
+ * disk_set_zoned - indicate a zoned device + * @disk: gendisk to configure */ -void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model) +void disk_set_zoned(struct gendisk *disk) { struct request_queue *q = disk->queue; - unsigned int old_model = q->limits.zoned; - switch (model) { - case BLK_ZONED_HM: - /* - * Host managed devices are supported only if - * CONFIG_BLK_DEV_ZONED is enabled. - */ - WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); - break; - case BLK_ZONED_HA: - /* - * Host aware devices can be treated either as regular block - * devices (similar to drive managed devices) or as zoned block - * devices to take advantage of the zone command set, similarly - * to host managed devices. We try the latter if there are no - * partitions and zoned block device support is enabled, else - * we do nothing special as far as the block layer is concerned. - */ - if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || - disk_has_partitions(disk)) - model = BLK_ZONED_NONE; - break; - case BLK_ZONED_NONE: - default: - if (WARN_ON_ONCE(model != BLK_ZONED_NONE)) - model = BLK_ZONED_NONE; - break; - } + WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); - q->limits.zoned = model; - if (model != BLK_ZONED_NONE) { - /* - * Set the zone write granularity to the device logical block - * size by default. The driver can change this value if needed. - */ - blk_queue_zone_write_granularity(q, - queue_logical_block_size(q)); - } else if (old_model != BLK_ZONED_NONE) { - disk_clear_zone_settings(disk); - } + /* + * Set the zone write granularity to the device logical block + * size by default. The driver can change this value if needed. + */ + q->limits.zoned = true; + blk_queue_zone_write_granularity(q, queue_logical_block_size(q)); } EXPORT_SYMBOL_GPL(disk_set_zoned); diff --git a/block/blk-stat.c b/block/blk-stat.c index 7ff76ae6c..e42c263e5 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -27,7 +27,7 @@ void blk_rq_stat_init(struct blk_rq_stat *stat) /* src is a per-cpu stat, mean isn't initialized */ void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { - if (!src->nr_samples) + if (dst->nr_samples + src->nr_samples <= dst->nr_samples) return; dst->min = min(dst->min, src->min); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0b2d04766..8c8f69d8b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -174,23 +174,29 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page) static ssize_t queue_discard_max_store(struct request_queue *q, const char *page, size_t count) { - unsigned long max_discard; - ssize_t ret = queue_var_store(&max_discard, page, count); + unsigned long max_discard_bytes; + struct queue_limits lim; + ssize_t ret; + int err; + ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) return ret; - if (max_discard & (q->limits.discard_granularity - 1)) + if (max_discard_bytes & (q->limits.discard_granularity - 1)) return -EINVAL; - max_discard >>= 9; - if (max_discard > UINT_MAX) + if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; - if (max_discard > q->limits.max_hw_discard_sectors) - max_discard = q->limits.max_hw_discard_sectors; + blk_mq_freeze_queue(q); + lim = queue_limits_start_update(q); + lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); - q->limits.max_discard_sectors = max_discard; + if (err) + return err; return ret; } @@ -226,35 +232,22 @@ static ssize_t queue_zone_append_max_show(struct
request_queue *q, char *page) static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { - unsigned long var; - unsigned int max_sectors_kb, - max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1, - page_kb = 1 << (PAGE_SHIFT - 10); - ssize_t ret = queue_var_store(&var, page, count); + unsigned long max_sectors_kb; + struct queue_limits lim; + ssize_t ret; + int err; + ret = queue_var_store(&max_sectors_kb, page, count); if (ret < 0) return ret; - max_sectors_kb = (unsigned int)var; - max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, - q->limits.max_dev_sectors >> 1); - if (max_sectors_kb == 0) { - q->limits.max_user_sectors = 0; - max_sectors_kb = min(max_hw_sectors_kb, - BLK_DEF_MAX_SECTORS >> 1); - } else { - if (max_sectors_kb > max_hw_sectors_kb || - max_sectors_kb < page_kb) - return -EINVAL; - q->limits.max_user_sectors = max_sectors_kb << 1; - } - - spin_lock_irq(&q->queue_lock); - q->limits.max_sectors = max_sectors_kb << 1; - if (q->disk) - q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); - spin_unlock_irq(&q->queue_lock); - + blk_mq_freeze_queue(q); + lim = queue_limits_start_update(q); + lim.max_user_sectors = max_sectors_kb << 1; + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); + if (err) + return err; return ret; } @@ -309,14 +302,9 @@ QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); static ssize_t queue_zoned_show(struct request_queue *q, char *page) { - switch (blk_queue_zoned_model(q)) { - case BLK_ZONED_HA: - return sprintf(page, "host-aware\n"); - case BLK_ZONED_HM: + if (blk_queue_is_zoned(q)) return sprintf(page, "host-managed\n"); - default: - return sprintf(page, "none\n"); - } + return sprintf(page, "none\n"); } static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 16f576662..f4850a6f8 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1098,7 +1098,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) while ((bio = throtl_peek_queued(&sq->queued[READ])) && tg_may_dispatch(tg, bio, NULL)) { - tg_dispatch_one_bio(tg, bio_data_dir(bio)); + tg_dispatch_one_bio(tg, READ); nr_reads++; if (nr_reads >= max_nr_reads) @@ -1108,7 +1108,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && tg_may_dispatch(tg, bio, NULL)) { - tg_dispatch_one_bio(tg, bio_data_dir(bio)); + tg_dispatch_one_bio(tg, WRITE); nr_writes++; if (nr_writes >= max_nr_writes) @@ -1815,7 +1815,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); ret = tg->latency_target == DFL_LATENCY_TARGET || tg->idletime_threshold == DFL_IDLE_THRESHOLD || - (ktime_get_ns() >> 10) - tg->last_finish_time > time || + (blk_time_get_ns() >> 10) - tg->last_finish_time > time || tg->avg_idletime > tg->idletime_threshold || (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); @@ -2060,7 +2060,7 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) if (last_finish_time == 0) return; - now = ktime_get_ns() >> 10; + now = blk_time_get_ns() >> 10; if (now <= last_finish_time || last_finish_time == tg->checked_last_finish_time) return; @@ -2327,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio) if (!tg->td->limit_valid[LIMIT_LOW]) return; - finish_time_ns = ktime_get_ns(); + finish_time_ns = blk_time_get_ns(); tg->last_finish_time = finish_time_ns >> 10; start_time = 
bio_issue_time(&bio->bi_issue) >> 10; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index f8fda9cf5..64472134d 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -29,6 +29,7 @@ #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" +#include "blk.h" #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> @@ -84,8 +85,6 @@ struct rq_wb { u64 sync_issue; void *sync_cookie; - unsigned int wc; - unsigned long last_issue; /* last non-throttled issue */ unsigned long last_comp; /* last non-throttled comp */ unsigned long min_lat_nsec; @@ -207,7 +206,8 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, */ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; - else if (rwb->wc && !wb_recent_wait(rwb)) + else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) && + !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; @@ -275,13 +275,12 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat) static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { - u64 now, issue = READ_ONCE(rwb->sync_issue); + u64 issue = READ_ONCE(rwb->sync_issue); if (!issue || !rwb->sync_cookie) return 0; - now = ktime_to_ns(ktime_get()); - return now - issue; + return blk_time_get_ns() - issue; } static inline unsigned int wbt_inflight(struct rq_wb *rwb) @@ -699,13 +698,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq) } } -void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) -{ - struct rq_qos *rqos = wbt_rq_qos(q); - if (rqos) - RQWB(rqos)->wc = write_cache_on; -} - /* * Enable wbt if defaults are configured that way */ @@ -918,7 +910,6 @@ int wbt_init(struct gendisk *disk) rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; - rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags); rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q); rwb->rq_depth.queue_depth = blk_queue_depth(q); diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 8a029e138..e5fc653b9 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -12,8 +12,6 @@ u64 wbt_get_min_lat(struct request_queue *q); void wbt_set_min_lat(struct request_queue *q, u64 val); bool wbt_disabled(struct request_queue *); -void wbt_set_write_cache(struct request_queue *, bool); - u64 wbt_default_latency_nsec(struct request_queue *); #else @@ -24,9 +22,6 @@ static inline void wbt_disable_default(struct gendisk *disk) static inline void wbt_enable_default(struct gendisk *disk) { } -static inline void wbt_set_write_cache(struct request_queue *q, bool wc) -{ -} #endif /* CONFIG_BLK_WBT */ diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 619ee41a5..da0f4b2a8 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -11,7 +11,6 @@ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/rbtree.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/mm.h> @@ -177,8 +176,7 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, } } -static int blkdev_zone_reset_all_emulated(struct block_device *bdev, - gfp_t gfp_mask) +static int blkdev_zone_reset_all_emulated(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; sector_t capacity = bdev_nr_sectors(bdev); @@ -205,7 +203,7 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev, } bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, - gfp_mask); + GFP_KERNEL); bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -223,7 +221,7 @@ 
out_free_need_reset: return ret; } -static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) +static int blkdev_zone_reset_all(struct block_device *bdev) { struct bio bio; @@ -238,7 +236,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) * @sector: Start sector of the first zone to operate on * @nr_sectors: Number of sectors, should be at least the length of one zone and * must be zone size aligned. - * @gfp_mask: Memory allocation flags (for bio_alloc) * * Description: * Perform the specified operation on the range of zones specified by @@ -248,7 +245,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) * or finish request. */ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sector, sector_t nr_sectors, gfp_t gfp_mask) + sector_t sector, sector_t nr_sectors) { struct request_queue *q = bdev_get_queue(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev); @@ -285,12 +282,12 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, */ if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { if (!blk_queue_zone_resetall(q)) - return blkdev_zone_reset_all_emulated(bdev, gfp_mask); - return blkdev_zone_reset_all(bdev, gfp_mask); + return blkdev_zone_reset_all_emulated(bdev); + return blkdev_zone_reset_all(bdev); } while (sector < end_sector) { - bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -419,8 +416,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, return -ENOTTY; } - ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, - GFP_KERNEL); + ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); fail: if (cmd == BLKRESETZONE) @@ -498,7 +494,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, set_bit(idx, args->conv_zones_bitmap); break; case BLK_ZONE_TYPE_SEQWRITE_REQ: - case BLK_ZONE_TYPE_SEQWRITE_PREF: if (!args->seq_zones_wlock) { args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, args->nr_zones); @@ -506,6 +501,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENOMEM; } break; + case BLK_ZONE_TYPE_SEQWRITE_PREF: default: pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", disk->disk_name, (int)zone->type, zone->start); @@ -615,22 +611,3 @@ int blk_revalidate_disk_zones(struct gendisk *disk, return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); - -void disk_clear_zone_settings(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - - blk_mq_freeze_queue(q); - - disk_free_zone_bitmaps(disk); - blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q); - q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE; - disk->nr_zones = 0; - disk->max_open_zones = 0; - disk->max_active_zones = 0; - q->limits.chunk_sectors = 0; - q->limits.zone_write_granularity = 0; - q->limits.max_zone_append_sectors = 0; - - blk_mq_unfreeze_queue(q); -} diff --git a/block/blk.h b/block/blk.h index 08a358bc0..d9f584984 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,8 @@ #include <linux/blk-crypto.h> #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ +#include <linux/sched/sysctl.h> +#include <linux/timekeeping.h> #include <xen/xen.h> #include "blk-crypto-internal.h" @@ -70,6 +72,18 @@ static inline int bio_queue_enter(struct bio *bio) return __bio_queue_enter(q, bio); } +static inline void blk_wait_io(struct 
completion *done) +{ + /* Prevent hang_check timer from firing at us during very long I/O */ + unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; + + if (timeout) + while (!wait_for_completion_io_timeout(done, timeout)) + ; + else + wait_for_completion_io(done); +} + #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); @@ -325,11 +339,10 @@ int ll_back_merge_fn(struct request *req, struct bio *bio, bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); -void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); -void blk_set_default_limits(struct queue_limits *lim); +int blk_set_default_limits(struct queue_limits *lim); int blk_dev_init(void); /* @@ -395,14 +408,12 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, #ifdef CONFIG_BLK_DEV_ZONED void disk_free_zone_bitmaps(struct gendisk *disk); -void disk_clear_zone_settings(struct gendisk *disk); int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} -static inline void disk_clear_zone_settings(struct gendisk *disk) {} static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { @@ -449,7 +460,7 @@ static inline void bio_release_page(struct bio *bio, struct page *page) unpin_user_page(page); } -struct request_queue *blk_alloc_queue(int node_id); +struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); @@ -518,4 +529,75 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); } +static inline u64 blk_time_get_ns(void) +{ + struct blk_plug *plug = current->plug; + + if (!plug || !in_task()) + return ktime_get_ns(); + + /* + * 0 could very well be a valid time, but rather than flag "this is + * a valid timestamp" separately, just accept that we'll do an extra + * ktime_get_ns() if we just happen to get 0 as the current time. 
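Because the cached timestamp lives in the plug, every request submitted under one plug sees the same clock sample, while unplugged or interrupt-context callers fall back to a fresh read. A sketch of the observable behaviour (illustrative; the teardown detail is an assumption inferred from the PF_BLOCK_TS flag, not shown in this hunk):

    blk_start_plug(&plug);
    t1 = blk_time_get_ns();  /* samples the clock, caches it in the plug */
    t2 = blk_time_get_ns();  /* t2 == t1, served from plug->cur_ktime */
    blk_finish_plug(&plug);  /* presumably clears PF_BLOCK_TS + the cache */
    t3 = blk_time_get_ns();  /* fresh sample again */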
+ */ + if (!plug->cur_ktime) { + plug->cur_ktime = ktime_get_ns(); + current->flags |= PF_BLOCK_TS; + } + return plug->cur_ktime; +} + +static inline ktime_t blk_time_get(void) +{ + return ns_to_ktime(blk_time_get_ns()); +} + +/* + * From most significant bit: + * 1 bit: reserved for other usage, see below + * 12 bits: original size of bio + * 51 bits: issue time of bio + */ +#define BIO_ISSUE_RES_BITS 1 +#define BIO_ISSUE_SIZE_BITS 12 +#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) +#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) +#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) +#define BIO_ISSUE_SIZE_MASK \ + (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) +#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) + +/* Reserved bit for blk-throtl */ +#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) + +static inline u64 __bio_issue_time(u64 time) +{ + return time & BIO_ISSUE_TIME_MASK; +} + +static inline u64 bio_issue_time(struct bio_issue *issue) +{ + return __bio_issue_time(issue->value); +} + +static inline sector_t bio_issue_size(struct bio_issue *issue) +{ + return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); +} + +static inline void bio_issue_init(struct bio_issue *issue, + sector_t size) +{ + size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; + issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | + (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | + ((u64)size << BIO_ISSUE_SIZE_SHIFT)); +} + +void bdev_release(struct file *bdev_file); +int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops, struct file *bdev_file); +int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); + #endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c index 7cfcb242f..d6a5219f2 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -169,6 +169,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; diff --git a/block/bsg-lib.c b/block/bsg-lib.c index b3acdbdb6..bcc7dee6a 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -383,7 +383,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, if (blk_mq_alloc_tag_set(set)) goto out_tag_set; - q = blk_mq_init_queue(set); + q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(q)) { ret = PTR_ERR(q); goto out_queue; diff --git a/block/fops.c b/block/fops.c index 0abaac705..679d9b752 100644 --- a/block/fops.c +++ b/block/fops.c @@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); @@ -203,6 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; @@ -321,6 +323,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, dio->flags = 0; dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + 
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; @@ -410,9 +413,24 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } -static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +/* + * We cannot call mpage_writepages() as it does not take the buffer lock. + * We must use block_write_full_folio() directly which holds the buffer + * lock. The buffer lock provides the synchronisation with writeback + * that filesystems rely on when they use the blockdev's mapping. + */ +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page, blkdev_get_block, wbc); + struct blk_plug plug; + int err; + + blk_start_plug(&plug); + err = write_cache_pages(mapping, wbc, block_write_full_folio, + blkdev_get_block); + blk_finish_plug(&plug); + + return err; } static int blkdev_read_folio(struct file *file, struct folio *folio) @@ -449,7 +467,7 @@ const struct address_space_operations def_blk_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, - .writepage = blkdev_writepage, + .writepages = blkdev_writepages, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .migrate_folio = buffer_migrate_folio_norefs, @@ -467,7 +485,7 @@ static void blkdev_readahead(struct readahead_control *rac) } static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) + struct inode *inode, loff_t offset, unsigned int len) { loff_t isize = i_size_read(inode); @@ -500,7 +518,7 @@ const struct address_space_operations def_blk_aops = { .readahead = blkdev_readahead, .writepages = blkdev_writepages, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .migrate_folio = filemap_migrate_folio, }; #endif /* CONFIG_BUFFER_HEAD */ @@ -554,18 +572,17 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, blk_mode_t file_to_blk_mode(struct file *file) { blk_mode_t mode = 0; - struct bdev_handle *handle = file->private_data; if (file->f_mode & FMODE_READ) mode |= BLK_OPEN_READ; if (file->f_mode & FMODE_WRITE) mode |= BLK_OPEN_WRITE; /* - * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to - * determine whether the open was exclusive for already open files. + * do_dentry_open() clears O_EXCL from f_flags, use file->private_data + * to determine whether the open was exclusive for already open files. */ - if (handle) - mode |= handle->mode & BLK_OPEN_EXCL; + if (file->private_data) + mode |= BLK_OPEN_EXCL; else if (file->f_flags & O_EXCL) mode |= BLK_OPEN_EXCL; if (file->f_flags & O_NDELAY) @@ -584,36 +601,31 @@ blk_mode_t file_to_blk_mode(struct file *file) static int blkdev_open(struct inode *inode, struct file *filp) { - struct bdev_handle *handle; + struct block_device *bdev; blk_mode_t mode; - - /* - * Preserve backwards compatibility and allow large file access - * even if userspace doesn't ask for it explicitly. Some mkfs - * binary needs it. We might want to drop this workaround - * during an unstable branch. - */ - filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; + int ret; mode = file_to_blk_mode(filp); - handle = bdev_open_by_dev(inode->i_rdev, mode, - mode & BLK_OPEN_EXCL ? 
filp : NULL, NULL); - if (IS_ERR(handle)) - return PTR_ERR(handle); + /* Use the file as the holder. */ + if (mode & BLK_OPEN_EXCL) + filp->private_data = filp; + ret = bdev_permission(inode->i_rdev, mode, filp->private_data); + if (ret) + return ret; - if (bdev_nowait(handle->bdev)) - filp->f_mode |= FMODE_NOWAIT; + bdev = blkdev_get_no_open(inode->i_rdev); + if (!bdev) + return -ENXIO; - filp->f_mapping = handle->bdev->bd_inode->i_mapping; - filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - filp->private_data = handle; - return 0; + ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); + if (ret) + blkdev_put_no_open(bdev); + return ret; } static int blkdev_release(struct inode *inode, struct file *filp) { - bdev_release(filp->private_data); + bdev_release(filp); return 0; } diff --git a/block/genhd.c b/block/genhd.c index d74fb5b4a..52a4521df 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -342,12 +342,10 @@ EXPORT_SYMBOL_GPL(disk_uevent); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { - struct bdev_handle *handle; + struct file *file; int ret = 0; - if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) - return -EINVAL; - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (!disk_has_partscan(disk)) return -EINVAL; if (disk->open_partitions) return -EBUSY; @@ -366,12 +364,12 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) } set_bit(GD_NEED_PART_SCAN, &disk->state); - handle = bdev_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, - NULL); - if (IS_ERR(handle)) - ret = PTR_ERR(handle); + file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, + NULL, NULL); + if (IS_ERR(file)) + ret = PTR_ERR(file); else - bdev_release(handle); + fput(file); /* * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, @@ -503,8 +501,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, goto out_unregister_bdi; /* Make sure the first partition scan will proceed */ - if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) && - !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (get_capacity(disk) && disk_has_partscan(disk)) set_bit(GD_NEED_PART_SCAN, &disk->state); bdev_add(disk->part0, ddev->devt); @@ -1047,6 +1044,12 @@ static ssize_t diskseq_show(struct device *dev, return sprintf(buf, "%llu\n", disk->diskseq); } +static ssize_t partscan_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); +} + static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); @@ -1060,6 +1063,7 @@ static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); +static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, @@ -1106,6 +1110,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, &dev_attr_diskseq.attr, + &dev_attr_partscan.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1201,7 +1206,7 @@ static int block_uevent(const struct device *dev, struct kobj_uevent_env *env) return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); } -struct class
block_class = { +const struct class block_class = { .name = "block", .dev_uevent = block_uevent, }; @@ -1391,19 +1396,21 @@ out_free_disk: return NULL; } -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) +struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, + struct lock_class_key *lkclass) { + struct queue_limits default_lim = { }; struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(node); - if (!q) - return NULL; + q = blk_alloc_queue(lim ? lim : &default_lim, node); + if (IS_ERR(q)) + return ERR_CAST(q); disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_put_queue(q); - return NULL; + return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; diff --git a/block/ioctl.c b/block/ioctl.c index 438f79c56..f505f9c34 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -18,7 +18,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, { struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; - sector_t start, length; + sector_t start, length, capacity, end; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -41,6 +41,13 @@ static int blkpg_do_ioctl(struct block_device *bdev, start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; + capacity = get_capacity(disk); + + if (check_add_overflow(start, length, &end)) + return -EINVAL; + + if (start >= capacity || end > capacity) + return -EINVAL; switch (op) { case BLKPG_ADD_PARTITION: @@ -89,7 +96,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; - uint64_t start, len; + uint64_t start, len, end; struct inode *inode = bdev->bd_inode; int err; @@ -110,7 +117,8 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, if (len & 511) return -EINVAL; - if (start + len > bdev_nr_bytes(bdev)) + if (check_add_overflow(start, len, &end) || + end > bdev_nr_bytes(bdev)) return -EINVAL; filemap_invalidate_lock(inode->i_mapping); @@ -469,7 +477,7 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, int __user *argp) { int ret, n; - struct bdev_handle *handle; + struct file *file; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -481,12 +489,11 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, if (mode & BLK_OPEN_EXCL) return set_blocksize(bdev, n); - handle = bdev_open_by_dev(bdev->bd_dev, mode, &bdev, NULL); - if (IS_ERR(handle)) + file = bdev_file_open_by_dev(bdev->bd_dev, mode, &bdev, NULL); + if (IS_ERR(file)) return -EBUSY; ret = set_blocksize(bdev, n); - bdev_release(handle); - + fput(file); return ret; } @@ -556,7 +563,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; - return disk_scan_partitions(bdev->bd_disk, mode); + return disk_scan_partitions(bdev->bd_disk, + mode | BLK_OPEN_STRICT_SCAN); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: diff --git a/block/ioprio.c b/block/ioprio.c index b5a942519..73301a261 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -139,32 +139,6 @@ out: return ret; } -/* - * If the task has set an I/O priority, use that. Otherwise, return - * the default I/O priority. - * - * Expected to be called for current task or with task_lock() held to keep - * io_context stable. 
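Both ioctl paths above now validate the range with check_add_overflow() before comparing against the capacity, closing the wrap-around case where start + len overflows 64 bits and compares as small. The predicate as a standalone sketch (check_add_overflow() is the kernel's wrapper around the compiler builtin used below):

    #include <stdbool.h>
    #include <stdint.h>

    static bool range_valid(uint64_t start, uint64_t len, uint64_t capacity)
    {
            uint64_t end;

            if (__builtin_add_overflow(start, len, &end))
                    return false;   /* start + len wrapped past 2^64 */
            return start < capacity && end <= capacity;
    }

The discard path applies the same overflow check against bdev_nr_bytes().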
- */ -int __get_task_ioprio(struct task_struct *p) -{ - struct io_context *ioc = p->io_context; - int prio; - - if (p != current) - lockdep_assert_held(&p->alloc_lock); - if (ioc) - prio = ioc->ioprio; - else - prio = IOPRIO_DEFAULT; - - if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) - prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), - task_nice_ioprio(p)); - return prio; -} -EXPORT_SYMBOL_GPL(__get_task_ioprio); - static int get_task_ioprio(struct task_struct *p) { int ret; diff --git a/block/partitions/core.c b/block/partitions/core.c index f14602022..37b5f92d0 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -305,18 +305,10 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, * Partitions are not supported on zoned block devices that are used as * such. */ - switch (disk->queue->limits.zoned) { - case BLK_ZONED_HM: + if (bdev_is_zoned(disk->part0)) { pr_warn("%s: partitions not supported on host managed zoned block device\n", disk->disk_name); return ERR_PTR(-ENXIO); - case BLK_ZONED_HA: - pr_info("%s: disabling host aware zoned block device support due to partitions\n", - disk->disk_name); - disk_set_zoned(disk, BLK_ZONED_NONE); - break; - case BLK_ZONED_NONE: - break; } if (xa_load(&disk->part_tbl, partno)) @@ -427,21 +419,10 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { - sector_t capacity = get_capacity(disk), end; struct block_device *part; int ret; mutex_lock(&disk->open_mutex); - if (check_add_overflow(start, length, &end)) { - ret = -EINVAL; - goto out; - } - - if (start >= capacity || end > capacity) { - ret = -EINVAL; - goto out; - } - if (!disk_live(disk)) { ret = -ENXIO; goto out; @@ -575,8 +556,8 @@ static bool blk_add_partition(struct gendisk *disk, part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); + printk(KERN_ERR " %s: p%d could not be added: %pe\n", + disk->disk_name, p, part); return true; } @@ -592,10 +573,7 @@ static int blk_add_partitions(struct gendisk *disk) struct parsed_partitions *state; int ret = -EAGAIN, p; - if (disk->flags & GENHD_FL_NO_PART) - return 0; - - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (!disk_has_partscan(disk)) return 0; state = check_partition(disk); @@ -618,7 +596,7 @@ static int blk_add_partitions(struct gendisk *disk) /* * Partitions are not supported on host managed zoned block devices. */ - if (disk->queue->limits.zoned == BLK_ZONED_HM) { + if (bdev_is_zoned(disk->part0)) { pr_warn("%s: ignoring partition table on host managed zoned block device\n", disk->disk_name); ret = 0; diff --git a/block/partitions/mac.c b/block/partitions/mac.c index 7b521df00..c80183156 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -20,6 +20,7 @@ extern void note_bootable_part(dev_t dev, int part, int goodness); * Code to understand MacOS partition tables. 
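Several call sites above now funnel through disk_has_partscan(). Judging from the two conditions it replaces in disk_scan_partitions() and device_add_disk(), the helper is a predicate along these lines (reconstructed from the call sites, not quoted from the patch):

    static inline bool disk_has_partscan(struct gendisk *disk)
    {
            return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) &&
                    !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
    }

which is also the value the new "partscan" sysfs attribute reports.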
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
index 7b521df00..c80183156 100644
--- a/block/partitions/mac.c
+++ b/block/partitions/mac.c
@@ -20,6 +20,7 @@ extern void note_bootable_part(dev_t dev, int part, int goodness);
  * Code to understand MacOS partition tables.
  */
 
+#ifdef CONFIG_PPC_PMAC
 static inline void mac_fix_string(char *stg, int len)
 {
 	int i;
@@ -27,6 +28,7 @@ static inline void mac_fix_string(char *stg, int len)
 	for (i = len - 1; i >= 0 && stg[i] == ' '; i--)
 		stg[i] = 0;
 }
+#endif
 
 int mac_partition(struct parsed_partitions *state)
 {
diff --git a/block/sed-opal.c b/block/sed-opal.c
index fa4dba5d8..14fe0fef8 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1212,7 +1212,7 @@ static int cmd_start(struct opal_dev *dev, const u8 *uid, const u8 *method)
 static int start_opal_session_cont(struct opal_dev *dev)
 {
 	u32 hsn, tsn;
-	int error = 0;
+	int error;
 
 	error = parse_and_check_status(dev);
 	if (error)
@@ -1354,7 +1354,7 @@ static int get_active_key_cont(struct opal_dev *dev)
 {
 	const char *activekey;
 	size_t keylen;
-	int error = 0;
+	int error;
 
 	error = parse_and_check_status(dev);
 	if (error)
@@ -2157,7 +2157,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
 	u8 lr_buffer[OPAL_UID_LENGTH];
 	struct opal_lock_unlock *lkul = data;
 	u8 read_locked = 1, write_locked = 1;
-	int err = 0;
+	int err;
 
 	if (build_locking_range(lr_buffer, sizeof(lr_buffer),
 				lkul->session.opal_key.lr) < 0)
@@ -2580,7 +2580,7 @@ static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv)
 	const struct opal_step discovery0_step = {
 		opal_discovery0, discv
 	};
-	int ret = 0;
+	int ret;
 
 	mutex_lock(&dev->dev_lock);
 	setup_opal_dev(dev);
@@ -3069,7 +3069,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
 {
 	struct opal_suspend_data *suspend;
 	bool was_failure = false;
-	int ret = 0;
+	int ret;
 
 	if (!dev)
 		return false;
@@ -3112,10 +3112,9 @@ static int opal_read_table(struct opal_dev *dev,
 		{ read_table_data, rw_tbl },
 		{ end_opal_session, }
 	};
-	int ret = 0;
 
 	if (!rw_tbl->size)
-		return ret;
+		return 0;
 
 	return execute_steps(dev, read_table_steps,
 			     ARRAY_SIZE(read_table_steps));
@@ -3129,10 +3128,9 @@ static int opal_write_table(struct opal_dev *dev,
 		{ write_table_data, rw_tbl },
 		{ end_opal_session, }
 	};
-	int ret = 0;
 
 	if (!rw_tbl->size)
-		return ret;
+		return 0;
 
 	return execute_steps(dev, write_table_steps,
 			     ARRAY_SIZE(write_table_steps));
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 914d8cddd..d90892fd6 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -12,14 +12,14 @@
 #include <net/checksum.h>
 #include <asm/unaligned.h>
 
-typedef __be16 (csum_fn) (void *, unsigned int);
+typedef __be16 (csum_fn) (__be16, void *, unsigned int);
 
-static __be16 t10_pi_crc_fn(void *data, unsigned int len)
+static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len)
 {
-	return cpu_to_be16(crc_t10dif(data, len));
+	return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len));
 }
 
-static __be16 t10_pi_ip_fn(void *data, unsigned int len)
+static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len)
 {
 	return (__force __be16)ip_compute_csum(data, len);
 }
@@ -32,12 +32,16 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len)
 static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
 		csum_fn *fn, enum t10_dif_type type)
 {
+	u8 offset = iter->pi_offset;
 	unsigned int i;
 
 	for (i = 0 ; i < iter->data_size ; i += iter->interval) {
-		struct t10_pi_tuple *pi = iter->prot_buf;
+		struct t10_pi_tuple *pi = iter->prot_buf + offset;
 
-		pi->guard_tag = fn(iter->data_buf, iter->interval);
+		pi->guard_tag = fn(0, iter->data_buf, iter->interval);
+		if (offset)
+			pi->guard_tag = fn(pi->guard_tag, iter->prot_buf,
+					   offset);
 		pi->app_tag = 0;
 
 		if (type == T10_PI_TYPE1_PROTECTION)
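The csum_fn signature gains a seed argument so the guard tag can be computed incrementally: first over the data interval, then continued over the metadata bytes that precede the PI tuple whenever pi_offset is non-zero. The chaining property this relies on is easy to demonstrate; below is a self-contained sketch using a naive bitwise CRC-16/T10-DIF (polynomial 0x8BB7). The kernel's real crc_t10dif_update() is table- or PCLMULQDQ-based, so this loop is for illustration only:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bitwise CRC-16/T10-DIF: poly 0x8BB7, init 0, no reflection, no xorout. */
static uint16_t crc_t10dif_update(uint16_t crc, const uint8_t *data,
				  size_t len)
{
	for (size_t i = 0; i < len; i++) {
		crc ^= (uint16_t)data[i] << 8;
		for (int bit = 0; bit < 8; bit++)
			crc = (crc & 0x8000) ? (crc << 1) ^ 0x8BB7
					     : (uint16_t)(crc << 1);
	}
	return crc;
}

int main(void)
{
	const uint8_t data[512] = { 0xab };	/* stands in for one interval */
	const uint8_t meta[8] = { 0x01 };	/* metadata ahead of the tuple */
	uint8_t both[520];

	memcpy(both, data, sizeof(data));
	memcpy(both + sizeof(data), meta, sizeof(meta));

	/* One pass over data||meta equals seeding the second call with the
	 * CRC of the first, which is exactly what t10_pi_generate() does. */
	uint16_t whole = crc_t10dif_update(0, both, sizeof(both));
	uint16_t chained = crc_t10dif_update(
			crc_t10dif_update(0, data, sizeof(data)),
			meta, sizeof(meta));

	assert(whole == chained);
	printf("guard tag: 0x%04x\n", chained);
	return 0;
}

Because a CRC's state is just its shift register, feeding the previous result back in as the seed continues the computation as if the two buffers had been one.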
@@ -56,12 +60,13 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
 static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
 		csum_fn *fn, enum t10_dif_type type)
 {
+	u8 offset = iter->pi_offset;
 	unsigned int i;
 
 	BUG_ON(type == T10_PI_TYPE0_PROTECTION);
 
 	for (i = 0 ; i < iter->data_size ; i += iter->interval) {
-		struct t10_pi_tuple *pi = iter->prot_buf;
+		struct t10_pi_tuple *pi = iter->prot_buf + offset;
 		__be16 csum;
 
 		if (type == T10_PI_TYPE1_PROTECTION ||
@@ -83,7 +88,9 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
 			goto next;
 		}
 
-		csum = fn(iter->data_buf, iter->interval);
+		csum = fn(0, iter->data_buf, iter->interval);
+		if (offset)
+			csum = fn(csum, iter->prot_buf, offset);
 
 		if (pi->guard_tag != csum) {
 			pr_err("%s: guard tag error at sector %llu " \
@@ -134,8 +141,10 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
  */
 static void t10_pi_type1_prepare(struct request *rq)
 {
-	const int tuple_sz = rq->q->integrity.tuple_size;
+	struct blk_integrity *bi = &rq->q->integrity;
+	const int tuple_sz = bi->tuple_size;
 	u32 ref_tag = t10_pi_ref_tag(rq);
+	u8 offset = bi->pi_offset;
 	struct bio *bio;
 
 	__rq_for_each_bio(bio, rq) {
@@ -154,7 +163,7 @@ static void t10_pi_type1_prepare(struct request *rq)
 			p = bvec_kmap_local(&iv);
 			for (j = 0; j < iv.bv_len; j += tuple_sz) {
-				struct t10_pi_tuple *pi = p;
+				struct t10_pi_tuple *pi = p + offset;
 
 				if (be32_to_cpu(pi->ref_tag) == virt)
 					pi->ref_tag = cpu_to_be32(ref_tag);
@@ -183,9 +192,11 @@ static void t10_pi_type1_prepare(struct request *rq)
  */
 static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
 {
-	unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp;
-	const int tuple_sz = rq->q->integrity.tuple_size;
+	struct blk_integrity *bi = &rq->q->integrity;
+	unsigned intervals = nr_bytes >> bi->interval_exp;
+	const int tuple_sz = bi->tuple_size;
 	u32 ref_tag = t10_pi_ref_tag(rq);
+	u8 offset = bi->pi_offset;
 	struct bio *bio;
 
 	__rq_for_each_bio(bio, rq) {
@@ -200,7 +211,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
 			p = bvec_kmap_local(&iv);
 			for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
-				struct t10_pi_tuple *pi = p;
+				struct t10_pi_tuple *pi = p + offset;
 
 				if (be32_to_cpu(pi->ref_tag) == ref_tag)
 					pi->ref_tag = cpu_to_be32(virt);
@@ -280,20 +291,24 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
 };
 EXPORT_SYMBOL(t10_pi_type3_ip);
 
-static __be64 ext_pi_crc64(void *data, unsigned int len)
+static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len)
 {
-	return cpu_to_be64(crc64_rocksoft(data, len));
+	return cpu_to_be64(crc64_rocksoft_update(crc, data, len));
 }
 
 static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter,
 		enum t10_dif_type type)
 {
+	u8 offset = iter->pi_offset;
 	unsigned int i;
 
 	for (i = 0 ; i < iter->data_size ; i += iter->interval) {
-		struct crc64_pi_tuple *pi = iter->prot_buf;
+		struct crc64_pi_tuple *pi = iter->prot_buf + offset;
 
-		pi->guard_tag = ext_pi_crc64(iter->data_buf, iter->interval);
+		pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval);
+		if (offset)
+			pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag),
+					iter->prot_buf, offset);
 		pi->app_tag = 0;
 
 		if (type == T10_PI_TYPE1_PROTECTION)
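The recurring "pi = p + offset" arises because, with extended metadata formats, the PI tuple need not sit at the start of each per-interval metadata element; pi_offset locates it inside the tuple_size-byte region. A layout sketch of the same pointer arithmetic, under assumed values (16 bytes of metadata per interval with the 8-byte tuple in the second half; both numbers are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the shape of the kernel's 8-byte struct t10_pi_tuple. */
struct t10_pi_tuple {
	uint16_t guard_tag;
	uint16_t app_tag;
	uint32_t ref_tag;
} __attribute__((packed));

int main(void)
{
	/* Assumed format: 16 metadata bytes per 512-byte interval,
	 * PI tuple at pi_offset = 8 within each element. */
	const unsigned tuple_sz = 16;
	const unsigned pi_offset = 8;
	uint8_t meta[4 * 16] = { 0 };	/* metadata for four intervals */

	for (uint8_t *p = meta; p < meta + sizeof(meta); p += tuple_sz) {
		/* Same stepping and offsetting as t10_pi_type1_prepare(). */
		struct t10_pi_tuple *pi = (void *)(p + pi_offset);

		printf("interval %ld: pi tuple at byte offset %ld\n",
		       (long)((p - meta) / tuple_sz),
		       (long)((uint8_t *)pi - meta));
	}
	return 0;
}

Keeping the guard computation seeded (previous block) and the tuple offset (here) separate is what lets one loop handle both the classic "tuple at offset 0" layout and the extended one.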
@@ -319,10 +334,11 @@ static bool ext_pi_ref_escape(u8 *ref_tag)
 static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
 		enum t10_dif_type type)
 {
+	u8 offset = iter->pi_offset;
 	unsigned int i;
 
 	for (i = 0; i < iter->data_size; i += iter->interval) {
-		struct crc64_pi_tuple *pi = iter->prot_buf;
+		struct crc64_pi_tuple *pi = iter->prot_buf + offset;
 		u64 ref, seed;
 		__be64 csum;
 
@@ -343,7 +359,11 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
 			goto next;
 		}
 
-		csum = ext_pi_crc64(iter->data_buf, iter->interval);
+		csum = ext_pi_crc64(0, iter->data_buf, iter->interval);
+		if (offset)
+			csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf,
+					    offset);
+
 		if (pi->guard_tag != csum) {
 			pr_err("%s: guard tag error at sector %llu " \
 			       "(rcvd %016llx, want %016llx)\n",
@@ -373,8 +393,10 @@ static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter)
 
 static void ext_pi_type1_prepare(struct request *rq)
 {
-	const int tuple_sz = rq->q->integrity.tuple_size;
+	struct blk_integrity *bi = &rq->q->integrity;
+	const int tuple_sz = bi->tuple_size;
 	u64 ref_tag = ext_pi_ref_tag(rq);
+	u8 offset = bi->pi_offset;
 	struct bio *bio;
 
 	__rq_for_each_bio(bio, rq) {
@@ -393,7 +415,7 @@ static void ext_pi_type1_prepare(struct request *rq)
 			p = bvec_kmap_local(&iv);
 			for (j = 0; j < iv.bv_len; j += tuple_sz) {
-				struct crc64_pi_tuple *pi = p;
+				struct crc64_pi_tuple *pi = p + offset;
 				u64 ref = get_unaligned_be48(pi->ref_tag);
 
 				if (ref == virt)
@@ -411,9 +433,11 @@ static void ext_pi_type1_prepare(struct request *rq)
 
 static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
 {
-	unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp;
-	const int tuple_sz = rq->q->integrity.tuple_size;
+	struct blk_integrity *bi = &rq->q->integrity;
+	unsigned intervals = nr_bytes >> bi->interval_exp;
+	const int tuple_sz = bi->tuple_size;
 	u64 ref_tag = ext_pi_ref_tag(rq);
+	u8 offset = bi->pi_offset;
 	struct bio *bio;
 
 	__rq_for_each_bio(bio, rq) {
@@ -428,7 +452,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
 			p = bvec_kmap_local(&iv);
 			for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
-				struct crc64_pi_tuple *pi = p;
+				struct crc64_pi_tuple *pi = p + offset;
 				u64 ref = get_unaligned_be48(pi->ref_tag);
 
 				if (ref == ref_tag)
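ext_pi_type1_prepare() and ext_pi_type1_complete() above remap reference tags stored as 48-bit big-endian values inside the CRC64 PI tuple. A sketch of byte-level accessors equivalent in spirit to the kernel's get_unaligned_be48()/put_unaligned_be48(); the round trip in main() is illustrative:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Read a 48-bit big-endian integer from an unaligned byte buffer. */
static uint64_t get_unaligned_be48(const uint8_t p[6])
{
	uint64_t v = 0;

	for (int i = 0; i < 6; i++)
		v = (v << 8) | p[i];
	return v;
}

/* Store the low 48 bits of v, most significant byte first. */
static void put_unaligned_be48(uint64_t v, uint8_t p[6])
{
	for (int i = 5; i >= 0; i--) {
		p[i] = v & 0xff;
		v >>= 8;
	}
}

int main(void)
{
	uint8_t ref_tag[6];
	uint64_t virt = 0x123456789abcULL;	/* arbitrary 48-bit value */

	/* The same swap ext_pi_type1_prepare() performs per tuple. */
	put_unaligned_be48(virt, ref_tag);
	assert(get_unaligned_be48(ref_tag) == virt);
	printf("ref tag: %012llx\n", (unsigned long long)virt);
	return 0;
}

Byte-wise access sidesteps both the odd 6-byte width, which no native integer type matches, and the alignment of the tuple inside the metadata buffer.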